Commit 9d2f49c6 authored by dangqingqing

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into acc_image_proc

.gitignore
@@ -8,3 +8,4 @@ build/
.cproject
.pydevproject
Makefile
.test_env/
@@ -50,7 +50,7 @@ before_install:
fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
-  - pip install wheel protobuf sphinx breathe recommonmark virtualenv numpy
+  - pip install wheel protobuf sphinx breathe recommonmark virtualenv numpy sphinx_rtd_theme
script:
- paddle/scripts/travis/main.sh
notifications:
......
cmake_minimum_required(VERSION 2.8)
project(paddle CXX C)
set(PADDLE_MAJOR_VERSION 0)
set(PADDLE_MINOR_VERSION 9)
set(PADDLE_PATCH_VERSION 0)
set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION})
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
set(PROJ_ROOT ${CMAKE_SOURCE_DIR})
@@ -12,6 +8,17 @@ include(package)
find_package(SWIG 2.0)
find_package(CUDA QUIET)
find_package(Protobuf REQUIRED)
# Check protobuf library version.
execute_process(COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --version
OUTPUT_VARIABLE PROTOBUF_VERSION)
string(REPLACE "libprotoc " "" PROTOBUF_VERSION ${PROTOBUF_VERSION})
set(PROTOBUF_3 OFF)
if (${PROTOBUF_VERSION} VERSION_GREATER "3.0.0" OR ${PROTOBUF_VERSION} VERSION_EQUAL "3.0.0")
set(PROTOBUF_3 ON)
endif()
find_package(PythonLibs 2.7 REQUIRED)
find_package(PythonInterp 2.7 REQUIRED)
find_package(ZLIB REQUIRED)
@@ -45,7 +52,7 @@ option(ON_COVERALLS "Generating code coverage data on coveralls or not." OFF)
option(COVERALLS_UPLOAD "Uploading the generated coveralls json." ON)
if(NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
"Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
FORCE)
endif()
@@ -64,31 +71,11 @@ include(check_packages)
include(swig)
include(coveralls)
-# add PaddlePaddle version
-if(DEFINED ENV{PADDLE_VERSION})
-    add_definitions(-DPADDLE_VERSION=\"$ENV{PADDLE_VERSION}\")
-else()
-    if(EXISTS ${PROJ_ROOT}/.svn/)
-        find_package(Subversion REQUIRED)
-        if(SUBVERSION_FOUND)
-            Subversion_WC_INFO(${PROJ_ROOT} Project)
-            add_definitions(-DPADDLE_VERSION=${Project_WC_REVISION})
-        endif()
-    elseif(EXISTS ${PROJ_ROOT}/.git/)
-        find_package(Git REQUIRED)
-        execute_process(
-            COMMAND ${GIT_EXECUTABLE} log -1 --format=%H
-            WORKING_DIRECTORY ${PROJ_ROOT}
-            OUTPUT_VARIABLE GIT_SHA1
-            RESULT_VARIABLE GIT_RESULT
-            ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-        if(NOT ${GIT_RESULT})
-            add_definitions(-DPADDLE_VERSION=\"${GIT_SHA1}\")
-        else()
-            message(WARNING "Cannot add paddle version from git tag")
-        endif()
-    endif()
-endif()
+# Set PaddlePaddle version to Git tag name or Git commit ID.
+find_package(Git REQUIRED)
+# version.cmake will get the current PADDLE_VERSION
+include(version)
+add_definitions(-DPADDLE_VERSION=\"${PADDLE_VERSION}\")
if(NOT WITH_GPU)
......
paddle/image/logs
paddle/image/*.pyc
paddle/image/train.list
paddle/rnn/logs
paddle/rnn/*.pyc
paddle/rnn/imdb.pkl
caffe/image/logs
tensorflow/image/logs
tensorflow/rnn/logs

# Benchmark

Machine:

- CPU: 12-core Intel(R) Xeon(R) CPU E5-2620 v2 @2.10GHz
- GPU: Tesla K40m
- cuDNN: v5.1
- System: Docker 1.12.1; all platforms are tested inside Docker containers.

Platforms:

- PaddlePaddle: paddledev/paddle:gpu-devel-v0.9.0a0
- TensorFlow: gcr.io/tensorflow/tensorflow:0.11.0rc0-gpu
- Caffe: kaixhin/cuda-caffe

Several convolutional neural networks and recurrent neural networks are used in the tests.

## Image

### Benchmark Model

AlexNet, GoogleNet and a small network used in Caffe:

- [AlexNet](https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet): the same model, except that the group size is one.
- [GoogleNet](https://github.com/BVLC/caffe/tree/master/models/bvlc_googlenet): the same model, except that loss1 and loss2 are removed when benchmarking.
- [SmallNet](https://github.com/BVLC/caffe/blob/master/examples/cifar10/cifar10_quick_train_test.prototxt)

### Single-GPU

- AlexNet: input - 3 * 227 * 227, Time: ms/batch

| BatchSize    | 64  | 128 | 256  | 512  |
|--------------|-----|-----|------|------|
| PaddlePaddle | 195 | 334 | 602  | 1629 |
| TensorFlow   | 223 | 364 | 645  | 1235 |
| Caffe        | 324 | 627 | 1232 | 2513 |

**Notation**

All platforms use cuDNN v5.1. Caffe is slower in this experiment because the workspace limit of its cuDNN convolution interface is 8 * 1024 * 1024 bytes, which is smaller than the limits used by PaddlePaddle and TensorFlow. Note that Caffe would be faster if the workspace limit were increased.

- GoogleNet: input - 3 * 224 * 224, Time: ms/batch

| BatchSize    | 64  | 128  | 256           |
|--------------|-----|------|---------------|
| PaddlePaddle | 613 | 1149 | 2348          |
| TensorFlow   | 644 | 1176 | 2219          |
| Caffe        | 694 | 1364 | out of memory |

- SmallNet: input - 3 * 32 * 32, Time: ms/batch

| BatchSize    | 64     | 128     | 256     | 512    |
|--------------|--------|---------|---------|--------|
| PaddlePaddle | 10.463 | 18.184  | 33.113  | 63.039 |
| TensorFlow   | 9      | 15      | 28      | 59     |
| Caffe        | 9.373  | 16.6606 | 31.4797 | 59.719 |

**Notation**

All the single-GPU Caffe experiments use `caffe time` to measure elapsed time, which excludes the parameter-update time, whereas the PaddlePaddle and TensorFlow measurements include it. Since this part is small relative to the total time on a single machine, we ignore it.

TensorFlow implements its own convolution-algorithm search instead of using the algorithm-search interface in cuDNN.

### Multi-GPU: 4 GPUs

- AlexNet, ms / batch

| total-BatchSize | 128 * 4 | 256 * 4 |
|-----------------|---------|---------|
| PaddlePaddle    | 347     | 622     |
| TensorFlow      | 377     | 675     |
| Caffe           | 1229    | 2435    |

For example, if `total-BatchSize = 128 * 4`, the speedup ratio is calculated by

```
time_at_1gpu_batch_128 * 4 / time_at_4gpu_total_batch_512
= (334 * 4) / 347
= 3.85
```
<img src="figs/alexnet-4gpu.png" width="420">
- GoogleNet, ms / batch
| total-BatchSize | 128 * 4 | 256 * 4 |
|-------------------|--------------| ----------- |
| PaddlePaddle | 1178 | 2367 |
| TensorFlow | 1210 | 2292 |
| Caffe | 2007 | out of memory |
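
By the same formula, the PaddlePaddle speedup for GoogleNet at `total-BatchSize = 128 * 4` follows from the two GoogleNet tables above:

```
time_at_1gpu_batch_128 * 4 / time_at_4gpu_total_batch_512
= (1149 * 4) / 1178
= 3.90
```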
<img src="figs/googlenet-4gpu.png" width="420">
## RNN
We use lstm network for text classfication to test benchmark.
### Dataset
- [IMDB](http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl)
- Sequence length is 100. In fact, PaddlePaddle supports training with variable-length sequence, but TensorFlow needs to pad. Thus, we also pad sequence length to 100 in PaddlePaddle in order to compare.
- Dictionary size=30000
- Peephole connection is used in `lstmemory` by default in PaddlePaddle. It is also configured in TensorFlow.
### Single-GPU
#### LSTM in Text Classification
Testing `2 lstm layer + fc` network with different hidden size and batch size.
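
The sketch is illustrative only (it is not part of this commit); the layer helpers come from `paddle.trainer_config_helpers`, and the embedding size, class count, and other arguments are assumptions:

```python
from paddle.trainer_config_helpers import *

dict_size = 30000   # matches the dictionary size above
hidden_size = 256   # one of the benchmarked hidden sizes

data = data_layer(name="word", size=dict_size)        # word-id sequence
lab = data_layer(name="label", size=2)                # binary sentiment label
emb = embedding_layer(input=data, size=128)           # assumed embedding size
lstm1 = simple_lstm(input=emb, size=hidden_size)      # lstm layer 1
lstm2 = simple_lstm(input=lstm1, size=hidden_size)    # lstm layer 2
last = last_seq(input=lstm2)                          # take the last time step
prob = fc_layer(input=last, size=2, act=SoftmaxActivation())
outputs(classification_cost(input=prob, label=lab))
```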

- Batch size = 64, ms / batch

| hidden_size  | 256 | 512 | 1280 |
|--------------|-----|-----|------|
| PaddlePaddle | 83  | 184 | 641  |
| TensorFlow   | 175 | 280 | 818  |

- Batch size = 128, ms / batch

| hidden_size  | 256 | 512 | 1280 |
|--------------|-----|-----|------|
| PaddlePaddle | 110 | 261 | 1007 |
| TensorFlow   | 181 | 361 | 1237 |

- Batch size = 256, ms / batch

| hidden_size  | 256 | 512 | 1280 |
|--------------|-----|-----|------|
| PaddlePaddle | 170 | 414 | 1655 |
| TensorFlow   | 238 | 536 | 1905 |

<img src="figs/rnn_lstm_cls.png" width="600">

#### Seq2Seq

The benchmark of sequence-to-sequence networks will be added later.

### Multi-GPU: 4 GPUs

#### LSTM in Text Classification

- hidden_size = 256, ms / batch

| batch_size   | 256 | 512 |
|--------------|-----|-----|
| PaddlePaddle | 90  | 118 |
| TensorFlow   | 226 | 118 |

- hidden_size = 512, ms / batch

| batch_size   | 256 | 512 |
|--------------|-----|-----|
| PaddlePaddle | 189 | 268 |
| TensorFlow   | 297 | 383 |

<img src="figs/rnn_lstm_4gpus.png" width="420">

#### Seq2Seq

The benchmark of sequence-to-sequence networks will be added later.

name: "alexnet"
input: "data"
input_dim: 64
input_dim: 3
input_dim: 227
input_dim: 227
input: "label"
input_dim: 64
input_dim: 1
input_dim: 1
input_dim: 1
force_backward: true
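# force_backward ensures the backward pass is computed for every layer, so
# `caffe time` measures forward + backward work comparable to training on
# the other platforms.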
layer {
name: "conv1"
type: "Convolution"
bottom: "data"
top: "conv1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 96
kernel_size: 11
stride: 4
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layer {
name: "relu1"
type: "ReLU"
bottom: "conv1"
top: "conv1"
}
layer {
name: "norm1"
type: "LRN"
bottom: "conv1"
top: "norm1"
lrn_param {
local_size: 5
alpha: 0.0001
beta: 0.75
}
}
layer {
name: "pool1"
type: "Pooling"
bottom: "norm1"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "conv2"
type: "Convolution"
bottom: "pool1"
top: "conv2"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 256
pad: 2
kernel_size: 5
group: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0.1
}
}
}
layer {
name: "relu2"
type: "ReLU"
bottom: "conv2"
top: "conv2"
}
layer {
name: "norm2"
type: "LRN"
bottom: "conv2"
top: "norm2"
lrn_param {
local_size: 5
alpha: 0.0001
beta: 0.75
}
}
layer {
name: "pool2"
type: "Pooling"
bottom: "norm2"
top: "pool2"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "conv3"
type: "Convolution"
bottom: "pool2"
top: "conv3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 384
pad: 1
kernel_size: 3
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layer {
name: "relu3"
type: "ReLU"
bottom: "conv3"
top: "conv3"
}
layer {
name: "conv4"
type: "Convolution"
bottom: "conv3"
top: "conv4"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 384
pad: 1
kernel_size: 3
group: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0.1
}
}
}
layer {
name: "relu4"
type: "ReLU"
bottom: "conv4"
top: "conv4"
}
layer {
name: "conv5"
type: "Convolution"
bottom: "conv4"
top: "conv5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 256
pad: 1
kernel_size: 3
group: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0.1
}
}
}
layer {
name: "relu5"
type: "ReLU"
bottom: "conv5"
top: "conv5"
}
layer {
name: "pool5"
type: "Pooling"
bottom: "conv5"
top: "pool5"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "fc6"
type: "InnerProduct"
bottom: "pool5"
top: "fc6"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
inner_product_param {
num_output: 4096
weight_filler {
type: "gaussian"
std: 0.005
}
bias_filler {
type: "constant"
value: 0.1
}
}
}
layer {
name: "relu6"
type: "ReLU"
bottom: "fc6"
top: "fc6"
}
layer {
name: "drop6"
type: "Dropout"
bottom: "fc6"
top: "fc6"
dropout_param {
dropout_ratio: 0.5
}
}
layer {
name: "fc7"
type: "InnerProduct"
bottom: "fc6"
top: "fc7"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
inner_product_param {
num_output: 4096
weight_filler {
type: "gaussian"
std: 0.005
}
bias_filler {
type: "constant"
value: 0.1
}
}
}
layer {
name: "relu7"
type: "ReLU"
bottom: "fc7"
top: "fc7"
}
layer {
name: "drop7"
type: "Dropout"
bottom: "fc7"
top: "fc7"
dropout_param {
dropout_ratio: 0.5
}
}
layer {
name: "fc8"
type: "InnerProduct"
bottom: "fc7"
top: "fc8"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
inner_product_param {
num_output: 1000
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layer {
name: "loss"
type: "SoftmaxWithLoss"
bottom: "fc8"
bottom: "label"
top: "loss"
}
name: "googlenet"
input: "data"
input_dim: 128
input_dim: 3
input_dim: 224
input_dim: 224
input: "label"
input_dim: 128
input_dim: 1
input_dim: 1
input_dim: 1
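# The LRN layers and the loss1/loss2 auxiliary classifiers below are
# commented out so that every platform benchmarks the same GoogleNet model,
# matching the setup described in the README.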
layer {
name: "conv1/7x7_s2"
type: "Convolution"
bottom: "data"
top: "conv1/7x7_s2"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
pad: 3
kernel_size: 7
stride: 2
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "conv1/relu_7x7"
type: "ReLU"
bottom: "conv1/7x7_s2"
top: "conv1/7x7_s2"
}
layer {
name: "pool1/3x3_s2"
type: "Pooling"
bottom: "conv1/7x7_s2"
top: "pool1/3x3_s2"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
#layer {
# name: "pool1/norm1"
# type: "LRN"
# bottom: "pool1/3x3_s2"
# top: "pool1/norm1"
# lrn_param {
# local_size: 5
# alpha: 0.0001
# beta: 0.75
# }
#}
layer {
name: "conv2/3x3_reduce"
type: "Convolution"
# bottom: "pool1/norm1"
bottom: "pool1/3x3_s2"
top: "conv2/3x3_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "conv2/relu_3x3_reduce"
type: "ReLU"
bottom: "conv2/3x3_reduce"
top: "conv2/3x3_reduce"
}
layer {
name: "conv2/3x3"
type: "Convolution"
bottom: "conv2/3x3_reduce"
top: "conv2/3x3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 192
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "conv2/relu_3x3"
type: "ReLU"
bottom: "conv2/3x3"
top: "conv2/3x3"
}
#layer {
# name: "conv2/norm2"
# type: "LRN"
# bottom: "conv2/3x3"
# top: "conv2/norm2"
# lrn_param {
# local_size: 5
# alpha: 0.0001
# beta: 0.75
# }
#}
layer {
name: "pool2/3x3_s2"
type: "Pooling"
# bottom: "conv2/norm2"
bottom: "conv2/3x3"
top: "pool2/3x3_s2"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "inception_3a/1x1"
type: "Convolution"
bottom: "pool2/3x3_s2"
top: "inception_3a/1x1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3a/relu_1x1"
type: "ReLU"
bottom: "inception_3a/1x1"
top: "inception_3a/1x1"
}
layer {
name: "inception_3a/3x3_reduce"
type: "Convolution"
bottom: "pool2/3x3_s2"
top: "inception_3a/3x3_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 96
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3a/relu_3x3_reduce"
type: "ReLU"
bottom: "inception_3a/3x3_reduce"
top: "inception_3a/3x3_reduce"
}
layer {
name: "inception_3a/3x3"
type: "Convolution"
bottom: "inception_3a/3x3_reduce"
top: "inception_3a/3x3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3a/relu_3x3"
type: "ReLU"
bottom: "inception_3a/3x3"
top: "inception_3a/3x3"
}
layer {
name: "inception_3a/5x5_reduce"
type: "Convolution"
bottom: "pool2/3x3_s2"
top: "inception_3a/5x5_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 16
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3a/relu_5x5_reduce"
type: "ReLU"
bottom: "inception_3a/5x5_reduce"
top: "inception_3a/5x5_reduce"
}
layer {
name: "inception_3a/5x5"
type: "Convolution"
bottom: "inception_3a/5x5_reduce"
top: "inception_3a/5x5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 32
pad: 2
kernel_size: 5
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3a/relu_5x5"
type: "ReLU"
bottom: "inception_3a/5x5"
top: "inception_3a/5x5"
}
layer {
name: "inception_3a/pool"
type: "Pooling"
bottom: "pool2/3x3_s2"
top: "inception_3a/pool"
pooling_param {
pool: MAX
kernel_size: 3
stride: 1
pad: 1
}
}
layer {
name: "inception_3a/pool_proj"
type: "Convolution"
bottom: "inception_3a/pool"
top: "inception_3a/pool_proj"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 32
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3a/relu_pool_proj"
type: "ReLU"
bottom: "inception_3a/pool_proj"
top: "inception_3a/pool_proj"
}
layer {
name: "inception_3a/output"
type: "Concat"
bottom: "inception_3a/1x1"
bottom: "inception_3a/3x3"
bottom: "inception_3a/5x5"
bottom: "inception_3a/pool_proj"
top: "inception_3a/output"
}
layer {
name: "inception_3b/1x1"
type: "Convolution"
bottom: "inception_3a/output"
top: "inception_3b/1x1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3b/relu_1x1"
type: "ReLU"
bottom: "inception_3b/1x1"
top: "inception_3b/1x1"
}
layer {
name: "inception_3b/3x3_reduce"
type: "Convolution"
bottom: "inception_3a/output"
top: "inception_3b/3x3_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3b/relu_3x3_reduce"
type: "ReLU"
bottom: "inception_3b/3x3_reduce"
top: "inception_3b/3x3_reduce"
}
layer {
name: "inception_3b/3x3"
type: "Convolution"
bottom: "inception_3b/3x3_reduce"
top: "inception_3b/3x3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 192
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3b/relu_3x3"
type: "ReLU"
bottom: "inception_3b/3x3"
top: "inception_3b/3x3"
}
layer {
name: "inception_3b/5x5_reduce"
type: "Convolution"
bottom: "inception_3a/output"
top: "inception_3b/5x5_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 32
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3b/relu_5x5_reduce"
type: "ReLU"
bottom: "inception_3b/5x5_reduce"
top: "inception_3b/5x5_reduce"
}
layer {
name: "inception_3b/5x5"
type: "Convolution"
bottom: "inception_3b/5x5_reduce"
top: "inception_3b/5x5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 96
pad: 2
kernel_size: 5
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3b/relu_5x5"
type: "ReLU"
bottom: "inception_3b/5x5"
top: "inception_3b/5x5"
}
layer {
name: "inception_3b/pool"
type: "Pooling"
bottom: "inception_3a/output"
top: "inception_3b/pool"
pooling_param {
pool: MAX
kernel_size: 3
stride: 1
pad: 1
}
}
layer {
name: "inception_3b/pool_proj"
type: "Convolution"
bottom: "inception_3b/pool"
top: "inception_3b/pool_proj"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3b/relu_pool_proj"
type: "ReLU"
bottom: "inception_3b/pool_proj"
top: "inception_3b/pool_proj"
}
layer {
name: "inception_3b/output"
type: "Concat"
bottom: "inception_3b/1x1"
bottom: "inception_3b/3x3"
bottom: "inception_3b/5x5"
bottom: "inception_3b/pool_proj"
top: "inception_3b/output"
}
layer {
name: "pool3/3x3_s2"
type: "Pooling"
bottom: "inception_3b/output"
top: "pool3/3x3_s2"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "inception_4a/1x1"
type: "Convolution"
bottom: "pool3/3x3_s2"
top: "inception_4a/1x1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 192
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4a/relu_1x1"
type: "ReLU"
bottom: "inception_4a/1x1"
top: "inception_4a/1x1"
}
layer {
name: "inception_4a/3x3_reduce"
type: "Convolution"
bottom: "pool3/3x3_s2"
top: "inception_4a/3x3_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 96
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4a/relu_3x3_reduce"
type: "ReLU"
bottom: "inception_4a/3x3_reduce"
top: "inception_4a/3x3_reduce"
}
layer {
name: "inception_4a/3x3"
type: "Convolution"
bottom: "inception_4a/3x3_reduce"
top: "inception_4a/3x3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 208
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4a/relu_3x3"
type: "ReLU"
bottom: "inception_4a/3x3"
top: "inception_4a/3x3"
}
layer {
name: "inception_4a/5x5_reduce"
type: "Convolution"
bottom: "pool3/3x3_s2"
top: "inception_4a/5x5_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 16
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4a/relu_5x5_reduce"
type: "ReLU"
bottom: "inception_4a/5x5_reduce"
top: "inception_4a/5x5_reduce"
}
layer {
name: "inception_4a/5x5"
type: "Convolution"
bottom: "inception_4a/5x5_reduce"
top: "inception_4a/5x5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 48
pad: 2
kernel_size: 5
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4a/relu_5x5"
type: "ReLU"
bottom: "inception_4a/5x5"
top: "inception_4a/5x5"
}
layer {
name: "inception_4a/pool"
type: "Pooling"
bottom: "pool3/3x3_s2"
top: "inception_4a/pool"
pooling_param {
pool: MAX
kernel_size: 3
stride: 1
pad: 1
}
}
layer {
name: "inception_4a/pool_proj"
type: "Convolution"
bottom: "inception_4a/pool"
top: "inception_4a/pool_proj"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4a/relu_pool_proj"
type: "ReLU"
bottom: "inception_4a/pool_proj"
top: "inception_4a/pool_proj"
}
layer {
name: "inception_4a/output"
type: "Concat"
bottom: "inception_4a/1x1"
bottom: "inception_4a/3x3"
bottom: "inception_4a/5x5"
bottom: "inception_4a/pool_proj"
top: "inception_4a/output"
}
#layer {
# name: "loss1/ave_pool"
# type: "Pooling"
# bottom: "inception_4a/output"
# top: "loss1/ave_pool"
# pooling_param {
# pool: AVE
# kernel_size: 5
# stride: 3
# }
#}
#layer {
# name: "loss1/conv"
# type: "Convolution"
# bottom: "loss1/ave_pool"
# top: "loss1/conv"
# param {
# lr_mult: 1
# decay_mult: 1
# }
# param {
# lr_mult: 2
# decay_mult: 0
# }
# convolution_param {
# num_output: 128
# kernel_size: 1
# weight_filler {
# type: "xavier"
# }
# bias_filler {
# type: "constant"
# value: 0.2
# }
# }
#}
#layer {
# name: "loss1/relu_conv"
# type: "ReLU"
# bottom: "loss1/conv"
# top: "loss1/conv"
#}
#layer {
# name: "loss1/fc"
# type: "InnerProduct"
# bottom: "loss1/conv"
# top: "loss1/fc"
# param {
# lr_mult: 1
# decay_mult: 1
# }
# param {
# lr_mult: 2
# decay_mult: 0
# }
# inner_product_param {
# num_output: 1024
# weight_filler {
# type: "xavier"
# }
# bias_filler {
# type: "constant"
# value: 0.2
# }
# }
#}
#layer {
# name: "loss1/relu_fc"
# type: "ReLU"
# bottom: "loss1/fc"
# top: "loss1/fc"
#}
#layer {
# name: "loss1/drop_fc"
# type: "Dropout"
# bottom: "loss1/fc"
# top: "loss1/fc"
# dropout_param {
# dropout_ratio: 0.7
# }
#}
#layer {
# name: "loss1/classifier"
# type: "InnerProduct"
# bottom: "loss1/fc"
# top: "loss1/classifier"
# param {
# lr_mult: 1
# decay_mult: 1
# }
# param {
# lr_mult: 2
# decay_mult: 0
# }
# inner_product_param {
# num_output: 1000
# weight_filler {
# type: "xavier"
# }
# bias_filler {
# type: "constant"
# value: 0
# }
# }
#}
#layer {
# name: "loss1/loss"
# type: "SoftmaxWithLoss"
# bottom: "loss1/classifier"
# bottom: "label"
# top: "loss1/loss1"
# loss_weight: 0.3
#}
layer {
name: "inception_4b/1x1"
type: "Convolution"
bottom: "inception_4a/output"
top: "inception_4b/1x1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 160
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4b/relu_1x1"
type: "ReLU"
bottom: "inception_4b/1x1"
top: "inception_4b/1x1"
}
layer {
name: "inception_4b/3x3_reduce"
type: "Convolution"
bottom: "inception_4a/output"
top: "inception_4b/3x3_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 112
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4b/relu_3x3_reduce"
type: "ReLU"
bottom: "inception_4b/3x3_reduce"
top: "inception_4b/3x3_reduce"
}
layer {
name: "inception_4b/3x3"
type: "Convolution"
bottom: "inception_4b/3x3_reduce"
top: "inception_4b/3x3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 224
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4b/relu_3x3"
type: "ReLU"
bottom: "inception_4b/3x3"
top: "inception_4b/3x3"
}
layer {
name: "inception_4b/5x5_reduce"
type: "Convolution"
bottom: "inception_4a/output"
top: "inception_4b/5x5_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 24
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4b/relu_5x5_reduce"
type: "ReLU"
bottom: "inception_4b/5x5_reduce"
top: "inception_4b/5x5_reduce"
}
layer {
name: "inception_4b/5x5"
type: "Convolution"
bottom: "inception_4b/5x5_reduce"
top: "inception_4b/5x5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
pad: 2
kernel_size: 5
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4b/relu_5x5"
type: "ReLU"
bottom: "inception_4b/5x5"
top: "inception_4b/5x5"
}
layer {
name: "inception_4b/pool"
type: "Pooling"
bottom: "inception_4a/output"
top: "inception_4b/pool"
pooling_param {
pool: MAX
kernel_size: 3
stride: 1
pad: 1
}
}
layer {
name: "inception_4b/pool_proj"
type: "Convolution"
bottom: "inception_4b/pool"
top: "inception_4b/pool_proj"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4b/relu_pool_proj"
type: "ReLU"
bottom: "inception_4b/pool_proj"
top: "inception_4b/pool_proj"
}
layer {
name: "inception_4b/output"
type: "Concat"
bottom: "inception_4b/1x1"
bottom: "inception_4b/3x3"
bottom: "inception_4b/5x5"
bottom: "inception_4b/pool_proj"
top: "inception_4b/output"
}
layer {
name: "inception_4c/1x1"
type: "Convolution"
bottom: "inception_4b/output"
top: "inception_4c/1x1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4c/relu_1x1"
type: "ReLU"
bottom: "inception_4c/1x1"
top: "inception_4c/1x1"
}
layer {
name: "inception_4c/3x3_reduce"
type: "Convolution"
bottom: "inception_4b/output"
top: "inception_4c/3x3_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4c/relu_3x3_reduce"
type: "ReLU"
bottom: "inception_4c/3x3_reduce"
top: "inception_4c/3x3_reduce"
}
layer {
name: "inception_4c/3x3"
type: "Convolution"
bottom: "inception_4c/3x3_reduce"
top: "inception_4c/3x3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 256
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4c/relu_3x3"
type: "ReLU"
bottom: "inception_4c/3x3"
top: "inception_4c/3x3"
}
layer {
name: "inception_4c/5x5_reduce"
type: "Convolution"
bottom: "inception_4b/output"
top: "inception_4c/5x5_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 24
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4c/relu_5x5_reduce"
type: "ReLU"
bottom: "inception_4c/5x5_reduce"
top: "inception_4c/5x5_reduce"
}
layer {
name: "inception_4c/5x5"
type: "Convolution"
bottom: "inception_4c/5x5_reduce"
top: "inception_4c/5x5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
pad: 2
kernel_size: 5
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4c/relu_5x5"
type: "ReLU"
bottom: "inception_4c/5x5"
top: "inception_4c/5x5"
}
layer {
name: "inception_4c/pool"
type: "Pooling"
bottom: "inception_4b/output"
top: "inception_4c/pool"
pooling_param {
pool: MAX
kernel_size: 3
stride: 1
pad: 1
}
}
layer {
name: "inception_4c/pool_proj"
type: "Convolution"
bottom: "inception_4c/pool"
top: "inception_4c/pool_proj"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4c/relu_pool_proj"
type: "ReLU"
bottom: "inception_4c/pool_proj"
top: "inception_4c/pool_proj"
}
layer {
name: "inception_4c/output"
type: "Concat"
bottom: "inception_4c/1x1"
bottom: "inception_4c/3x3"
bottom: "inception_4c/5x5"
bottom: "inception_4c/pool_proj"
top: "inception_4c/output"
}
layer {
name: "inception_4d/1x1"
type: "Convolution"
bottom: "inception_4c/output"
top: "inception_4d/1x1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 112
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4d/relu_1x1"
type: "ReLU"
bottom: "inception_4d/1x1"
top: "inception_4d/1x1"
}
layer {
name: "inception_4d/3x3_reduce"
type: "Convolution"
bottom: "inception_4c/output"
top: "inception_4d/3x3_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 144
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4d/relu_3x3_reduce"
type: "ReLU"
bottom: "inception_4d/3x3_reduce"
top: "inception_4d/3x3_reduce"
}
layer {
name: "inception_4d/3x3"
type: "Convolution"
bottom: "inception_4d/3x3_reduce"
top: "inception_4d/3x3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 288
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4d/relu_3x3"
type: "ReLU"
bottom: "inception_4d/3x3"
top: "inception_4d/3x3"
}
layer {
name: "inception_4d/5x5_reduce"
type: "Convolution"
bottom: "inception_4c/output"
top: "inception_4d/5x5_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 32
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4d/relu_5x5_reduce"
type: "ReLU"
bottom: "inception_4d/5x5_reduce"
top: "inception_4d/5x5_reduce"
}
layer {
name: "inception_4d/5x5"
type: "Convolution"
bottom: "inception_4d/5x5_reduce"
top: "inception_4d/5x5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
pad: 2
kernel_size: 5
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4d/relu_5x5"
type: "ReLU"
bottom: "inception_4d/5x5"
top: "inception_4d/5x5"
}
layer {
name: "inception_4d/pool"
type: "Pooling"
bottom: "inception_4c/output"
top: "inception_4d/pool"
pooling_param {
pool: MAX
kernel_size: 3
stride: 1
pad: 1
}
}
layer {
name: "inception_4d/pool_proj"
type: "Convolution"
bottom: "inception_4d/pool"
top: "inception_4d/pool_proj"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4d/relu_pool_proj"
type: "ReLU"
bottom: "inception_4d/pool_proj"
top: "inception_4d/pool_proj"
}
layer {
name: "inception_4d/output"
type: "Concat"
bottom: "inception_4d/1x1"
bottom: "inception_4d/3x3"
bottom: "inception_4d/5x5"
bottom: "inception_4d/pool_proj"
top: "inception_4d/output"
}
#layer {
# name: "loss2/ave_pool"
# type: "Pooling"
# bottom: "inception_4d/output"
# top: "loss2/ave_pool"
# pooling_param {
# pool: AVE
# kernel_size: 5
# stride: 3
# }
#}
#layer {
# name: "loss2/conv"
# type: "Convolution"
# bottom: "loss2/ave_pool"
# top: "loss2/conv"
# param {
# lr_mult: 1
# decay_mult: 1
# }
# param {
# lr_mult: 2
# decay_mult: 0
# }
# convolution_param {
# num_output: 128
# kernel_size: 1
# weight_filler {
# type: "xavier"
# }
# bias_filler {
# type: "constant"
# value: 0.2
# }
# }
#}
#layer {
# name: "loss2/relu_conv"
# type: "ReLU"
# bottom: "loss2/conv"
# top: "loss2/conv"
#}
#layer {
# name: "loss2/fc"
# type: "InnerProduct"
# bottom: "loss2/conv"
# top: "loss2/fc"
# param {
# lr_mult: 1
# decay_mult: 1
# }
# param {
# lr_mult: 2
# decay_mult: 0
# }
# inner_product_param {
# num_output: 1024
# weight_filler {
# type: "xavier"
# }
# bias_filler {
# type: "constant"
# value: 0.2
# }
# }
#}
#layer {
# name: "loss2/relu_fc"
# type: "ReLU"
# bottom: "loss2/fc"
# top: "loss2/fc"
#}
#layer {
# name: "loss2/drop_fc"
# type: "Dropout"
# bottom: "loss2/fc"
# top: "loss2/fc"
# dropout_param {
# dropout_ratio: 0.7
# }
#}
#layer {
# name: "loss2/classifier"
# type: "InnerProduct"
# bottom: "loss2/fc"
# top: "loss2/classifier"
# param {
# lr_mult: 1
# decay_mult: 1
# }
# param {
# lr_mult: 2
# decay_mult: 0
# }
# inner_product_param {
# num_output: 1000
# weight_filler {
# type: "xavier"
# }
# bias_filler {
# type: "constant"
# value: 0
# }
# }
#}
#layer {
# name: "loss2/loss"
# type: "SoftmaxWithLoss"
# bottom: "loss2/classifier"
# bottom: "label"
# top: "loss2/loss1"
# loss_weight: 0.3
#}
layer {
name: "inception_4e/1x1"
type: "Convolution"
bottom: "inception_4d/output"
top: "inception_4e/1x1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 256
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4e/relu_1x1"
type: "ReLU"
bottom: "inception_4e/1x1"
top: "inception_4e/1x1"
}
layer {
name: "inception_4e/3x3_reduce"
type: "Convolution"
bottom: "inception_4d/output"
top: "inception_4e/3x3_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 160
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4e/relu_3x3_reduce"
type: "ReLU"
bottom: "inception_4e/3x3_reduce"
top: "inception_4e/3x3_reduce"
}
layer {
name: "inception_4e/3x3"
type: "Convolution"
bottom: "inception_4e/3x3_reduce"
top: "inception_4e/3x3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 320
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4e/relu_3x3"
type: "ReLU"
bottom: "inception_4e/3x3"
top: "inception_4e/3x3"
}
layer {
name: "inception_4e/5x5_reduce"
type: "Convolution"
bottom: "inception_4d/output"
top: "inception_4e/5x5_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 32
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4e/relu_5x5_reduce"
type: "ReLU"
bottom: "inception_4e/5x5_reduce"
top: "inception_4e/5x5_reduce"
}
layer {
name: "inception_4e/5x5"
type: "Convolution"
bottom: "inception_4e/5x5_reduce"
top: "inception_4e/5x5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
pad: 2
kernel_size: 5
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4e/relu_5x5"
type: "ReLU"
bottom: "inception_4e/5x5"
top: "inception_4e/5x5"
}
layer {
name: "inception_4e/pool"
type: "Pooling"
bottom: "inception_4d/output"
top: "inception_4e/pool"
pooling_param {
pool: MAX
kernel_size: 3
stride: 1
pad: 1
}
}
layer {
name: "inception_4e/pool_proj"
type: "Convolution"
bottom: "inception_4e/pool"
top: "inception_4e/pool_proj"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4e/relu_pool_proj"
type: "ReLU"
bottom: "inception_4e/pool_proj"
top: "inception_4e/pool_proj"
}
layer {
name: "inception_4e/output"
type: "Concat"
bottom: "inception_4e/1x1"
bottom: "inception_4e/3x3"
bottom: "inception_4e/5x5"
bottom: "inception_4e/pool_proj"
top: "inception_4e/output"
}
layer {
name: "pool4/3x3_s2"
type: "Pooling"
bottom: "inception_4e/output"
top: "pool4/3x3_s2"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "inception_5a/1x1"
type: "Convolution"
bottom: "pool4/3x3_s2"
top: "inception_5a/1x1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 256
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5a/relu_1x1"
type: "ReLU"
bottom: "inception_5a/1x1"
top: "inception_5a/1x1"
}
layer {
name: "inception_5a/3x3_reduce"
type: "Convolution"
bottom: "pool4/3x3_s2"
top: "inception_5a/3x3_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 160
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5a/relu_3x3_reduce"
type: "ReLU"
bottom: "inception_5a/3x3_reduce"
top: "inception_5a/3x3_reduce"
}
layer {
name: "inception_5a/3x3"
type: "Convolution"
bottom: "inception_5a/3x3_reduce"
top: "inception_5a/3x3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 320
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5a/relu_3x3"
type: "ReLU"
bottom: "inception_5a/3x3"
top: "inception_5a/3x3"
}
layer {
name: "inception_5a/5x5_reduce"
type: "Convolution"
bottom: "pool4/3x3_s2"
top: "inception_5a/5x5_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 32
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5a/relu_5x5_reduce"
type: "ReLU"
bottom: "inception_5a/5x5_reduce"
top: "inception_5a/5x5_reduce"
}
layer {
name: "inception_5a/5x5"
type: "Convolution"
bottom: "inception_5a/5x5_reduce"
top: "inception_5a/5x5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
pad: 2
kernel_size: 5
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5a/relu_5x5"
type: "ReLU"
bottom: "inception_5a/5x5"
top: "inception_5a/5x5"
}
layer {
name: "inception_5a/pool"
type: "Pooling"
bottom: "pool4/3x3_s2"
top: "inception_5a/pool"
pooling_param {
pool: MAX
kernel_size: 3
stride: 1
pad: 1
}
}
layer {
name: "inception_5a/pool_proj"
type: "Convolution"
bottom: "inception_5a/pool"
top: "inception_5a/pool_proj"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5a/relu_pool_proj"
type: "ReLU"
bottom: "inception_5a/pool_proj"
top: "inception_5a/pool_proj"
}
layer {
name: "inception_5a/output"
type: "Concat"
bottom: "inception_5a/1x1"
bottom: "inception_5a/3x3"
bottom: "inception_5a/5x5"
bottom: "inception_5a/pool_proj"
top: "inception_5a/output"
}
layer {
name: "inception_5b/1x1"
type: "Convolution"
bottom: "inception_5a/output"
top: "inception_5b/1x1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 384
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5b/relu_1x1"
type: "ReLU"
bottom: "inception_5b/1x1"
top: "inception_5b/1x1"
}
layer {
name: "inception_5b/3x3_reduce"
type: "Convolution"
bottom: "inception_5a/output"
top: "inception_5b/3x3_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 192
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5b/relu_3x3_reduce"
type: "ReLU"
bottom: "inception_5b/3x3_reduce"
top: "inception_5b/3x3_reduce"
}
layer {
name: "inception_5b/3x3"
type: "Convolution"
bottom: "inception_5b/3x3_reduce"
top: "inception_5b/3x3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 384
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5b/relu_3x3"
type: "ReLU"
bottom: "inception_5b/3x3"
top: "inception_5b/3x3"
}
layer {
name: "inception_5b/5x5_reduce"
type: "Convolution"
bottom: "inception_5a/output"
top: "inception_5b/5x5_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 48
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5b/relu_5x5_reduce"
type: "ReLU"
bottom: "inception_5b/5x5_reduce"
top: "inception_5b/5x5_reduce"
}
layer {
name: "inception_5b/5x5"
type: "Convolution"
bottom: "inception_5b/5x5_reduce"
top: "inception_5b/5x5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
pad: 2
kernel_size: 5
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5b/relu_5x5"
type: "ReLU"
bottom: "inception_5b/5x5"
top: "inception_5b/5x5"
}
layer {
name: "inception_5b/pool"
type: "Pooling"
bottom: "inception_5a/output"
top: "inception_5b/pool"
pooling_param {
pool: MAX
kernel_size: 3
stride: 1
pad: 1
}
}
layer {
name: "inception_5b/pool_proj"
type: "Convolution"
bottom: "inception_5b/pool"
top: "inception_5b/pool_proj"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5b/relu_pool_proj"
type: "ReLU"
bottom: "inception_5b/pool_proj"
top: "inception_5b/pool_proj"
}
layer {
name: "inception_5b/output"
type: "Concat"
bottom: "inception_5b/1x1"
bottom: "inception_5b/3x3"
bottom: "inception_5b/5x5"
bottom: "inception_5b/pool_proj"
top: "inception_5b/output"
}
layer {
name: "pool5/7x7_s1"
type: "Pooling"
bottom: "inception_5b/output"
top: "pool5/7x7_s1"
pooling_param {
pool: AVE
kernel_size: 7
stride: 1
}
}
layer {
name: "pool5/drop_7x7_s1"
type: "Dropout"
bottom: "pool5/7x7_s1"
top: "pool5/7x7_s1"
dropout_param {
dropout_ratio: 0.4
}
}
layer {
name: "loss3/classifier"
type: "InnerProduct"
bottom: "pool5/7x7_s1"
top: "loss3/classifier"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
inner_product_param {
num_output: 1000
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0
}
}
}
layer {
name: "loss3/loss3"
type: "SoftmaxWithLoss"
bottom: "loss3/classifier"
bottom: "label"
top: "loss3/loss3"
loss_weight: 1
}
#!/bin/bash
set -e
function test() {
cfg=$1
batch=$2
prefix=$3
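  # Rewrite the input_dim line that follows `input: "data"` / `input: "label"`
  # in-place, so one prototxt can be timed at several batch sizes.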
sed -i "/input: \"data\"/{n;s/^input_dim.*/input_dim: $batch/g}" $cfg
sed -i "/input: \"label\"/{n;s/^input_dim.*/input_dim: $batch/g}" $cfg
caffe time --model=$cfg --iterations=50 --gpu 0 > logs/$prefix-1gpu-batch${batch}.log 2>&1
}
if [ ! -d "logs" ]; then
mkdir logs
fi
# alexnet
test alexnet.prototxt 64 alexnet
test alexnet.prototxt 128 alexnet
test alexnet.prototxt 256 alexnet
test alexnet.prototxt 512 alexnet
# googlenet
test googlenet.prototxt 64 googlenet
test googlenet.prototxt 128 googlenet
# small net
test smallnet_mnist_cifar.prototxt 64 smallnet
test smallnet_mnist_cifar.prototxt 128 smallnet
test smallnet_mnist_cifar.prototxt 256 smallnet
test smallnet_mnist_cifar.prototxt 512 smallnet
#!/bin/bash
set -e
function test() {
cfg=$1
batch=$2
prefix=$3
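  # Caffe uses the prototxt batch size per GPU; with `-gpu 0,1,2,3` the
  # effective batch is 4x larger, so divide the total batch size by 4.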
batch_per_gpu=`expr ${batch} / 4`
sed -i "/input: \"data\"/{n;s/^input_dim.*/input_dim: ${batch_per_gpu}/g}" $cfg
sed -i "/input: \"label\"/{n;s/^input_dim.*/input_dim: ${batch_per_gpu}/g}" $cfg
sed -i "1c\net : \"${cfg}\"" solver.prototxt
caffe train --solver=solver.prototxt -gpu 0,1,2,3 > logs/${prefix}-4gpu-batch${batch}.log 2>&1
}
if [ ! -d "logs" ]; then
mkdir logs
fi
# alexnet
test alexnet.prototxt 512 alexnet
test alexnet.prototxt 1024 alexnet
# googlenet
test googlenet.prototxt 512 googlenet
name: "mnist/cifar"
input: "data"
input_dim: 128
input_dim: 3
input_dim: 32
input_dim: 32
input: "label"
input_dim: 128
input_dim: 1
input_dim: 1
input_dim: 1
layer {
name: "conv1"
type: "Convolution"
bottom: "data"
top: "conv1"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 32
pad: 2
kernel_size: 5
stride: 1
weight_filler {
type: "gaussian"
std: 0.0001
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "pool1"
type: "Pooling"
bottom: "conv1"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "relu1"
type: "ReLU"
bottom: "pool1"
top: "pool1"
}
layer {
name: "conv2"
type: "Convolution"
bottom: "pool1"
top: "conv2"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 32
pad: 2
kernel_size: 5
stride: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "relu2"
type: "ReLU"
bottom: "conv2"
top: "conv2"
}
layer {
name: "pool2"
type: "Pooling"
bottom: "conv2"
top: "pool2"
pooling_param {
pool: AVE
kernel_size: 3
stride: 2
}
}
layer {
name: "conv3"
type: "Convolution"
bottom: "pool2"
top: "conv3"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 64
pad: 2
kernel_size: 5
stride: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "relu3"
type: "ReLU"
bottom: "conv3"
top: "conv3"
}
layer {
name: "pool3"
type: "Pooling"
bottom: "conv3"
top: "pool3"
pooling_param {
pool: AVE
kernel_size: 3
stride: 2
}
}
layer {
name: "ip1"
type: "InnerProduct"
bottom: "pool3"
top: "ip1"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
inner_product_param {
num_output: 64
weight_filler {
type: "gaussian"
std: 0.1
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "ip2"
type: "InnerProduct"
bottom: "ip1"
top: "ip2"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
inner_product_param {
num_output: 10
weight_filler {
type: "gaussian"
std: 0.1
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "accuracy"
type: "Accuracy"
bottom: "ip2"
bottom: "label"
top: "accuracy"
include {
phase: TEST
}
}
layer {
name: "loss"
type: "SoftmaxWithLoss"
bottom: "ip2"
bottom: "label"
top: "loss"
}
net: "alexnet.prototxt"
base_lr: 0.01
lr_policy: "fixed"
display: 20
max_iter: 200
momentum: 0.9
weight_decay: 0.0005
snapshot: 10000
snapshot_prefix: "models/caffe_alexnet_train"
solver_mode: GPU
#!/usr/bin/env python
from paddle.trainer_config_helpers import *
height = 227
width = 227
num_class = 1000
batch_size = get_config_arg('batch_size', int, 128)
args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
define_py_data_sources2(
"train.list", None, module="provider", obj="process", args=args)
settings(
batch_size=batch_size,
learning_rate=0.01 / batch_size,
learning_method=MomentumOptimizer(0.9),
regularization=L2Regularization(0.0005 * batch_size))
# conv1
net = data_layer('data', size=height * width * 3)
net = img_conv_layer(
input=net,
filter_size=11,
num_channels=3,
num_filters=96,
stride=4,
padding=1)
net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
net = img_pool_layer(input=net, pool_size=3, stride=2)
# conv2
net = img_conv_layer(
input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=1)
net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
net = img_pool_layer(input=net, pool_size=3, stride=2)
# conv3
net = img_conv_layer(
input=net, filter_size=3, num_filters=384, stride=1, padding=1)
# conv4
net = img_conv_layer(
input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=1)
# conv5
net = img_conv_layer(
input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=1)
net = img_pool_layer(input=net, pool_size=3, stride=2)
net = fc_layer(
input=net,
size=4096,
act=ReluActivation(),
layer_attr=ExtraAttr(drop_rate=0.5))
net = fc_layer(
input=net,
size=4096,
act=ReluActivation(),
layer_attr=ExtraAttr(drop_rate=0.5))
net = fc_layer(input=net, size=1000, act=SoftmaxActivation())
lab = data_layer('label', num_class)
loss = cross_entropy(input=net, label=lab)
outputs(loss)
#!/usr/bin/env python
from paddle.trainer_config_helpers import *
height = 224
width = 224
num_class = 1000
batch_size = get_config_arg('batch_size', int, 128)
args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
define_py_data_sources2(
"train.list", None, module="provider", obj="process", args=args)
settings(
batch_size=batch_size,
learning_rate=0.01 / batch_size,
learning_method=MomentumOptimizer(0.9),
regularization=L2Regularization(0.0005 * batch_size))
def inception2(name, input, channels, \
filter1,
filter3R, filter3,
filter5R, filter5,
proj):
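    # Variant of inception() below, built from plain img_conv_layer calls
    # (ReLU on each branch) instead of conv_projection; not used in the
    # network defined in this config.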
conv1 = name + '_1'
conv3r = name + '_3r'
conv3 = name + '_3'
conv5r = name + '_5r'
conv5 = name + '_5'
maxpool = name + '_max'
convproj = name + '_proj'
cov1 = img_conv_layer(
name=conv1,
input=input,
filter_size=1,
num_channels=channels,
num_filters=filter1,
stride=1,
padding=0)
cov3r = img_conv_layer(
name=conv3r,
input=input,
filter_size=1,
num_channels=channels,
num_filters=filter3R,
stride=1,
padding=0)
cov3 = img_conv_layer(
name=conv3,
input=cov3r,
filter_size=3,
num_filters=filter3,
stride=1,
padding=1)
cov5r = img_conv_layer(
name=conv5r,
input=input,
filter_size=1,
num_channels=channels,
num_filters=filter5R,
stride=1,
padding=0)
cov5 = img_conv_layer(
name=conv5,
input=cov5r,
filter_size=5,
num_filters=filter5,
stride=1,
padding=2)
pool1 = img_pool_layer(
name=maxpool,
input=input,
pool_size=3,
num_channels=channels,
stride=1,
padding=1)
covprj = img_conv_layer(
name=convproj,
input=pool1,
filter_size=1,
num_filters=proj,
stride=1,
padding=0)
cat = concat_layer(name=name, input=[cov1, cov3, cov5, covprj])
return cat
def inception(name, input, channels, \
filter1,
filter3R, filter3,
filter5R, filter5,
proj):
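    # One GoogLeNet Inception block: a 1x1 branch, 3x3 and 5x5 branches each
    # behind a 1x1 reduction, and a max-pool branch with a 1x1 projection,
    # concatenated channel-wise with a shared ReLU.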
cov1 = conv_projection(
input=input,
filter_size=1,
num_channels=channels,
num_filters=filter1,
stride=1,
padding=0)
cov3r = img_conv_layer(
name=name + '_3r',
input=input,
filter_size=1,
num_channels=channels,
num_filters=filter3R,
stride=1,
padding=0)
cov3 = conv_projection(
input=cov3r, filter_size=3, num_filters=filter3, stride=1, padding=1)
cov5r = img_conv_layer(
name=name + '_5r',
input=input,
filter_size=1,
num_channels=channels,
num_filters=filter5R,
stride=1,
padding=0)
cov5 = conv_projection(
input=cov5r, filter_size=5, num_filters=filter5, stride=1, padding=2)
pool1 = img_pool_layer(
name=name + '_max',
input=input,
pool_size=3,
num_channels=channels,
stride=1,
padding=1)
covprj = conv_projection(
input=pool1, filter_size=1, num_filters=proj, stride=1, padding=0)
cat = concat_layer(
name=name,
input=[cov1, cov3, cov5, covprj],
bias_attr=True,
act=ReluActivation())
return cat
lab = data_layer(name="label", size=1000)
data = data_layer(name="input", size=3 * height * width)
# stage 1
conv1 = img_conv_layer(
name="conv1",
input=data,
filter_size=7,
num_channels=3,
num_filters=64,
stride=2,
padding=3)
pool1 = img_pool_layer(
name="pool1", input=conv1, pool_size=3, num_channels=64, stride=2)
# stage 2
conv2_1 = img_conv_layer(
name="conv2_1",
input=pool1,
filter_size=1,
num_filters=64,
stride=1,
padding=0)
conv2_2 = img_conv_layer(
name="conv2_2",
input=conv2_1,
filter_size=3,
num_filters=192,
stride=1,
padding=1)
pool2 = img_pool_layer(
name="pool2", input=conv2_2, pool_size=3, num_channels=192, stride=2)
# stage 3
ince3a = inception("ince3a", pool2, 192, 64, 96, 128, 16, 32, 32)
ince3b = inception("ince3b", ince3a, 256, 128, 128, 192, 32, 96, 64)
pool3 = img_pool_layer(
name="pool3", input=ince3b, num_channels=480, pool_size=3, stride=2)
# stage 4
ince4a = inception("ince4a", pool3, 480, 192, 96, 208, 16, 48, 64)
ince4b = inception("ince4b", ince4a, 512, 160, 112, 224, 24, 64, 64)
ince4c = inception("ince4c", ince4b, 512, 128, 128, 256, 24, 64, 64)
ince4d = inception("ince4d", ince4c, 512, 112, 144, 288, 32, 64, 64)
ince4e = inception("ince4e", ince4d, 528, 256, 160, 320, 32, 128, 128)
pool4 = img_pool_layer(
name="pool4", input=ince4e, num_channels=832, pool_size=3, stride=2)
# stage 5
ince5a = inception("ince5a", pool4, 832, 256, 160, 320, 32, 128, 128)
ince5b = inception("ince5b", ince5a, 832, 384, 192, 384, 48, 128, 128)
pool5 = img_pool_layer(
name="pool5",
input=ince5b,
num_channels=1024,
pool_size=7,
stride=7,
pool_type=AvgPooling())
# We remove loss1 and loss2 for all systems when benchmarking.
# output 1
# pool_o1 = img_pool_layer(name="pool_o1", input=ince4a, num_channels=512, pool_size=5, stride=3, pool_type=AvgPooling())
# conv_o1 = img_conv_layer(name="conv_o1", input=pool_o1, filter_size=1, num_filters=128, stride=1, padding=0)
# fc_o1 = fc_layer(name="fc_o1", input=conv_o1, size=1024, layer_attr=ExtraAttr(drop_rate=0.7), act=ReluActivation())
# out1 = fc_layer(name="output1", input=fc_o1, size=1000, act=SoftmaxActivation())
# loss1 = cross_entropy(name='loss1', input=out1, label=lab, coeff=0.3)
# output 2
#pool_o2 = img_pool_layer(name="pool_o2", input=ince4d, num_channels=528, pool_size=5, stride=3, pool_type=AvgPooling())
#conv_o2 = img_conv_layer(name="conv_o2", input=pool_o2, filter_size=1, num_filters=128, stride=1, padding=0)
#fc_o2 = fc_layer(name="fc_o2", input=conv_o2, size=1024, layer_attr=ExtraAttr(drop_rate=0.7), act=ReluActivation())
#out2 = fc_layer(name="output2", input=fc_o2, size=1000, act=SoftmaxActivation())
#loss2 = cross_entropy(name='loss2', input=out2, label=lab, coeff=0.3)
# output 3
dropout = dropout_layer(name="dropout", input=pool5, dropout_rate=0.4)
out3 = fc_layer(
name="output3", input=dropout, size=1000, act=SoftmaxActivation())
loss3 = cross_entropy(name='loss3', input=out3, label=lab)
outputs(loss3)
import io, os
import random
import numpy as np
from paddle.trainer.PyDataProvider2 import *
def initHook(settings, height, width, color, num_class, **kwargs):
settings.height = height
settings.width = width
settings.color = color
settings.num_class = num_class
if settings.color:
settings.data_size = settings.height * settings.width * 3
else:
settings.data_size = settings.height * settings.width
    settings.slots = [dense_vector(settings.data_size), integer_value(settings.num_class)]
@provider(
init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, file_list):
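    # Timing-only provider: each pass yields 1024 random images with random
    # labels, so the measured speed is not bounded by data I/O.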
for i in xrange(1024):
img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
        lab = random.randint(0, settings.num_class - 1)
yield img.astype('float32'), int(lab)
set -e
function train() {
cfg=$1
thread=$2
bz=$3
args="batch_size=$3"
prefix=$4
paddle train --job=time \
--config=$cfg \
--use_gpu=True \
--trainer_count=$thread \
--log_period=10 \
--test_period=100 \
--config_args=$args \
> logs/$prefix-${thread}gpu-$bz.log 2>&1
}
if [ ! -d "train.list" ]; then
echo " " > train.list
fi
if [ ! -d "logs" ]; then
mkdir logs
fi
#========single-gpu=========#
# alexnet
train alexnet.py 1 64 alexnet
train alexnet.py 1 128 alexnet
train alexnet.py 1 256 alexnet
train alexnet.py 1 512 alexnet
# googlenet
train googlenet.py 1 64 googlenet
train googlenet.py 1 128 googlenet
train googlenet.py 1 256 googlenet
# smallnet
train smallnet_mnist_cifar.py 1 64 smallnet
train smallnet_mnist_cifar.py 1 128 smallnet
train smallnet_mnist_cifar.py 1 256 smallnet
train smallnet_mnist_cifar.py 1 512 smallnet
############################
#========multi-gpus=========#
train alexnet.py 4 512 alexnet
train alexnet.py 4 1024 alexnet
train googlenet.py 4 512 googlenet
train googlenet.py 4 1024 googlenet
#!/usr/bin/env python
from paddle.trainer_config_helpers import *
height = 32
width = 32
num_class = 10
batch_size = get_config_arg('batch_size', int, 128)
args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
define_py_data_sources2(
"train.list", None, module="provider", obj="process", args=args)
settings(
batch_size=batch_size,
learning_rate=0.01 / batch_size,
learning_method=MomentumOptimizer(0.9),
regularization=L2Regularization(0.0005 * batch_size))
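# PaddlePaddle sums gradients over the batch, so the learning rate is divided
# by batch_size (and the L2 coefficient multiplied by it) to keep the
# effective per-sample update independent of the batch size.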
# conv1
net = data_layer('data', size=height * width * 3)
net = img_conv_layer(
input=net,
filter_size=5,
num_channels=3,
num_filters=32,
stride=1,
padding=2)
net = img_pool_layer(input=net, pool_size=3, stride=2, padding=1)
# conv2
net = img_conv_layer(
input=net, filter_size=5, num_filters=32, stride=1, padding=2)
net = img_pool_layer(
input=net, pool_size=3, stride=2, padding=1, pool_type=AvgPooling())
# conv3
net = img_conv_layer(
input=net, filter_size=3, num_filters=64, stride=1, padding=1)
net = img_pool_layer(
input=net, pool_size=3, stride=2, padding=1, pool_type=AvgPooling())
net = fc_layer(input=net, size=64, act=ReluActivation())
net = fc_layer(input=net, size=10, act=SoftmaxActivation())
lab = data_layer('label', num_class)
loss = classification_cost(input=net, label=lab)
outputs(loss)
from __future__ import print_function
import six.moves.cPickle as pickle
import gzip
import os
import numpy
def get_dataset_file(dataset, default_dataset, origin):
data_dir, data_file = os.path.split(dataset)
if (not os.path.isfile(dataset)) and data_file == default_dataset:
from six.moves import urllib
print('Downloading data from %s' % origin)
urllib.request.urlretrieve(origin, dataset)
return dataset
def create_data(path="imdb.pkl"):
if (not os.path.isfile('imdb.train.pkl')):
path = get_dataset_file(
path, "imdb.pkl",
"http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl")
if path.endswith(".gz"):
f = gzip.open(path, 'rb')
else:
f = open(path, 'rb')
train_set = pickle.load(f)
test_set = pickle.load(f)
f.close()
pickle.dump(train_set, open('imdb.train.pkl', 'wb'))
pickle.dump(test_set, open('imdb.test.pkl', 'wb'))
if (not os.path.isfile('train.list')):
        open('train.list', 'w').write('imdb.train.pkl\n')
def main():
create_data('imdb.pkl')
if __name__ == "__main__":
main()
import io, os
import random
import numpy as np
import six.moves.cPickle as pickle
from paddle.trainer.PyDataProvider2 import *
def remove_unk(x, n_words):
return [[1 if w >= n_words else w for w in sen] for sen in x]
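# e.g. remove_unk([[2, 40000, 4]], n_words=30000) -> [[2, 1, 4]]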
# ==============================================================
# TensorFlow uses fixed-length sequences, while PaddlePaddle can
# process variable-length ones. Padding is applied here so the
# benchmark is comparable across platforms.
# ==============================================================
def pad_sequences(sequences,
maxlen=None,
dtype='int32',
padding='post',
truncating='post',
value=0.):
lengths = [len(s) for s in sequences]
nb_samples = len(sequences)
if maxlen is None:
maxlen = np.max(lengths)
x = (np.ones((nb_samples, maxlen)) * value).astype(dtype)
for idx, s in enumerate(sequences):
if len(s) == 0:
continue # empty list was found
if truncating == 'pre':
trunc = s[-maxlen:]
elif truncating == 'post':
trunc = s[:maxlen]
else:
raise ValueError("Truncating type '%s' not understood" % padding)
if padding == 'post':
x[idx, :len(trunc)] = trunc
elif padding == 'pre':
x[idx, -len(trunc):] = trunc
else:
raise ValueError("Padding type '%s' not understood" % padding)
return x
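# Example: with the defaults (post-padding/truncation to the longest input),
#   pad_sequences([[1, 2], [3, 4, 5]])
# returns array([[1, 2, 0],
#                [3, 4, 5]], dtype=int32).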
def initHook(settings, vocab_size, pad_seq, maxlen, **kwargs):
settings.vocab_size = vocab_size
settings.pad_seq = pad_seq
settings.maxlen = maxlen
settings.input_types = [
integer_value_sequence(vocab_size), integer_value(2)
]
@provider(
init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, file_name):
    f = open(file_name, 'rb')
train_set = pickle.load(f)
f.close()
x, y = train_set
    # map out-of-vocabulary word ids (>= vocab_size) to <unk> (id 1)
x = remove_unk(x, settings.vocab_size)
if settings.pad_seq:
x = pad_sequences(x, maxlen=settings.maxlen, value=0.)
for i in range(len(y)):
yield map(int, x[i]), int(y[i])
#!/usr/bin/env python
from paddle.trainer_config_helpers import *
import imdb
num_class = 2
vocab_size = 30000
fixedlen = 100
batch_size = get_config_arg('batch_size', int, 128)
lstm_num = get_config_arg('lstm_num', int, 1)
hidden_size = get_config_arg('hidden_size', int, 128)
# whether to pad sequence into fixed length
pad_seq = get_config_arg('pad_seq', bool, True)
imdb.create_data('imdb.pkl')
args = {'vocab_size': vocab_size, 'pad_seq': pad_seq, 'maxlen': fixedlen}
define_py_data_sources2(
"train.list", None, module="provider", obj="process", args=args)
settings(
batch_size=batch_size,
learning_rate=2e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25)
net = data_layer('data', size=vocab_size)
net = embedding_layer(input=net, size=128)
for i in xrange(lstm_num):
net = simple_lstm(input=net, size=hidden_size)
net = last_seq(input=net)
net = fc_layer(input=net, size=2, act=SoftmaxActivation())
lab = data_layer('label', num_class)
loss = classification_cost(input=net, label=lab)
outputs(loss)
set -e
function train() {
cfg=$1
thread=$2
args="lstm_num=${3},seq_pad=${4},hidden_size=${5},batch_size=${6}"
paddle train --job=time \
--config=$cfg \
--use_gpu=1 \
--trainer_count=$thread \
--log_period=10 \
--test_period=100 \
--num_passes=1 \
--feed_data=1 \
--config_args=$args \
>logs/rnn-pad${4}-${thread}gpu-lstm${3}-batch${6}-hid${5}.log 2>&1
}
if [ ! -d "logs" ]; then
mkdir logs
fi
## padding, single gpu
#-----config--gpu--lstm_num--padding--hidden_size--batch_size
## lstm_num=2, batch_size=64
train rnn.py 1 2 1 256 64
train rnn.py 1 2 1 512 64
train rnn.py 1 2 1 1280 64
## lstm_num=2, batch_size=128
train rnn.py 1 2 1 256 128
train rnn.py 1 2 1 512 128
train rnn.py 1 2 1 1280 128
## lstm_num=2, batch_size=256
train rnn.py 1 2 1 256 256
train rnn.py 1 2 1 512 256
train rnn.py 1 2 1 1280 256
#==================multi gpus=====================#
# hidden_size=256, lstm_num=2, different batch size
train rnn.py 4 2 1 256 128
train rnn.py 4 2 1 256 256
train rnn.py 4 2 1 256 512
# hidden_size=512, lstm_num=2, different batch size
train rnn.py 4 2 1 512 128
train rnn.py 4 2 1 512 256
train rnn.py 4 2 1 512 512
from six.moves import xrange # pylint: disable=redefined-builtin
from datetime import datetime
import math
import numpy as np
import time
import tensorflow.python.platform
import tensorflow as tf
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('batch_size', 128, """Batch size.""")
tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
tf.app.flags.DEFINE_boolean('forward_only', False,
"""Only run the forward pass.""")
tf.app.flags.DEFINE_boolean('forward_backward_only', False,
"""Only run the forward-forward pass.""")
tf.app.flags.DEFINE_string('data_format', 'NCHW',
"""The data format for Convnet operations.
Can be either NHWC or NCHW.
""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
"""Whether to log device placement.""")
def _conv(name, inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.0005):
with tf.name_scope(name) as scope:
kernel = tf.get_variable(
name + '_w', [kH, kW, nIn, nOut],
initializer=tf.truncated_normal_initializer(
stddev=0.01, dtype=tf.float32),
dtype=tf.float32)
if wd is not None and wd > 0:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
if FLAGS.data_format == 'NCHW':
strides = [1, 1, dH, dW]
else:
strides = [1, dH, dW, 1]
conv = tf.nn.conv2d(
inpOp,
kernel,
strides,
padding=padType,
data_format=FLAGS.data_format)
biases = tf.get_variable(
name=name + '_b',
shape=[nOut],
initializer=tf.constant_initializer(
value=0.0, dtype=tf.float32),
dtype=tf.float32)
bias = tf.reshape(
tf.nn.bias_add(
conv, biases, data_format=FLAGS.data_format),
conv.get_shape())
conv1 = tf.nn.relu(bias, name=scope)
return conv1
def _affine(name, inpOp, nIn, nOut, wd=0.0005, act=True, drop=None):
with tf.name_scope(name) as scope:
kernel = tf.get_variable(
name + '_w', [nIn, nOut],
initializer=tf.truncated_normal_initializer(
stddev=0.01, dtype=tf.float32),
dtype=tf.float32)
if wd is not None and wd > 0:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
biases = tf.get_variable(
name + '_b', [nOut],
initializer=tf.constant_initializer(
value=0.0, dtype=tf.float32),
dtype=tf.float32,
trainable=True)
affine1 = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \
tf.matmul(inpOp, kernel) + biases
output = tf.nn.dropout(affine1, drop) if drop else affine1
return output
def _mpool(name, inpOp, kH, kW, dH, dW):
if FLAGS.data_format == 'NCHW':
ksize = [1, 1, kH, kW]
strides = [1, 1, dH, dW]
else:
ksize = [1, kH, kW, 1]
strides = [1, dH, dW, 1]
return tf.nn.max_pool(
inpOp,
ksize=ksize,
strides=strides,
padding='VALID',
data_format=FLAGS.data_format,
name=name)
def _norm(name, l_input, lsize=4):
return tf.nn.lrn(l_input,
lsize,
bias=1.0,
alpha=0.001 / 9.0,
beta=0.75,
name=name)
def loss(logits, labels):
labels = tf.cast(labels, tf.int64)
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
logits, labels, name='cross_entropy_per_example')
cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
tf.add_to_collection('losses', cross_entropy_mean)
# The total loss is defined as the cross entropy loss plus all of the weight
# decay terms (L2 loss).
return tf.add_n(tf.get_collection('losses'), name='total_loss')
def get_incoming_shape(incoming):
""" Returns the incoming data shape """
if isinstance(incoming, tf.Tensor):
return incoming.get_shape().as_list()
    elif isinstance(incoming, (np.ndarray, list, tuple)):
return np.shape(incoming)
else:
raise Exception("Invalid incoming layer.")
def inference(images):
conv1 = _conv('conv1', images, 3, 96, 11, 11, 4, 4, 'VALID')
pool1 = _mpool('pool1', conv1, 3, 3, 2, 2)
norm1 = _norm('norm1', pool1, lsize=5)
conv2 = _conv('conv2', norm1, 96, 256, 5, 5, 1, 1, 'SAME')
pool2 = _mpool('pool2', conv2, 3, 3, 2, 2)
norm2 = _norm('norm2', pool2, lsize=5)
conv3 = _conv('conv3', norm2, 256, 384, 3, 3, 1, 1, 'SAME')
conv4 = _conv('conv4', conv3, 384, 384, 3, 3, 1, 1, 'SAME')
conv5 = _conv('conv5', conv4, 384, 256, 3, 3, 1, 1, 'SAME')
pool5 = _mpool('pool5', conv5, 3, 3, 2, 2)
resh1 = tf.reshape(pool5, [-1, 256 * 6 * 6])
    affn1 = _affine('fc6', resh1, 256 * 6 * 6, 4096, drop=0.5)
    affn2 = _affine('fc7', affn1, 4096, 4096, drop=0.5)
affn3 = _affine('fc8', affn2, 4096, 1000, wd=None, act=False) # last fc
return affn3
def time_tensorflow_run(session, target, info_string):
num_steps_burn_in = 10
total_duration = 0.0
total_duration_squared = 0.0
if not isinstance(target, list):
target = [target]
target_op = tf.group(*target)
for i in xrange(FLAGS.num_batches + num_steps_burn_in):
start_time = time.time()
_ = session.run(target_op)
duration = time.time() - start_time
        if i >= num_steps_burn_in:
if not i % 10:
print('%s: step %d, duration = %.3f' %
(datetime.now(), i - num_steps_burn_in, duration))
total_duration += duration
total_duration_squared += duration * duration
mn = total_duration / FLAGS.num_batches
vr = total_duration_squared / FLAGS.num_batches - mn * mn
sd = math.sqrt(vr)
print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' %
(datetime.now(), info_string, FLAGS.num_batches, mn, sd))
def _add_loss_summaries(total_loss):
"""
Generates moving average for all losses and associated summaries for
visualizing the performance of the network.
Args:
total_loss: Total loss from loss().
Returns:
loss_averages_op: op for generating moving averages of losses.
"""
# Compute the moving average of all individual losses and the total loss.
loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
losses = tf.get_collection('losses')
loss_averages_op = loss_averages.apply(losses + [total_loss])
# Attach a scalar summary to all individual losses and the total loss; do the
# same for the averaged version of the losses.
for l in losses + [total_loss]:
# Name each loss as '(raw)' and name the moving average version of the loss
# as the original loss name.
tf.scalar_summary(l.op.name + ' (raw)', l)
tf.scalar_summary(l.op.name, loss_averages.average(l))
return loss_averages_op
def run_benchmark():
with tf.Graph().as_default():
with tf.device('/gpu:0'):
# Generate some dummy images.
image_size = 224
            # Note that our padding definition is slightly different from cuda-convnet.
# In order to force the model to start with the same activations sizes,
# we add 3 to the image_size and employ VALID padding above.
if FLAGS.data_format == 'NCHW':
image_shape = [
FLAGS.batch_size, 3, image_size + 3, image_size + 3
]
else:
image_shape = [
FLAGS.batch_size, image_size + 3, image_size + 3, 3
]
images = tf.get_variable(
'image',
image_shape,
initializer=tf.truncated_normal_initializer(
stddev=0.1, dtype=tf.float32),
dtype=tf.float32,
trainable=False)
labels = tf.get_variable(
'label', [FLAGS.batch_size],
initializer=tf.constant_initializer(1),
dtype=tf.int32,
trainable=False)
# Build a Graph that computes the logits predictions from the
# inference model.
last_layer = inference(images)
objective = loss(last_layer, labels)
# Compute the gradient with respect to all the parameters.
# Compute gradients.
# opt = tf.train.GradientDescentOptimizer(0.001)
opt = tf.train.MomentumOptimizer(0.001, 0.9)
grads = opt.compute_gradients(objective)
global_step = tf.get_variable(
'global_step', [],
initializer=tf.constant_initializer(
0.0, dtype=tf.float32),
trainable=False,
dtype=tf.float32)
apply_gradient_op = opt.apply_gradients(
grads, global_step=global_step)
# Track the moving averages of all trainable variables.
variable_averages = tf.train.ExponentialMovingAverage(0.9,
global_step)
variables_averages_op = variable_averages.apply(
tf.trainable_variables())
# Build an initialization operation.
init = tf.initialize_all_variables()
# Start running operations on the Graph.
sess = tf.Session(config=tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=FLAGS.log_device_placement))
sess.run(init)
run_forward = True
run_forward_backward = True
if FLAGS.forward_only and FLAGS.forward_backward_only:
raise ValueError("Cannot specify --forward_only and "
"--forward_backward_only at the same time.")
if FLAGS.forward_only:
run_forward_backward = False
elif FLAGS.forward_backward_only:
run_forward = False
if run_forward:
time_tensorflow_run(sess, last_layer, "Forward")
if run_forward_backward:
with tf.control_dependencies(
[apply_gradient_op, variables_averages_op]):
train_op = tf.no_op(name='train')
time_tensorflow_run(sess, [train_op, objective],
"Forward-backward")
def main(_):
run_benchmark()
if __name__ == '__main__':
tf.app.run()
from six.moves import xrange # pylint: disable=redefined-builtin
from datetime import datetime
import math
import numpy as np
import re
import time
import tensorflow.python.platform
import tensorflow as tf
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('batch_size', 64, """Batch size.""")
tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
tf.app.flags.DEFINE_string('data_format', 'NCHW',
"""The data format for Convnet operations.
Can be either NHWC or NCHW.
""")
tf.app.flags.DEFINE_string('train_dir', '/train_model',
"""Directory where to write event logs """
"""and checkpoint.""")
tf.app.flags.DEFINE_integer('num_gpus', 4, """How many GPUs to use.""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
"""Whether to log device placement.""")
NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
NUM_EPOCHS_PER_DECAY = 50
INITIAL_LEARNING_RATE = 0.1
LEARNING_RATE_DECAY_FACTOR = 0.1
TOWER_NAME = 'tower'
def _conv(name, inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.005):
with tf.name_scope(name) as scope:
kernel = tf.get_variable(
name + '_w', [kH, kW, nIn, nOut],
initializer=tf.truncated_normal_initializer(
stddev=0.01, dtype=tf.float32),
dtype=tf.float32)
if wd is not None:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
if FLAGS.data_format == 'NCHW':
strides = [1, 1, dH, dW]
else:
strides = [1, dH, dW, 1]
conv = tf.nn.conv2d(
inpOp,
kernel,
strides,
padding=padType,
data_format=FLAGS.data_format)
biases = tf.get_variable(
name=name + '_b',
shape=[nOut],
initializer=tf.constant_initializer(
value=0.0, dtype=tf.float32),
dtype=tf.float32)
bias = tf.reshape(
tf.nn.bias_add(
conv, biases, data_format=FLAGS.data_format),
conv.get_shape())
conv1 = tf.nn.relu(bias, name=scope)
return conv1
def _affine(name, inpOp, nIn, nOut, wd=0.005, act=True):
with tf.name_scope(name) as scope:
kernel = tf.get_variable(
name + '_w', [nIn, nOut],
initializer=tf.truncated_normal_initializer(
stddev=0.01, dtype=tf.float32),
dtype=tf.float32)
if wd is not None:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
biases = tf.get_variable(
name + '_b', [nOut],
initializer=tf.constant_initializer(
value=0.0, dtype=tf.float32),
dtype=tf.float32,
trainable=True)
affine1 = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \
tf.matmul(inpOp, kernel) + biases
return affine1
def _mpool(name, inpOp, kH, kW, dH, dW):
if FLAGS.data_format == 'NCHW':
ksize = [1, 1, kH, kW]
strides = [1, 1, dH, dW]
else:
ksize = [1, kH, kW, 1]
strides = [1, dH, dW, 1]
return tf.nn.max_pool(
inpOp,
ksize=ksize,
strides=strides,
padding='VALID',
data_format=FLAGS.data_format,
name=name)
def _norm(name, l_input, lsize=4):
return tf.nn.lrn(l_input,
lsize,
bias=1.0,
alpha=0.001 / 9.0,
beta=0.75,
name=name)
def loss(logits, labels):
labels = tf.cast(labels, tf.int64)
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
logits, labels, name='cross_entropy_per_example')
cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
tf.add_to_collection('losses', cross_entropy_mean)
# The total loss is defined as the cross entropy loss plus all of the weight
# decay terms (L2 loss).
return tf.add_n(tf.get_collection('losses'), name='total_loss')
def get_incoming_shape(incoming):
""" Returns the incoming data shape """
if isinstance(incoming, tf.Tensor):
return incoming.get_shape().as_list()
    elif isinstance(incoming, (np.ndarray, list, tuple)):
return np.shape(incoming)
else:
raise Exception("Invalid incoming layer.")
def inference(images):
conv1 = _conv('conv1', images, 3, 96, 11, 11, 4, 4, 'VALID')
pool1 = _mpool('pool1', conv1, 3, 3, 2, 2)
norm1 = _norm('norm1', pool1, lsize=5)
conv2 = _conv('conv2', norm1, 96, 256, 5, 5, 1, 1, 'SAME')
pool2 = _mpool('pool2', conv2, 3, 3, 2, 2)
norm2 = _norm('norm2', pool2, lsize=5)
conv3 = _conv('conv3', norm2, 256, 384, 3, 3, 1, 1, 'SAME')
conv4 = _conv('conv4', conv3, 384, 384, 3, 3, 1, 1, 'SAME')
conv5 = _conv('conv5', conv4, 384, 256, 3, 3, 1, 1, 'SAME')
pool5 = _mpool('pool5', conv5, 3, 3, 2, 2)
resh1 = tf.reshape(pool5, [-1, 256 * 6 * 6])
affn1 = _affine('fc6', resh1, 256 * 6 * 6, 4096)
affn2 = _affine('fc7', affn1, 4096, 4096)
affn3 = _affine('fc8', affn2, 4096, 1000, wd=None, act=False) # last fc
return affn3
def tower_loss(scope):
"""Calculate the total loss on a single tower running the model.
Args:
scope: unique prefix string identifying the tower, e.g. 'tower_0'
Returns:
Tensor of shape [] containing the total loss for a batch of data
"""
image_size = 224
if FLAGS.data_format == 'NCHW':
image_shape = [FLAGS.batch_size, 3, image_size + 3, image_size + 3]
else:
image_shape = [FLAGS.batch_size, image_size + 3, image_size + 3, 3]
images = tf.get_variable(
'image',
image_shape,
initializer=tf.truncated_normal_initializer(
stddev=0.1, dtype=tf.float32),
dtype=tf.float32,
trainable=False)
labels = tf.get_variable(
'label', [FLAGS.batch_size],
initializer=tf.constant_initializer(1),
dtype=tf.int32,
trainable=False)
# Build a Graph that computes the logits predictions from the
# inference model.
last_layer = inference(images)
# Build the portion of the Graph calculating the losses. Note that we will
# assemble the total_loss using a custom function below.
_ = loss(last_layer, labels)
# Assemble all of the losses for the current tower only.
losses = tf.get_collection('losses', scope)
# Calculate the total loss for the current tower.
total_loss = tf.add_n(losses, name='total_loss')
# Compute the moving average of all individual losses and the total loss.
loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
loss_averages_op = loss_averages.apply(losses + [total_loss])
# Attach a scalar summary to all individual losses and the total loss; do the
# same for the averaged version of the losses.
for l in losses + [total_loss]:
# Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
# session. This helps the clarity of presentation on tensorboard.
loss_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', l.op.name)
# Name each loss as '(raw)' and name the moving average version of the loss
# as the original loss name.
tf.scalar_summary(loss_name + ' (raw)', l)
tf.scalar_summary(loss_name, loss_averages.average(l))
with tf.control_dependencies([loss_averages_op]):
total_loss = tf.identity(total_loss)
return total_loss
def average_gradients(tower_grads):
"""Calculate the average gradient for each shared variable across all towers.
Note that this function provides a synchronization point across all towers.
Args:
tower_grads: List of lists of (gradient, variable) tuples. The outer list
is over individual gradients. The inner list is over the gradient
calculation for each tower.
Returns:
List of pairs of (gradient, variable) where the gradient has been averaged
across all towers.
"""
average_grads = []
for grad_and_vars in zip(*tower_grads):
# Note that each grad_and_vars looks like the following:
# ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
grads = []
for g, _ in grad_and_vars:
# Add 0 dimension to the gradients to represent the tower.
expanded_g = tf.expand_dims(g, 0)
# Append on a 'tower' dimension which we will average over below.
grads.append(expanded_g)
# Average over the 'tower' dimension.
grad = tf.concat(0, grads)
grad = tf.reduce_mean(grad, 0)
# Keep in mind that the Variables are redundant because they are shared
# across towers. So .. we will just return the first tower's pointer to
# the Variable.
v = grad_and_vars[0][1]
grad_and_var = (grad, v)
average_grads.append(grad_and_var)
return average_grads
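# Data-flow sketch for two towers and a single variable v:
#   tower_grads = [[(g_v_gpu0, v)], [(g_v_gpu1, v)]]
#   average_gradients(tower_grads) -> [((g_v_gpu0 + g_v_gpu1) / 2, v)]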
def time_tensorflow_run(session, target):
num_steps_burn_in = 50
total_duration = 0.0
total_duration_squared = 0.0
for i in xrange(FLAGS.num_batches + num_steps_burn_in):
start_time = time.time()
_, loss_value = session.run(target)
duration = time.time() - start_time
        if i >= num_steps_burn_in:
if not i % 10:
num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
examples_per_sec = num_examples_per_step / duration
sec_per_batch = duration
format_str = (
'%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
'sec/batch batch_size = %d)')
                print(format_str %
                      (datetime.now(), i - num_steps_burn_in, loss_value,
                       examples_per_sec, sec_per_batch, num_examples_per_step))
total_duration += duration
total_duration_squared += duration * duration
mn = total_duration / FLAGS.num_batches
vr = total_duration_squared / FLAGS.num_batches - mn * mn
sd = math.sqrt(vr)
print('%s: FwdBwd across %d steps, %.3f +/- %.3f sec / batch' %
(datetime.now(), FLAGS.num_batches, mn, sd))
def run_benchmark():
with tf.Graph().as_default(), tf.device('/cpu:0'):
# Create a variable to count the number of train() calls. This equals the
# number of batches processed * FLAGS.num_gpus.
global_step = tf.get_variable(
'global_step', [],
initializer=tf.constant_initializer(0),
trainable=False)
# Calculate the learning rate schedule.
num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
FLAGS.batch_size)
decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
# Decay the learning rate exponentially based on the number of steps.
lr = tf.train.exponential_decay(
INITIAL_LEARNING_RATE,
global_step,
decay_steps,
LEARNING_RATE_DECAY_FACTOR,
staircase=True)
# Create an optimizer that performs gradient descent.
opt = tf.train.MomentumOptimizer(lr, 0.9)
# Calculate the gradients for each model tower.
tower_grads = []
for i in xrange(FLAGS.num_gpus):
with tf.device('/gpu:%d' % i):
with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
# Calculate the loss for one tower of the model. This function
# constructs the entire model but shares the variables across
# all towers.
loss = tower_loss(scope)
# Reuse variables for the next tower.
tf.get_variable_scope().reuse_variables()
# Retain the summaries from the final tower.
summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
# Calculate the gradients for the batch of data on this tower.
grads = opt.compute_gradients(loss)
# Keep track of the gradients across all towers.
tower_grads.append(grads)
# We must calculate the mean of each gradient. Note that this is the
# synchronization point across all towers.
grads = average_gradients(tower_grads)
# Apply the gradients to adjust the shared variables.
apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
# Group all updates to into a single train op.
train_op = tf.group(apply_gradient_op)
# Build an initialization operation.
init = tf.initialize_all_variables()
# Start running operations on the Graph. allow_soft_placement must be set to
# True to build towers on GPU, as some of the ops do not have GPU
# implementations.
sess = tf.Session(config=tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=FLAGS.log_device_placement))
sess.run(init)
time_tensorflow_run(sess, [train_op, loss])
def main(_):
run_benchmark()
if __name__ == '__main__':
tf.app.run()
from six.moves import xrange
from datetime import datetime
import math
import time
import tensorflow.python.platform
import tensorflow as tf
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('batch_size', 128, """Batch size.""")
tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
tf.app.flags.DEFINE_boolean('forward_only', False,
"""Only run the forward pass.""")
tf.app.flags.DEFINE_boolean('forward_backward_only', False,
"""Only run the forward-forward pass.""")
tf.app.flags.DEFINE_string('data_format', 'NCHW',
"""The data format for Convnet operations.
Can be either NHWC or NCHW.
""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
"""Whether to log device placement.""")
parameters = []
conv_counter = 1
pool_counter = 1
affine_counter = 1
def _conv(inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.0005):
global conv_counter
global parameters
name = 'conv' + str(conv_counter)
conv_counter += 1
with tf.name_scope(name) as scope:
kernel = tf.Variable(
tf.truncated_normal(
[kH, kW, nIn, nOut], dtype=tf.float32, stddev=1e-1),
name='weights')
if wd is not None and wd > 0:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
if FLAGS.data_format == 'NCHW':
strides = [1, 1, dH, dW]
else:
strides = [1, dH, dW, 1]
conv = tf.nn.conv2d(
inpOp,
kernel,
strides,
padding=padType,
data_format=FLAGS.data_format)
biases = tf.Variable(
tf.constant(
0.0, shape=[nOut], dtype=tf.float32),
trainable=True,
name='biases')
bias = tf.reshape(
tf.nn.bias_add(
conv, biases, data_format=FLAGS.data_format),
conv.get_shape())
conv1 = tf.nn.relu(bias, name=scope)
parameters += [kernel, biases]
return conv1
def _affine(inpOp, nIn, nOut, act=True, wd=0.0005):
global affine_counter
global parameters
name = 'affine' + str(affine_counter)
affine_counter += 1
with tf.name_scope(name) as scope:
kernel = tf.Variable(
tf.truncated_normal(
[nIn, nOut], dtype=tf.float32, stddev=1e-1),
name='weights')
if wd is not None and wd > 0:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
biases = tf.Variable(
tf.constant(
0.0, shape=[nOut], dtype=tf.float32),
trainable=True,
name='biases')
affine1 = tf.nn.relu_layer(
inpOp, kernel, biases,
name=name) if act else tf.matmul(inpOp, kernel) + biases
parameters += [kernel, biases]
return affine1
def _mpool(inpOp, kH, kW, dH, dW, padding):
global pool_counter
global parameters
name = 'pool' + str(pool_counter)
pool_counter += 1
if FLAGS.data_format == 'NCHW':
ksize = [1, 1, kH, kW]
strides = [1, 1, dH, dW]
else:
ksize = [1, kH, kW, 1]
strides = [1, dH, dW, 1]
return tf.nn.max_pool(
inpOp,
ksize=ksize,
strides=strides,
padding=padding,
data_format=FLAGS.data_format,
name=name)
def _apool(inpOp, kH, kW, dH, dW, padding):
global pool_counter
global parameters
name = 'pool' + str(pool_counter)
pool_counter += 1
if FLAGS.data_format == 'NCHW':
ksize = [1, 1, kH, kW]
strides = [1, 1, dH, dW]
else:
ksize = [1, kH, kW, 1]
strides = [1, dH, dW, 1]
return tf.nn.avg_pool(
inpOp,
ksize=ksize,
strides=strides,
padding=padding,
data_format=FLAGS.data_format,
name=name)
def _inception(inp, inSize, o1s, o2s1, o2s2, o3s1, o3s2, o4s1, o4s2):
conv1 = _conv(inp, inSize, o1s, 1, 1, 1, 1, 'VALID')
conv3_ = _conv(inp, inSize, o2s1, 1, 1, 1, 1, 'VALID')
conv3 = _conv(conv3_, o2s1, o2s2, 3, 3, 1, 1, 'SAME')
conv5_ = _conv(inp, inSize, o3s1, 1, 1, 1, 1, 'VALID')
conv5 = _conv(conv5_, o3s1, o3s2, 5, 5, 1, 1, 'SAME')
pool_ = _mpool(inp, o4s1, o4s1, 1, 1, 'SAME')
pool = _conv(pool_, inSize, o4s2, 1, 1, 1, 1, 'VALID')
if FLAGS.data_format == 'NCHW':
channel_dim = 1
else:
channel_dim = 3
incept = tf.concat(channel_dim, [conv1, conv3, conv5, pool])
return incept
def loss(logits, labels):
batch_size = tf.size(labels)
labels = tf.expand_dims(labels, 1)
indices = tf.expand_dims(tf.range(0, batch_size, 1), 1)
concated = tf.concat(1, [indices, labels])
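    # scatter 1.0 at (row, label) to build dense one-hot targets of shape
    # [batch_size, 1000]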
onehot_labels = tf.sparse_to_dense(concated,
tf.pack([batch_size, 1000]), 1.0, 0.0)
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
logits, onehot_labels, name='xentropy')
loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')
return loss
def inference(images):
# stage 1
conv1 = _conv(images, 3, 64, 7, 7, 2, 2, 'SAME')
pool1 = _mpool(conv1, 3, 3, 2, 2, 'SAME')
# stage 2
conv2 = _conv(pool1, 64, 64, 1, 1, 1, 1, 'VALID')
conv3 = _conv(conv2, 64, 192, 3, 3, 1, 1, 'SAME')
pool3 = _mpool(conv3, 3, 3, 2, 2, 'SAME')
# stage 3
incept3a = _inception(pool3, 192, 64, 96, 128, 16, 32, 3, 32)
incept3b = _inception(incept3a, 256, 128, 128, 192, 32, 96, 3, 64)
pool4 = _mpool(incept3b, 3, 3, 2, 2, 'SAME')
# stage 4
incept4a = _inception(pool4, 480, 192, 96, 208, 16, 48, 3, 64)
incept4b = _inception(incept4a, 512, 160, 112, 224, 24, 64, 3, 64)
incept4c = _inception(incept4b, 512, 128, 128, 256, 24, 64, 3, 64)
incept4d = _inception(incept4c, 512, 112, 144, 288, 32, 64, 3, 64)
incept4e = _inception(incept4d, 528, 256, 160, 320, 32, 128, 3, 128)
pool5 = _mpool(incept4e, 3, 3, 2, 2, 'SAME')
# stage 5
incept5a = _inception(pool5, 832, 256, 160, 320, 32, 128, 3, 128)
incept5b = _inception(incept5a, 832, 384, 192, 384, 48, 128, 3, 128)
pool6 = _apool(incept5b, 7, 7, 1, 1, 'VALID')
# output 1
resh1 = tf.reshape(pool6, [-1, 1024])
    # tf.nn.dropout's second argument is keep_prob; keep 60% to mirror
    # PaddlePaddle's dropout_rate=0.4, and feed the dropped tensor forward.
    drop = tf.nn.dropout(resh1, 0.6)
    affn1 = _affine(drop, 1024, 1000, act=False)
return affn1
def time_tensorflow_run(session, target, info_string):
num_steps_burn_in = 10
total_duration = 0.0
total_duration_squared = 0.0
if not isinstance(target, list):
target = [target]
target_op = tf.group(*target)
for i in range(FLAGS.num_batches + num_steps_burn_in):
start_time = time.time()
_ = session.run(target_op)
duration = time.time() - start_time
        if i >= num_steps_burn_in:
if not i % 10:
print('%s: step %d, duration = %.3f' %
(datetime.now(), i - num_steps_burn_in, duration))
total_duration += duration
total_duration_squared += duration * duration
mn = total_duration / FLAGS.num_batches
vr = total_duration_squared / FLAGS.num_batches - mn * mn
sd = math.sqrt(vr)
print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' %
(datetime.now(), info_string, FLAGS.num_batches, mn, sd))
def run_benchmark():
global parameters
with tf.Graph().as_default():
# Generate some dummy images.
image_size = 224
if FLAGS.data_format == 'NCHW':
image_shape = [FLAGS.batch_size, 3, image_size, image_size]
else:
image_shape = [FLAGS.batch_size, image_size, image_size, 3]
images = tf.get_variable(
'image',
image_shape,
initializer=tf.truncated_normal_initializer(
stddev=0.1, dtype=tf.float32),
dtype=tf.float32,
trainable=False)
labels = tf.get_variable(
'label', [FLAGS.batch_size],
initializer=tf.constant_initializer(1),
dtype=tf.int32,
trainable=False)
# Build a Graph that computes the logits predictions from the
# inference model.
last_layer = inference(images)
objective = loss(last_layer, labels)
# Compute gradients.
# opt = tf.train.GradientDescentOptimizer(0.001)
opt = tf.train.MomentumOptimizer(0.001, 0.9)
grads = opt.compute_gradients(objective)
global_step = tf.get_variable(
'global_step', [],
initializer=tf.constant_initializer(
0.0, dtype=tf.float32),
trainable=False,
dtype=tf.float32)
apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
# Track the moving averages of all trainable variables.
variable_averages = tf.train.ExponentialMovingAverage(0.9, global_step)
variables_averages_op = variable_averages.apply(tf.trainable_variables(
))
# Build an initialization operation.
init = tf.initialize_all_variables()
# Start running operations on the Graph.
sess = tf.Session(config=tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=FLAGS.log_device_placement))
sess.run(init)
run_forward = True
run_forward_backward = True
if FLAGS.forward_only and FLAGS.forward_backward_only:
raise ValueError("Cannot specify --forward_only and "
"--forward_backward_only at the same time.")
if FLAGS.forward_only:
run_forward_backward = False
elif FLAGS.forward_backward_only:
run_forward = False
if run_forward:
# Run the forward benchmark.
time_tensorflow_run(sess, last_layer, "Forward")
if run_forward_backward:
with tf.control_dependencies(
[apply_gradient_op, variables_averages_op]):
train_op = tf.no_op(name='train')
time_tensorflow_run(sess, [train_op, objective], "Forward-backward")
def main(_):
run_benchmark()
if __name__ == '__main__':
tf.app.run()
from six.moves import xrange # pylint: disable=redefined-builtin
from datetime import datetime
import math
import numpy as np
import re
import time
import tensorflow.python.platform
import tensorflow as tf
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('batch_size', 64, """Batch size.""")
tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
tf.app.flags.DEFINE_string('data_format', 'NCHW',
"""The data format for Convnet operations.
Can be either NHWC or NCHW.
""")
tf.app.flags.DEFINE_string('train_dir', '/train_model',
"""Directory where to write event logs """
"""and checkpoint.""")
tf.app.flags.DEFINE_integer('num_gpus', 4, """How many GPUs to use.""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
"""Whether to log device placement.""")
NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
NUM_EPOCHS_PER_DECAY = 50
INITIAL_LEARNING_RATE = 0.1
LEARNING_RATE_DECAY_FACTOR = 0.1
TOWER_NAME = 'tower'
def _conv(name, inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.005):
with tf.name_scope(name) as scope:
kernel = tf.get_variable(
name + '_w', [kH, kW, nIn, nOut],
initializer=tf.truncated_normal_initializer(
stddev=0.01, dtype=tf.float32),
dtype=tf.float32)
if wd is not None:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
if FLAGS.data_format == 'NCHW':
strides = [1, 1, dH, dW]
else:
strides = [1, dH, dW, 1]
conv = tf.nn.conv2d(
inpOp,
kernel,
strides,
padding=padType,
data_format=FLAGS.data_format)
biases = tf.get_variable(
name=name + '_b',
shape=[nOut],
initializer=tf.constant_initializer(
value=0.0, dtype=tf.float32),
dtype=tf.float32)
bias = tf.reshape(
tf.nn.bias_add(
conv, biases, data_format=FLAGS.data_format),
conv.get_shape())
conv1 = tf.nn.relu(bias, name=scope)
return conv1
def _affine(name, inpOp, nIn, nOut, wd=0.005, act=True):
with tf.name_scope(name) as scope:
kernel = tf.get_variable(
name + '_w', [nIn, nOut],
initializer=tf.truncated_normal_initializer(
stddev=0.01, dtype=tf.float32),
dtype=tf.float32)
if wd is not None:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
biases = tf.get_variable(
name + '_b', [nOut],
initializer=tf.constant_initializer(
value=0.0, dtype=tf.float32),
dtype=tf.float32,
trainable=True)
affine1 = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \
tf.matmul(inpOp, kernel) + biases
return affine1
def _mpool(name, inpOp, kH, kW, dH, dW, padding):
if FLAGS.data_format == 'NCHW':
ksize = [1, 1, kH, kW]
strides = [1, 1, dH, dW]
else:
ksize = [1, kH, kW, 1]
strides = [1, dH, dW, 1]
return tf.nn.max_pool(
inpOp,
ksize=ksize,
strides=strides,
padding=padding,
data_format=FLAGS.data_format,
name=name)
def _apool(name, inpOp, kH, kW, dH, dW, padding):
if FLAGS.data_format == 'NCHW':
ksize = [1, 1, kH, kW]
strides = [1, 1, dH, dW]
else:
ksize = [1, kH, kW, 1]
strides = [1, dH, dW, 1]
return tf.nn.avg_pool(
inpOp,
ksize=ksize,
strides=strides,
padding=padding,
data_format=FLAGS.data_format,
name=name)
def loss(logits, labels):
labels = tf.cast(labels, tf.int64)
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
logits, labels, name='cross_entropy_per_example')
cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
tf.add_to_collection('losses', cross_entropy_mean)
# The total loss is defined as the cross entropy loss plus all of the weight
# decay terms (L2 loss).
return tf.add_n(tf.get_collection('losses'), name='total_loss')
def get_incoming_shape(incoming):
""" Returns the incoming data shape """
if isinstance(incoming, tf.Tensor):
return incoming.get_shape().as_list()
    elif isinstance(incoming, (np.ndarray, list, tuple)):
return np.shape(incoming)
else:
raise Exception("Invalid incoming layer.")
def _inception(name, inp, inSize, o1s, o2s1, o2s2, o3s1, o3s2, o4s1, o4s2):
conv1 = _conv(name + '_1', inp, inSize, o1s, 1, 1, 1, 1, 'VALID')
conv3_ = _conv(name + '_3r', inp, inSize, o2s1, 1, 1, 1, 1, 'VALID')
conv3 = _conv(name + '_3', conv3_, o2s1, o2s2, 3, 3, 1, 1, 'SAME')
conv5_ = _conv(name + '_5r', inp, inSize, o3s1, 1, 1, 1, 1, 'VALID')
    conv5 = _conv(name + '_5', conv5_, o3s1, o3s2, 5, 5, 1, 1, 'SAME')
    pool_ = _mpool(name + '_pool', inp, o4s1, o4s1, 1, 1, 'SAME')
    pool = _conv(name + '_proj', pool_, inSize, o4s2, 1, 1, 1, 1, 'VALID')
if FLAGS.data_format == 'NCHW':
channel_dim = 1
else:
channel_dim = 3
incept = tf.concat(channel_dim, [conv1, conv3, conv5, pool])
return incept
def inference(images):
# stage 1
conv1 = _conv('conv1', images, 3, 64, 7, 7, 2, 2, 'SAME')
pool1 = _mpool('pool1', conv1, 3, 3, 2, 2, 'SAME')
# stage 2
conv2 = _conv('conv2', pool1, 64, 64, 1, 1, 1, 1, 'VALID')
conv3 = _conv('conv3', conv2, 64, 192, 3, 3, 1, 1, 'SAME')
pool3 = _mpool('pool3', conv3, 3, 3, 2, 2, 'SAME')
# stage 3
incept3a = _inception('ince3a', pool3, 192, 64, 96, 128, 16, 32, 3, 32)
incept3b = _inception('ince3b', incept3a, 256, 128, 128, 192, 32, 96, 3, 64)
pool4 = _mpool('pool4', incept3b, 3, 3, 2, 2, 'SAME')
# stage 4
incept4a = _inception('ince4a', pool4, 480, 192, 96, 208, 16, 48, 3, 64)
incept4b = _inception('ince4b', incept4a, 512, 160, 112, 224, 24, 64, 3, 64)
incept4c = _inception('ince4c', incept4b, 512, 128, 128, 256, 24, 64, 3, 64)
incept4d = _inception('ince4d', incept4c, 512, 112, 144, 288, 32, 64, 3, 64)
incept4e = _inception('ince4e', incept4d, 528, 256, 160, 320, 32, 128, 3,
128)
pool5 = _mpool('pool5', incept4e, 3, 3, 2, 2, 'SAME')
# stage 5
incept5a = _inception('ince5a', pool5, 832, 256, 160, 320, 32, 128, 3, 128)
incept5b = _inception('ince5b', incept5a, 832, 384, 192, 384, 48, 128, 3,
128)
pool6 = _apool('pool6', incept5b, 7, 7, 1, 1, 'VALID')
# output 1
resh1 = tf.reshape(pool6, [-1, 1024])
    # tf.nn.dropout's second argument is keep_prob; keep 60% to mirror
    # PaddlePaddle's dropout_rate=0.4, and feed the dropped tensor forward.
    drop = tf.nn.dropout(resh1, 0.6)
    affn1 = _affine('fc_out', drop, 1024, 1000, act=False)
return affn1
def tower_loss(scope):
"""Calculate the total loss on a single tower running the model.
Args:
scope: unique prefix string identifying the tower, e.g. 'tower_0'
Returns:
Tensor of shape [] containing the total loss for a batch of data
"""
image_size = 224
if FLAGS.data_format == 'NCHW':
image_shape = [FLAGS.batch_size, 3, image_size, image_size]
else:
image_shape = [FLAGS.batch_size, image_size, image_size, 3]
images = tf.get_variable(
'image',
image_shape,
initializer=tf.truncated_normal_initializer(
stddev=0.1, dtype=tf.float32),
dtype=tf.float32,
trainable=False)
labels = tf.get_variable(
'label', [FLAGS.batch_size],
initializer=tf.constant_initializer(1),
dtype=tf.int32,
trainable=False)
# Build a Graph that computes the logits predictions from the
# inference model.
last_layer = inference(images)
# Build the portion of the Graph calculating the losses. Note that we will
# assemble the total_loss using a custom function below.
_ = loss(last_layer, labels)
# Assemble all of the losses for the current tower only.
losses = tf.get_collection('losses', scope)
# Calculate the total loss for the current tower.
total_loss = tf.add_n(losses, name='total_loss')
# Compute the moving average of all individual losses and the total loss.
loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
loss_averages_op = loss_averages.apply(losses + [total_loss])
# Attach a scalar summary to all individual losses and the total loss; do the
# same for the averaged version of the losses.
for l in losses + [total_loss]:
# Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
# session. This helps the clarity of presentation on tensorboard.
loss_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', l.op.name)
# Name each loss as '(raw)' and name the moving average version of the loss
# as the original loss name.
tf.scalar_summary(loss_name + ' (raw)', l)
tf.scalar_summary(loss_name, loss_averages.average(l))
with tf.control_dependencies([loss_averages_op]):
total_loss = tf.identity(total_loss)
return total_loss
def average_gradients(tower_grads):
"""Calculate the average gradient for each shared variable across all towers.
Note that this function provides a synchronization point across all towers.
Args:
tower_grads: List of lists of (gradient, variable) tuples. The outer list
is over individual gradients. The inner list is over the gradient
calculation for each tower.
Returns:
List of pairs of (gradient, variable) where the gradient has been averaged
across all towers.
"""
average_grads = []
for grad_and_vars in zip(*tower_grads):
# Note that each grad_and_vars looks like the following:
# ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
grads = []
for g, _ in grad_and_vars:
# Add 0 dimension to the gradients to represent the tower.
expanded_g = tf.expand_dims(g, 0)
# Append on a 'tower' dimension which we will average over below.
grads.append(expanded_g)
# Average over the 'tower' dimension.
grad = tf.concat(0, grads)
grad = tf.reduce_mean(grad, 0)
# Keep in mind that the Variables are redundant because they are shared
# across towers. So .. we will just return the first tower's pointer to
# the Variable.
v = grad_and_vars[0][1]
grad_and_var = (grad, v)
average_grads.append(grad_and_var)
return average_grads
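# As in alexnet_multi_gpu: stack each variable's per-tower gradients and
# average over the tower dimension, returning (mean_grad, variable) pairs.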
def time_tensorflow_run(session, target):
num_steps_burn_in = 50
total_duration = 0.0
total_duration_squared = 0.0
for i in xrange(FLAGS.num_batches + num_steps_burn_in):
start_time = time.time()
_, loss_value = session.run(target)
duration = time.time() - start_time
        if i >= num_steps_burn_in:
if not i % 10:
num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
examples_per_sec = num_examples_per_step / duration
sec_per_batch = duration
format_str = (
'%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
'sec/batch batch_size = %d)')
                print(format_str %
                      (datetime.now(), i - num_steps_burn_in, loss_value,
                       examples_per_sec, sec_per_batch, num_examples_per_step))
total_duration += duration
total_duration_squared += duration * duration
mn = total_duration / FLAGS.num_batches
vr = total_duration_squared / FLAGS.num_batches - mn * mn
sd = math.sqrt(vr)
print('%s: FwdBwd across %d steps, %.3f +/- %.3f sec / batch' %
(datetime.now(), FLAGS.num_batches, mn, sd))
def run_benchmark():
with tf.Graph().as_default(), tf.device('/cpu:0'):
# Create a variable to count the number of train() calls. This equals the
# number of batches processed * FLAGS.num_gpus.
global_step = tf.get_variable(
'global_step', [],
initializer=tf.constant_initializer(0),
trainable=False)
# Calculate the learning rate schedule.
num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
FLAGS.batch_size)
decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
# Decay the learning rate exponentially based on the number of steps.
lr = tf.train.exponential_decay(
INITIAL_LEARNING_RATE,
global_step,
decay_steps,
LEARNING_RATE_DECAY_FACTOR,
staircase=True)
# Create an optimizer that performs gradient descent.
opt = tf.train.MomentumOptimizer(lr, 0.9)
# Calculate the gradients for each model tower.
tower_grads = []
for i in xrange(FLAGS.num_gpus):
with tf.device('/gpu:%d' % i):
with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
# Calculate the loss for one tower of the model. This function
# constructs the entire model but shares the variables across
# all towers.
loss = tower_loss(scope)
# Reuse variables for the next tower.
tf.get_variable_scope().reuse_variables()
# Retain the summaries from the final tower.
summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
# Calculate the gradients for the batch of data on this tower.
grads = opt.compute_gradients(loss)
# Keep track of the gradients across all towers.
tower_grads.append(grads)
# We must calculate the mean of each gradient. Note that this is the
# synchronization point across all towers.
grads = average_gradients(tower_grads)
# Apply the gradients to adjust the shared variables.
apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
# Group all updates to into a single train op.
train_op = tf.group(apply_gradient_op)
# Build an initialization operation.
init = tf.initialize_all_variables()
# Start running operations on the Graph. allow_soft_placement must be set to
# True to build towers on GPU, as some of the ops do not have GPU
# implementations.
sess = tf.Session(config=tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=FLAGS.log_device_placement))
sess.run(init)
time_tensorflow_run(sess, [train_op, loss])
def main(_):
run_benchmark()
if __name__ == '__main__':
tf.app.run()
set -e
function test() {
cfg=$1
batch_size=$2
prefix=$3
python $cfg --batch_size=$batch_size > logs/${prefix}-1gpu-${batch_size}.log 2>&1
}
if [ ! -d "logs" ]; then
mkdir logs
fi
# alexnet
test alexnet.py 64 alexnet
test alexnet.py 128 alexnet
test alexnet.py 256 alexnet
test alexnet.py 512 alexnet
# googlenet
test googlenet.py 64 googlenet
test googlenet.py 128 googlenet
# smallnet
test smallnet_mnist_cifar.py 64 smallnet
test smallnet_mnist_cifar.py 128 smallnet
test smallnet_mnist_cifar.py 256 smallnet
test smallnet_mnist_cifar.py 512 smallnet
set -e
function test() {
cfg=$1
num_gpu=$2
batch_size=$3
batch_per_gpu=`expr ${batch_size} / ${num_gpu}`
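  # each tower gets batch_size / num_gpu samples, so the global batch size
  # stays comparable to the single-GPU runs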
prefix=$4
python $cfg --num_gpus=$num_gpu --batch_size=${batch_per_gpu} > logs/${prefix}-4gpu-${batch_size}.log 2>&1
}
if [ ! -d "logs" ]; then
mkdir logs
fi
# alexnet
test alexnet_multi_gpu.py 4 512 alexnet
test alexnet_multi_gpu.py 4 1024 alexnet
# googlenet
test googlenet_multi_gpu.py 4 512 googlenet
test googlenet_multi_gpu.py 4 1024 googlenet
from six.moves import xrange # pylint: disable=redefined-builtin
from datetime import datetime
import math
import numpy as np
import time
import tensorflow.python.platform
import tensorflow as tf
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('batch_size', 128, """Batch size.""")
tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
tf.app.flags.DEFINE_boolean('forward_only', False,
"""Only run the forward pass.""")
tf.app.flags.DEFINE_boolean('forward_backward_only', False,
"""Only run the forward-forward pass.""")
tf.app.flags.DEFINE_string('data_format', 'NCHW',
"""The data format for Convnet operations.
Can be either NHWC or NCHW.
""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
"""Whether to log device placement.""")
parameters = []
conv_counter = 1
pool_counter = 1
affine_counter = 1
def _conv(inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.005, act=True):
global conv_counter
global parameters
name = 'conv' + str(conv_counter)
conv_counter += 1
with tf.name_scope(name) as scope:
kernel = tf.Variable(
tf.truncated_normal(
[kH, kW, nIn, nOut], dtype=tf.float32, stddev=1e-1),
name='weights')
if wd is not None:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
if FLAGS.data_format == 'NCHW':
strides = [1, 1, dH, dW]
else:
strides = [1, dH, dW, 1]
conv = tf.nn.conv2d(
inpOp,
kernel,
strides,
padding=padType,
data_format=FLAGS.data_format)
biases = tf.Variable(
tf.constant(
0.0, shape=[nOut], dtype=tf.float32),
trainable=True,
name='biases')
bias = tf.reshape(
tf.nn.bias_add(
conv, biases, data_format=FLAGS.data_format),
conv.get_shape())
conv1 = tf.nn.relu(bias, name=scope) if act else bias
parameters += [kernel, biases]
return conv1
def _affine(inpOp, nIn, nOut, wd=None, act=True):
global affine_counter
global parameters
name = 'affine' + str(affine_counter)
affine_counter += 1
with tf.name_scope(name) as scope:
kernel = tf.Variable(
tf.truncated_normal(
[nIn, nOut], dtype=tf.float32, stddev=1e-1),
name='weights')
if wd is not None:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
biases = tf.Variable(
tf.constant(
0.0, shape=[nOut], dtype=tf.float32),
trainable=True,
name='biases')
affine1 = tf.nn.relu_layer(
inpOp, kernel, biases,
name=name) if act else tf.matmul(inpOp, kernel) + biases
parameters += [kernel, biases]
return affine1
def _mpool(inpOp, kH, kW, dH, dW, padding):
global pool_counter
global parameters
name = 'pool' + str(pool_counter)
pool_counter += 1
if FLAGS.data_format == 'NCHW':
ksize = [1, 1, kH, kW]
strides = [1, 1, dH, dW]
else:
ksize = [1, kH, kW, 1]
strides = [1, dH, dW, 1]
return tf.nn.max_pool(
inpOp,
ksize=ksize,
strides=strides,
padding=padding,
data_format=FLAGS.data_format,
name=name)
def _apool(inpOp, kH, kW, dH, dW, padding):
global pool_counter
global parameters
name = 'pool' + str(pool_counter)
pool_counter += 1
if FLAGS.data_format == 'NCHW':
ksize = [1, 1, kH, kW]
strides = [1, 1, dH, dW]
else:
ksize = [1, kH, kW, 1]
strides = [1, dH, dW, 1]
return tf.nn.avg_pool(
inpOp,
ksize=ksize,
strides=strides,
padding=padding,
data_format=FLAGS.data_format,
name=name)
def _norm(name, l_input, lsize=4):
return tf.nn.lrn(l_input,
lsize,
bias=1.0,
alpha=0.001 / 9.0,
beta=0.75,
name=name)
def loss(logits, labels):
batch_size = tf.size(labels)
labels = tf.expand_dims(labels, 1)
indices = tf.expand_dims(tf.range(0, batch_size, 1), 1)
concated = tf.concat(1, [indices, labels])
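    # scatter 1.0 at (row, label) to build dense one-hot targets of shape
    # [batch_size, 10]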
onehot_labels = tf.sparse_to_dense(concated,
tf.pack([batch_size, 10]), 1.0, 0.0)
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
logits, onehot_labels, name='xentropy')
loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')
return loss
def get_incoming_shape(incoming):
""" Returns the incoming data shape """
if isinstance(incoming, tf.Tensor):
return incoming.get_shape().as_list()
    elif isinstance(incoming, (np.ndarray, list, tuple)):
return np.shape(incoming)
else:
raise Exception("Invalid incoming layer.")
def inference(images):
conv1 = _conv(images, 3, 32, 5, 5, 1, 1, 'SAME')
pool1 = _mpool(conv1, 3, 3, 2, 2, 'SAME')
conv2 = _conv(pool1, 32, 32, 5, 5, 1, 1, 'SAME')
pool2 = _apool(conv2, 3, 3, 2, 2, 'SAME')
conv3 = _conv(pool2, 32, 64, 5, 5, 1, 1, 'SAME')
pool3 = _apool(conv3, 3, 3, 2, 2, 'SAME')
resh1 = tf.reshape(pool3, [-1, 64 * 4 * 4])
affn1 = _affine(resh1, 64 * 4 * 4, 64)
affn2 = _affine(affn1, 64, 10, act=False)
print('conv1:', get_incoming_shape(conv1))
print('pool1:', get_incoming_shape(pool1))
print('conv2:', get_incoming_shape(conv2))
print('pool2:', get_incoming_shape(pool2))
print('conv3:', get_incoming_shape(conv3))
print('pool3:', get_incoming_shape(pool3))
return affn2
def time_tensorflow_run(session, target, info_string):
num_steps_burn_in = 10
total_duration = 0.0
total_duration_squared = 0.0
if not isinstance(target, list):
target = [target]
target_op = tf.group(*target)
for i in xrange(FLAGS.num_batches + num_steps_burn_in):
start_time = time.time()
_ = session.run(target_op)
duration = time.time() - start_time
        if i >= num_steps_burn_in:
if not i % 10:
print('%s: step %d, duration = %.3f' %
(datetime.now(), i - num_steps_burn_in, duration))
total_duration += duration
total_duration_squared += duration * duration
mn = total_duration / FLAGS.num_batches
vr = total_duration_squared / FLAGS.num_batches - mn * mn
sd = math.sqrt(vr)
print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' %
(datetime.now(), info_string, FLAGS.num_batches, mn, sd))
def run_benchmark():
global parameters
with tf.Graph().as_default():
# Generate some dummy images.
image_size = 32
if FLAGS.data_format == 'NCHW':
image_shape = [FLAGS.batch_size, 3, image_size, image_size]
else:
image_shape = [FLAGS.batch_size, image_size, image_size, 3]
images = tf.get_variable(
'image',
image_shape,
initializer=tf.truncated_normal_initializer(
stddev=0.1, dtype=tf.float32),
dtype=tf.float32,
trainable=False)
labels = tf.get_variable(
'label', [FLAGS.batch_size],
initializer=tf.constant_initializer(1),
dtype=tf.int32,
trainable=False)
# Build a Graph that computes the logits predictions from the
# inference model.
last_layer = inference(images)
objective = loss(last_layer, labels)
# Compute gradients.
opt = tf.train.MomentumOptimizer(0.001, 0.9)
grads = opt.compute_gradients(objective)
global_step = tf.get_variable(
'global_step', [],
initializer=tf.constant_initializer(
0.0, dtype=tf.float32),
trainable=False,
dtype=tf.float32)
apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
# Track the moving averages of all trainable variables.
variable_averages = tf.train.ExponentialMovingAverage(0.9, global_step)
variables_averages_op = variable_averages.apply(tf.trainable_variables(
))
# Build an initialization operation.
init = tf.initialize_all_variables()
# Start running operations on the Graph.
sess = tf.Session(config=tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=FLAGS.log_device_placement))
sess.run(init)
run_forward = True
run_forward_backward = True
if FLAGS.forward_only and FLAGS.forward_backward_only:
raise ValueError("Cannot specify --forward_only and "
"--forward_backward_only at the same time.")
if FLAGS.forward_only:
run_forward_backward = False
elif FLAGS.forward_backward_only:
run_forward = False
if run_forward:
# Run the forward benchmark.
time_tensorflow_run(sess, last_layer, "Forward")
if run_forward_backward:
with tf.control_dependencies(
[apply_gradient_op, variables_averages_op]):
train_op = tf.no_op(name='train')
time_tensorflow_run(sess, [train_op, objective], "Forward-backward")
def main(_):
run_benchmark()
if __name__ == '__main__':
tf.app.run()
You should also install tflearn:
```bash
pip install -r requirements.txt
```
import os.path
import io
import numpy as np
import tensorflow as tf
# tflearn
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb
FLAGS = tf.app.flags.FLAGS
class DataSet(object):
def __init__(self, data, labels):
assert data.shape[0] == labels.shape[0], (
'data.shape: %s labels.shape: %s' % (data.shape, labels.shape))
self._num_examples = data.shape[0]
self._data = data
self._labels = labels
self._epochs_completed = 0
self._index_in_epoch = 0
@property
def data(self):
return self._data
@property
def labels(self):
return self._labels
@property
def num_examples(self):
return self._num_examples
@property
def epochs_completed(self):
return self._epochs_completed
def next_batch(self, batch_size):
assert batch_size <= self._num_examples
start = self._index_in_epoch
self._index_in_epoch += batch_size
if self._index_in_epoch > self._num_examples:
# Finished epoch
self._epochs_completed += 1
# Shuffle the data
perm = np.arange(self._num_examples)
np.random.shuffle(perm)
self._data = self._data[perm]
self._labels = self._labels[perm]
# Start next epoch
start = 0
self._index_in_epoch = batch_size
end = self._index_in_epoch
return self._data[start:end], self._labels[start:end]
def create_datasets(file_path, vocab_size=30000, val_fraction=0.0):
# IMDB Dataset loading
train, test, _ = imdb.load_data(
path=file_path,
n_words=vocab_size,
valid_portion=val_fraction,
sort_by_len=False)
trainX, trainY = train
testX, testY = test
# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=FLAGS.max_len, value=0.)
testX = pad_sequences(testX, maxlen=FLAGS.max_len, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)
train_dataset = DataSet(trainX, trainY)
return train_dataset
def main():
create_datasets('imdb.pkl')
if __name__ == "__main__":
main()
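As a quick sanity check, reader.py can be exercised on its own. Below is a minimal sketch (assuming imdb.pkl is present next to the script; note that the max_len flag is normally defined by the training script, so it is defined here explicitly):

```python
import tensorflow as tf
import reader

# 'max_len' normally comes from rnn.py; define it when using reader.py standalone.
tf.app.flags.DEFINE_integer('max_len', 100, """Maximum sequence length.""")

train = reader.create_datasets("imdb.pkl", vocab_size=30000)
x, y = train.next_batch(128)
print x.shape, y.shape  # expected: (128, 100) and (128, 2)
```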
#!/usr/bin/env python
from six.moves import xrange # pylint: disable=redefined-builtin
import math
import time
import numpy as np
from datetime import datetime
import reader
import tensorflow as tf
from tensorflow.python.ops import rnn
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('batch_size', 128, """Batch size.""")
tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
tf.app.flags.DEFINE_integer('num_layers', 1, """Number of LSTM layers.""")
tf.app.flags.DEFINE_integer('max_len', 100, """Maximum sequence length.""")
tf.app.flags.DEFINE_boolean('forward_only', False,
"""Only run the forward pass.""")
tf.app.flags.DEFINE_boolean('forward_backward_only', False,
"""Only run the forward-forward pass.""")
tf.app.flags.DEFINE_integer('hidden_size', 128, """Hidden layer size.""")
tf.app.flags.DEFINE_integer('emb_size', 128, """Embedding size.""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
"""Whether to log device placement.""")
VOCAB_SIZE = 30000
NUM_CLASS = 2
def get_feed_dict(x_data, y_data=None):
feed_dict = {}
if y_data is not None:
feed_dict[y_input] = y_data
for i in xrange(x_data.shape[0]):
feed_dict[x_input[i]] = x_data[i, :, :]
return feed_dict
def get_incoming_shape(incoming):
""" Returns the incoming data shape """
if isinstance(incoming, tf.Tensor):
return incoming.get_shape().as_list()
    elif isinstance(incoming, (np.ndarray, list, tuple)):
return np.shape(incoming)
else:
raise Exception("Invalid incoming layer.")
# Note input * W is done in LSTMCell,
# which is different from PaddlePaddle
def single_lstm(name,
incoming,
n_units,
use_peepholes=True,
return_seq=False,
return_state=False):
with tf.name_scope(name) as scope:
cell = tf.nn.rnn_cell.LSTMCell(n_units, use_peepholes=use_peepholes)
output, _cell_state = rnn.rnn(cell, incoming, dtype=tf.float32)
out = output if return_seq else output[-1]
return (out, _cell_state) if return_state else out
def lstm(name,
incoming,
n_units,
use_peepholes=True,
return_seq=False,
return_state=False,
num_layers=1):
with tf.name_scope(name) as scope:
lstm_cell = tf.nn.rnn_cell.LSTMCell(
n_units, use_peepholes=use_peepholes)
cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * num_layers)
initial_state = cell.zero_state(FLAGS.batch_size, dtype=tf.float32)
if not isinstance(incoming, list):
            # if the input is an embedding, the Tensor shape is [None, time_step, emb_size]
incoming = [
tf.squeeze(input_, [1])
for input_ in tf.split(1, FLAGS.max_len, incoming)
]
outputs, state = tf.nn.rnn(cell,
incoming,
initial_state=initial_state,
dtype=tf.float32)
out = outputs if return_seq else outputs[-1]
        return (out, state) if return_state else out
def embedding(name, incoming, vocab_size, emb_size):
with tf.name_scope(name) as scope:
#with tf.device("/cpu:0"):
embedding = tf.get_variable(
name + '_emb', [vocab_size, emb_size], dtype=tf.float32)
out = tf.nn.embedding_lookup(embedding, incoming)
return out
def fc(name, inpOp, nIn, nOut, act=True):
with tf.name_scope(name) as scope:
kernel = tf.get_variable(
name + '_w', [nIn, nOut],
initializer=tf.truncated_normal_initializer(
stddev=0.01, dtype=tf.float32),
dtype=tf.float32)
biases = tf.get_variable(
name + '_b', [nOut],
initializer=tf.constant_initializer(
value=0.0, dtype=tf.float32),
dtype=tf.float32,
trainable=True)
net = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \
tf.matmul(inpOp, kernel) + biases
return net
def inference(seq):
net = embedding('emb', seq, VOCAB_SIZE, FLAGS.emb_size)
print "emb:", get_incoming_shape(net)
net = lstm('lstm', net, FLAGS.hidden_size, num_layers=FLAGS.num_layers)
print "lstm:", get_incoming_shape(net)
net = fc('fc1', net, FLAGS.hidden_size, 2)
return net
def loss(logits, labels):
    # labels are one-hot vectors of shape [batch_size, NUM_CLASS]
labels = tf.cast(labels, tf.float32)
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
logits, labels, name='cross_entropy_per_example')
cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
tf.add_to_collection('losses', cross_entropy_mean)
return tf.add_n(tf.get_collection('losses'), name='total_loss')
def time_tensorflow_run(session, target, x_input, y_input, info_string):
num_steps_burn_in = 50
total_duration = 0.0
total_duration_squared = 0.0
if not isinstance(target, list):
target = [target]
target_op = tf.group(*target)
train_dataset = reader.create_datasets("imdb.pkl", VOCAB_SIZE)
for i in xrange(FLAGS.num_batches + num_steps_burn_in):
start_time = time.time()
data, label = train_dataset.next_batch(FLAGS.batch_size)
_ = session.run(target_op, feed_dict={x_input: data, y_input: label})
duration = time.time() - start_time
        if i >= num_steps_burn_in:
if not i % 10:
print('%s: step %d, duration = %.3f' %
(datetime.now(), i - num_steps_burn_in, duration))
total_duration += duration
total_duration_squared += duration * duration
mn = total_duration / FLAGS.num_batches
vr = total_duration_squared / FLAGS.num_batches - mn * mn
sd = math.sqrt(vr)
print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' %
(datetime.now(), info_string, FLAGS.num_batches, mn, sd))
def run_benchmark():
with tf.Graph().as_default():
global_step = 0
with tf.device('/cpu:0'):
global_step = tf.Variable(0, trainable=False)
with tf.device('/gpu:0'):
#x_input = tf.placeholder(tf.int32, [None, FLAGS.max_len], name="x_input")
#y_input = tf.placeholder(tf.int32, [None, NUM_CLASS], name="y_input")
x_input = tf.placeholder(
tf.int32, [FLAGS.batch_size, FLAGS.max_len], name="x_input")
y_input = tf.placeholder(
tf.int32, [FLAGS.batch_size, NUM_CLASS], name="y_input")
            # Generate some dummy sequences.
last_layer = inference(x_input)
objective = loss(last_layer, y_input)
opt = tf.train.AdamOptimizer(0.001)
grads = opt.compute_gradients(objective)
apply_gradient_op = opt.apply_gradients(
grads, global_step=global_step)
init = tf.initialize_all_variables()
sess = tf.Session(config=tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=FLAGS.log_device_placement))
sess.run(init)
run_forward = True
run_forward_backward = True
if FLAGS.forward_only and FLAGS.forward_backward_only:
raise ValueError("Cannot specify --forward_only and "
"--forward_backward_only at the same time.")
if FLAGS.forward_only:
run_forward_backward = False
elif FLAGS.forward_backward_only:
run_forward = False
if run_forward:
time_tensorflow_run(sess, last_layer, x_input, y_input,
"Forward")
if run_forward_backward:
with tf.control_dependencies([apply_gradient_op]):
train_op = tf.no_op(name='train')
time_tensorflow_run(sess, [train_op, objective], x_input,
y_input, "Forward-backward")
def main(_):
run_benchmark()
if __name__ == '__main__':
tf.app.run()
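With the flags above, a single-GPU run that mirrors the test script further below would be, for example, `python rnn.py --num_layers=2 --batch_size=64 --hidden_size=256 --forward_backward_only=1`.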
#!/usr/bin/env python
from six.moves import xrange # pylint: disable=redefined-builtin
import re
import math
import time
import numpy as np
from datetime import datetime
import reader
import tensorflow as tf
from tensorflow.python.ops import rnn
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('batch_size', 64, """Batch size.""")
tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
tf.app.flags.DEFINE_integer('num_layers', 1, """Number of LSTM layers.""")
tf.app.flags.DEFINE_integer('max_len', 100, """Maximum sequence length.""")
tf.app.flags.DEFINE_integer('hidden_size', 128, """Hidden layer size.""")
tf.app.flags.DEFINE_integer('emb_size', 64, """Embedding size.""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
"""Whether to log device placement.""")
tf.app.flags.DEFINE_integer('num_gpus', 4, """How many GPUs to use.""")
VOCAB_SIZE = 30000
NUM_CLASS = 2
NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
NUM_EPOCHS_PER_DECAY = 50
INITIAL_LEARNING_RATE = 0.1
LEARNING_RATE_DECAY_FACTOR = 0.1
TOWER_NAME = 'tower'
train_dataset = reader.create_datasets("imdb.pkl", VOCAB_SIZE)
def get_incoming_shape(incoming):
""" Returns the incoming data shape """
if isinstance(incoming, tf.Tensor):
return incoming.get_shape().as_list()
    elif isinstance(incoming, (np.ndarray, list, tuple)):
return np.shape(incoming)
else:
raise Exception("Invalid incoming layer.")
# Note input * W is done in LSTMCell,
# which is different from PaddlePaddle
def single_lstm(name,
incoming,
n_units,
use_peepholes=True,
return_seq=False,
return_state=False):
with tf.name_scope(name) as scope:
cell = tf.nn.rnn_cell.LSTMCell(n_units, use_peepholes=use_peepholes)
output, _cell_state = rnn.rnn(cell, incoming, dtype=tf.float32)
out = output if return_seq else output[-1]
return (out, _cell_state) if return_state else out
def lstm(name,
incoming,
n_units,
use_peepholes=True,
return_seq=False,
return_state=False,
num_layers=1):
with tf.name_scope(name) as scope:
lstm_cell = tf.nn.rnn_cell.LSTMCell(
n_units, use_peepholes=use_peepholes)
cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * num_layers)
initial_state = cell.zero_state(FLAGS.batch_size, dtype=tf.float32)
if not isinstance(incoming, list):
            # if the input is an embedding, the Tensor shape is [None, time_step, emb_size]
incoming = [
tf.squeeze(input_, [1])
for input_ in tf.split(1, FLAGS.max_len, incoming)
]
outputs, state = tf.nn.rnn(cell,
incoming,
initial_state=initial_state,
dtype=tf.float32)
out = outputs if return_seq else outputs[-1]
        return (out, state) if return_state else out
def embedding(name, incoming, vocab_size, emb_size):
with tf.name_scope(name) as scope:
#with tf.device("/cpu:0"):
embedding = tf.get_variable(
name + '_emb', [vocab_size, emb_size], dtype=tf.float32)
out = tf.nn.embedding_lookup(embedding, incoming)
return out
def fc(name, inpOp, nIn, nOut, act=True):
with tf.name_scope(name) as scope:
kernel = tf.get_variable(
name + '_w', [nIn, nOut],
initializer=tf.truncated_normal_initializer(
stddev=0.01, dtype=tf.float32),
dtype=tf.float32)
biases = tf.get_variable(
name + '_b', [nOut],
initializer=tf.constant_initializer(
value=0.0, dtype=tf.float32),
dtype=tf.float32,
trainable=True)
net = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \
tf.matmul(inpOp, kernel) + biases
return net
def inference(seq):
net = embedding('emb', seq, VOCAB_SIZE, FLAGS.emb_size)
print "emb:", get_incoming_shape(net)
net = lstm('lstm', net, FLAGS.hidden_size, num_layers=FLAGS.num_layers)
print "lstm:", get_incoming_shape(net)
net = fc('fc1', net, FLAGS.hidden_size, 2)
return net
def loss(logits, labels):
    # labels are one-hot vectors of shape [batch_size, NUM_CLASS]
labels = tf.cast(labels, tf.float32)
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
logits, labels, name='cross_entropy_per_example')
cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
tf.add_to_collection('losses', cross_entropy_mean)
return tf.add_n(tf.get_collection('losses'), name='total_loss')
def tower_loss(scope):
"""Calculate the total loss on a single tower running the model.
Args:
scope: unique prefix string identifying the tower, e.g. 'tower_0'
Returns:
Tensor of shape [] containing the total loss for a batch of data
"""
data, label = train_dataset.next_batch(FLAGS.batch_size)
# Build a Graph that computes the logits predictions from the
# inference model.
last_layer = inference(data)
# Build the portion of the Graph calculating the losses. Note that we will
# assemble the total_loss using a custom function below.
_ = loss(last_layer, label)
# Assemble all of the losses for the current tower only.
losses = tf.get_collection('losses', scope)
# Calculate the total loss for the current tower.
total_loss = tf.add_n(losses, name='total_loss')
# Compute the moving average of all individual losses and the total loss.
loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
loss_averages_op = loss_averages.apply(losses + [total_loss])
# Attach a scalar summary to all individual losses and the total loss; do the
# same for the averaged version of the losses.
for l in losses + [total_loss]:
# Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
# session. This helps the clarity of presentation on tensorboard.
loss_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', l.op.name)
# Name each loss as '(raw)' and name the moving average version of the loss
# as the original loss name.
tf.scalar_summary(loss_name + ' (raw)', l)
#tf.scalar_summary(loss_name, loss_averages.average(l))
with tf.control_dependencies([loss_averages_op]):
total_loss = tf.identity(total_loss)
return total_loss
def average_gradients(tower_grads):
"""Calculate the average gradient for each shared variable across all towers.
Note that this function provides a synchronization point across all towers.
Args:
tower_grads: List of lists of (gradient, variable) tuples. The outer list
is over individual gradients. The inner list is over the gradient
calculation for each tower.
Returns:
List of pairs of (gradient, variable) where the gradient has been averaged
across all towers.
"""
average_grads = []
for grad_and_vars in zip(*tower_grads):
# Note that each grad_and_vars looks like the following:
# ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
grads = []
for g, _ in grad_and_vars:
# Add 0 dimension to the gradients to represent the tower.
expanded_g = tf.expand_dims(g, 0)
# Append on a 'tower' dimension which we will average over below.
grads.append(expanded_g)
# Average over the 'tower' dimension.
grad = tf.concat(0, grads)
grad = tf.reduce_mean(grad, 0)
# Keep in mind that the Variables are redundant because they are shared
# across towers. So .. we will just return the first tower's pointer to
# the Variable.
v = grad_and_vars[0][1]
grad_and_var = (grad, v)
average_grads.append(grad_and_var)
return average_grads
def time_tensorflow_run(session, target):
num_steps_burn_in = 80
total_duration = 0.0
total_duration_squared = 0.0
for i in xrange(FLAGS.num_batches + num_steps_burn_in):
start_time = time.time()
        # each tower pulls its own batch inside tower_loss(), so no feed_dict
        # is needed here; just run the grouped target op
_, loss_value = session.run(target)
duration = time.time() - start_time
        if i >= num_steps_burn_in:
if not i % 10:
num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
examples_per_sec = num_examples_per_step / duration
# sec_per_batch = duration / FLAGS.num_gpus
sec_per_batch = duration
format_str = (
'%s: step %d, loss= %.2f (%.1f examples/sec; %.3f '
'sec/batch batch_size= %d)')
                print(format_str %
                      (datetime.now(), i - num_steps_burn_in, loss_value,
                       examples_per_sec, sec_per_batch, num_examples_per_step))
total_duration += duration
total_duration_squared += duration * duration
mn = total_duration / FLAGS.num_batches
vr = total_duration_squared / FLAGS.num_batches - mn * mn
sd = math.sqrt(vr)
print('%s: FwdBwd across %d steps, %.3f +/- %.3f sec / batch' %
(datetime.now(), FLAGS.num_batches, mn, sd))
def run_benchmark():
with tf.Graph().as_default(), tf.device('/cpu:0'):
# Create a variable to count the number of train() calls. This equals the
# number of batches processed * FLAGS.num_gpus.
global_step = tf.get_variable(
'global_step', [],
initializer=tf.constant_initializer(0),
trainable=False)
# Calculate the learning rate schedule.
num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
FLAGS.batch_size)
decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
# Create an optimizer that performs gradient descent.
opt = tf.train.AdamOptimizer(0.001)
#train_dataset = reader.create_datasets("imdb.pkl", VOCAB_SIZE)
# Calculate the gradients for each model tower.
tower_grads = []
for i in xrange(FLAGS.num_gpus):
with tf.device('/gpu:%d' % i):
with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
# Calculate the loss for one tower of the model. This function
# constructs the entire model but shares the variables across
# all towers.
loss = tower_loss(scope)
# Reuse variables for the next tower.
tf.get_variable_scope().reuse_variables()
# Retain the summaries from the final tower.
# summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
# Calculate the gradients for the batch of data on this tower.
grads = opt.compute_gradients(loss)
# Keep track of the gradients across all towers.
tower_grads.append(grads)
# We must calculate the mean of each gradient. Note that this is the
# synchronization point across all towers.
grads = average_gradients(tower_grads)
# Apply the gradients to adjust the shared variables.
apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
# Group all updates to into a single train op.
train_op = tf.group(apply_gradient_op)
# Build an initialization operation.
init = tf.initialize_all_variables()
# Start running operations on the Graph. allow_soft_placement must be set to
# True to build towers on GPU, as some of the ops do not have GPU
# implementations.
sess = tf.Session(config=tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=FLAGS.log_device_placement))
sess.run(init)
time_tensorflow_run(sess, [train_op, loss])
def main(_):
run_benchmark()
if __name__ == '__main__':
tf.app.run()
set -e
function test() {
lstm_num=$1
batch_size=$2
hid_size=$3
prefix=$4
python rnn.py --num_layers=${lstm_num} --batch_size=$batch_size \
--hidden_size=${hid_size} \
--forward_backward_only=1 \
> logs/1gpu-${lstm_num}lstm-batch${batch_size}-hid${hid_size}.log 2>&1
}
if [ ! -d "logs" ]; then
mkdir logs
fi
#--lstm_num--batch_size--hidden_size--#
test 2 64 256
test 2 64 512
test 2 64 1280
test 2 128 256
test 2 128 512
test 2 128 1280
test 2 256 256
test 2 256 512
test 2 256 1280
set -e
function test() {
num_gpu=$1
lstm_num=$2
hid_size=$3
  batch_size=$4
  batch_per_gpu=`expr ${batch_size} / ${num_gpu}`
python rnn_multi_gpu.py --num_layers=${lstm_num} --batch_size=$batch_per_gpu \
--num_gpus=${num_gpu} \
--hidden_size=${hid_size} \
--forward_backward_only=1 \
> logs/${num_gpu}gpu-${lstm_num}lstm-hid${hid_size}-batch${batch_size}.log 2>&1
}
if [ ! -d "logs" ]; then
mkdir logs
fi
#--num_gpus--lstm_num--hidden_size--batch_size--#
test 4 2 256 128
test 4 2 256 256
test 4 2 256 512
test 4 2 512 128
test 4 2 512 256
test 4 2 512 512
# Get the latest git tag.
set(PADDLE_VERSION $ENV{PADDLE_VERSION})
set(tmp_version "HEAD")
while ("${PADDLE_VERSION}" STREQUAL "")
execute_process(
COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 ${tmp_version}
WORKING_DIRECTORY ${PROJ_ROOT}
OUTPUT_VARIABLE GIT_TAG_NAME
RESULT_VARIABLE GIT_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if (NOT ${GIT_RESULT})
# Check the tag is a correct version
if (${GIT_TAG_NAME} MATCHES "v[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?")
string(REPLACE "v" "" PADDLE_VERSION ${GIT_TAG_NAME})
else() # otherwise, get the previous git tag name.
set(tmp_version "${GIT_TAG_NAME}~1")
endif()
else()
set(PADDLE_VERSION "0.0.0")
message(WARNING "Cannot add paddle version from git tag")
endif()
endwhile()
message(STATUS "Paddle version is ${PADDLE_VERSION}")
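For example, if the most recent tag reachable from HEAD is `v0.9.0`, then `git describe --tags --abbrev=0 HEAD` prints `v0.9.0`, the pattern check passes, and PADDLE_VERSION becomes `0.9.0`. If a tag does not match the `v<major>.<minor>.<patch>` pattern, the loop retries from `<tag>~1`, walking back one tag at a time; if git fails (e.g. there are no tags at all), the version falls back to 0.0.0 with a warning.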
output/
uniform_params/
cifar_params/
mnist_params/
*.png
.pydevproject
.project
*.log
*.pyc
data/mnist_data/
data/cifar-10-batches-py/
# Generative Adversarial Networks (GAN)
This demo implements GAN training described in the original GAN paper (https://arxiv.org/abs/1406.2661) and DCGAN (https://arxiv.org/abs/1511.06434).
The general training procedures are implemented in gan_trainer.py. The neural network configurations are specified in gan_conf.py (for synthetic data) and gan_conf_image.py (for image data).
In order to run the model, first download the corresponding data by running the shell script in ./data.
Then you can run the command below. The flag -d specifies the training data (cifar, mnist or uniform) and the flag --use_gpu specifies whether to use a GPU for training (0 for CPU, 1 for GPU).
$python gan_trainer.py -d cifar --use_gpu 1
The generated images will be stored in ./cifar_samples/
The corresponding models will be stored in ./cifar_params/
\ No newline at end of file
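The mnist and uniform cases work the same way: download mnist with the script under ./data (the uniform data is synthesized, so no download is needed), then run, for example,
$python gan_trainer.py -d mnist --use_gpu 0
to train on CPU. Following the trainer's naming convention, samples and models are written to ./mnist_samples/ and ./mnist_params/ (or ./uniform_samples/ and ./uniform_params/).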
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
tar zxf cifar-10-python.tar.gz
rm cifar-10-python.tar.gz
#!/usr/bin/env sh
# This script downloads the mnist data and unzips it.
set -e
DIR="$( cd "$(dirname "$0")" ; pwd -P )"
rm -rf "$DIR/mnist_data"
mkdir "$DIR/mnist_data"
cd "$DIR/mnist_data"
echo "Downloading..."
for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte
do
if [ ! -e $fname ]; then
wget --no-check-certificate http://yann.lecun.com/exdb/mnist/${fname}.gz
gunzip ${fname}.gz
fi
done
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
mode = get_config_arg("mode", str, "generator")
assert mode in set(["generator",
"discriminator",
"generator_training",
"discriminator_training"])
is_generator_training = mode == "generator_training"
is_discriminator_training = mode == "discriminator_training"
is_generator = mode == "generator"
is_discriminator = mode == "discriminator"
# The network structure below follows the ref https://arxiv.org/abs/1406.2661
# Here we used two hidden layers and batch_norm
print('mode=%s' % mode)
# the dim of the noise (z) as the input of the generator network
noise_dim = 10
# the dim of the hidden layer
hidden_dim = 10
# the dim of the generated sample
sample_dim = 2
settings(
batch_size=128,
learning_rate=1e-4,
learning_method=AdamOptimizer(beta1=0.5)
)
def discriminator(sample):
"""
discriminator ouputs the probablity of a sample is from generator
or real data.
The output has two dimenstional: dimension 0 is the probablity
of the sample is from generator and dimension 1 is the probabblity
of the sample is from real data.
"""
param_attr = ParamAttr(is_static=is_generator_training)
bias_attr = ParamAttr(is_static=is_generator_training,
initial_mean=1.0,
initial_std=0)
hidden = fc_layer(input=sample, name="dis_hidden", size=hidden_dim,
bias_attr=bias_attr,
param_attr=param_attr,
act=ReluActivation())
hidden2 = fc_layer(input=hidden, name="dis_hidden2", size=hidden_dim,
bias_attr=bias_attr,
param_attr=param_attr,
act=LinearActivation())
hidden_bn = batch_norm_layer(hidden2,
act=ReluActivation(),
name="dis_hidden_bn",
bias_attr=bias_attr,
param_attr=ParamAttr(is_static=is_generator_training,
initial_mean=1.0,
initial_std=0.02),
use_global_stats=False)
return fc_layer(input=hidden_bn, name="dis_prob", size=2,
bias_attr=bias_attr,
param_attr=param_attr,
act=SoftmaxActivation())
def generator(noise):
"""
generator generates a sample given noise
"""
param_attr = ParamAttr(is_static=is_discriminator_training)
bias_attr = ParamAttr(is_static=is_discriminator_training,
initial_mean=1.0,
initial_std=0)
hidden = fc_layer(input=noise,
name="gen_layer_hidden",
size=hidden_dim,
bias_attr=bias_attr,
param_attr=param_attr,
act=ReluActivation())
hidden2 = fc_layer(input=hidden, name="gen_hidden2", size=hidden_dim,
bias_attr=bias_attr,
param_attr=param_attr,
act=LinearActivation())
hidden_bn = batch_norm_layer(hidden2,
act=ReluActivation(),
name="gen_layer_hidden_bn",
bias_attr=bias_attr,
param_attr=ParamAttr(is_static=is_discriminator_training,
initial_mean=1.0,
initial_std=0.02),
use_global_stats=False)
return fc_layer(input=hidden_bn,
name="gen_layer1",
size=sample_dim,
bias_attr=bias_attr,
param_attr=param_attr,
act=LinearActivation())
if is_generator_training:
noise = data_layer(name="noise", size=noise_dim)
sample = generator(noise)
if is_discriminator_training:
sample = data_layer(name="sample", size=sample_dim)
if is_generator_training or is_discriminator_training:
label = data_layer(name="label", size=1)
prob = discriminator(sample)
cost = cross_entropy(input=prob, label=label)
classification_error_evaluator(input=prob, label=label, name=mode+'_error')
outputs(cost)
if is_generator:
noise = data_layer(name="noise", size=noise_dim)
outputs(generator(noise))
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
mode = get_config_arg("mode", str, "generator")
dataSource = get_config_arg("data", str, "mnist")
assert mode in set(["generator",
"discriminator",
"generator_training",
"discriminator_training"])
is_generator_training = mode == "generator_training"
is_discriminator_training = mode == "discriminator_training"
is_generator = mode == "generator"
is_discriminator = mode == "discriminator"
# The network structure below follows the dcgan paper
# (https://arxiv.org/abs/1511.06434)
print('mode=%s' % mode)
# the dim of the noise (z) as the input of the generator network
noise_dim = 100
# the number of filters in the layer in generator/discriminator that is
# closest to the image
gf_dim = 64
df_dim = 64
if dataSource == "mnist":
sample_dim = 28 # image dim
c_dim = 1 # image color
else:
sample_dim = 32
c_dim = 3
s2, s4 = int(sample_dim/2), int(sample_dim/4),
s8, s16 = int(sample_dim/8), int(sample_dim/16)
settings(
batch_size=128,
learning_rate=2e-4,
learning_method=AdamOptimizer(beta1=0.5)
)
def conv_bn(input, channels, imgSize, num_filters, output_x, stride, name,
param_attr, bias_attr, param_attr_bn, bn, trans=False,
act=ReluActivation()):
"""
conv_bn is a utility function that constructs a convolution/deconv layer
with an optional batch_norm layer
:param bn: whether to use batch_norm_layer
:type bn: bool
:param trans: whether to use conv (False) or deconv (True)
:type trans: bool
"""
# calculate the filter_size and padding size based on the given
    # imgSize and output size
tmp = imgSize - (output_x - 1) * stride
if tmp <= 1 or tmp > 5:
raise ValueError("conv input-output dimension does not fit")
elif tmp <= 3:
filter_size = tmp + 2
padding = 1
else:
filter_size = tmp
padding = 0
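    # Worked example (mnist, sample_dim=28, so s4=7 and s8=3): the gen_layer_h2
    # deconv is called with imgSize=7, output_x=3 and stride=2, hence
    # tmp = 7 - (3 - 1) * 2 = 3, giving filter_size = 5 and padding = 1;
    # the deconv then maps 3 -> (3 - 1) * 2 - 2 * 1 + 5 = 7 as required.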
print (imgSize, output_x, stride, filter_size, padding)
    if trans:
        nameApx = "_convt"
    else:
        nameApx = "_conv"
if bn:
conv = img_conv_layer(input, filter_size=filter_size,
num_filters=num_filters,
name=name + nameApx, num_channels=channels,
act=LinearActivation(), groups=1, stride=stride,
padding=padding, bias_attr=bias_attr,
param_attr=param_attr, shared_biases=True, layer_attr=None,
filter_size_y=None, stride_y=None, padding_y=None,
trans=trans)
conv_bn = batch_norm_layer(conv,
act=act,
name=name + nameApx + "_bn",
bias_attr=bias_attr,
param_attr=param_attr_bn,
use_global_stats=False)
return conv_bn
else:
conv = img_conv_layer(input, filter_size=filter_size,
num_filters=num_filters,
name=name + nameApx, num_channels=channels,
act=act, groups=1, stride=stride,
padding=padding, bias_attr=bias_attr,
param_attr=param_attr, shared_biases=True, layer_attr=None,
filter_size_y=None, stride_y=None, padding_y=None,
trans=trans)
return conv
def generator(noise):
"""
generator generates a sample given noise
"""
param_attr = ParamAttr(is_static=is_discriminator_training,
initial_mean=0.0,
initial_std=0.02)
bias_attr = ParamAttr(is_static=is_discriminator_training,
initial_mean=0.0,
initial_std=0.0)
param_attr_bn=ParamAttr(is_static=is_discriminator_training,
initial_mean=1.0,
initial_std=0.02)
h1 = fc_layer(input=noise,
name="gen_layer_h1",
size=s8 * s8 * gf_dim * 4,
bias_attr=bias_attr,
param_attr=param_attr,
act=LinearActivation())
h1_bn = batch_norm_layer(h1,
act=ReluActivation(),
name="gen_layer_h1_bn",
bias_attr=bias_attr,
param_attr=param_attr_bn,
use_global_stats=False)
h2_bn = conv_bn(h1_bn,
channels=gf_dim*4,
output_x=s8,
num_filters=gf_dim*2,
imgSize=s4,
stride=2,
name="gen_layer_h2",
param_attr=param_attr,
bias_attr=bias_attr,
param_attr_bn=param_attr_bn,
bn=True,
trans=True)
h3_bn = conv_bn(h2_bn,
channels=gf_dim*2,
output_x=s4,
num_filters=gf_dim,
imgSize=s2,
stride=2,
name="gen_layer_h3",
param_attr=param_attr,
bias_attr=bias_attr,
param_attr_bn=param_attr_bn,
bn=True,
trans=True)
return conv_bn(h3_bn,
channels=gf_dim,
output_x=s2,
num_filters=c_dim,
imgSize=sample_dim,
stride=2,
name="gen_layer_h4",
param_attr=param_attr,
bias_attr=bias_attr,
param_attr_bn=param_attr_bn,
bn=False,
trans=True,
act=TanhActivation())
def discriminator(sample):
"""
discriminator ouputs the probablity of a sample is from generator
or real data.
The output has two dimenstional: dimension 0 is the probablity
of the sample is from generator and dimension 1 is the probabblity
of the sample is from real data.
"""
param_attr = ParamAttr(is_static=is_generator_training,
initial_mean=0.0,
initial_std=0.02)
bias_attr = ParamAttr(is_static=is_generator_training,
initial_mean=0.0,
initial_std=0.0)
param_attr_bn=ParamAttr(is_static=is_generator_training,
initial_mean=1.0,
initial_std=0.02)
h0 = conv_bn(sample,
channels=c_dim,
imgSize=sample_dim,
num_filters=df_dim,
output_x=s2,
stride=2,
name="dis_h0",
param_attr=param_attr,
bias_attr=bias_attr,
param_attr_bn=param_attr_bn,
bn=False)
h1_bn = conv_bn(h0,
channels=df_dim,
imgSize=s2,
num_filters=df_dim*2,
output_x=s4,
stride=2,
name="dis_h1",
param_attr=param_attr,
bias_attr=bias_attr,
param_attr_bn=param_attr_bn,
bn=True)
h2_bn = conv_bn(h1_bn,
channels=df_dim*2,
imgSize=s4,
num_filters=df_dim*4,
output_x=s8,
stride=2,
name="dis_h2",
param_attr=param_attr,
bias_attr=bias_attr,
param_attr_bn=param_attr_bn,
bn=True)
return fc_layer(input=h2_bn, name="dis_prob", size=2,
bias_attr=bias_attr,
param_attr=param_attr,
act=SoftmaxActivation())
if is_generator_training:
noise = data_layer(name="noise", size=noise_dim)
sample = generator(noise)
if is_discriminator_training:
sample = data_layer(name="sample", size=sample_dim * sample_dim*c_dim)
if is_generator_training or is_discriminator_training:
label = data_layer(name="label", size=1)
prob = discriminator(sample)
cost = cross_entropy(input=prob, label=label)
classification_error_evaluator(input=prob, label=label, name=mode+'_error')
outputs(cost)
if is_generator:
noise = data_layer(name="noise", size=noise_dim)
outputs(generator(noise))
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import random
import numpy
import cPickle
import sys,os
from PIL import Image
from paddle.trainer.config_parser import parse_config
from paddle.trainer.config_parser import logger
import py_paddle.swig_paddle as api
import matplotlib.pyplot as plt
def plot2DScatter(data, outputfile):
    '''
    Plot the data as a 2D scatter plot and save to outputfile.
    data needs to be two-dimensional.
    '''
x = data[:, 0]
y = data[:, 1]
logger.info("The mean vector is %s" % numpy.mean(data, 0))
logger.info("The std vector is %s" % numpy.std(data, 0))
heatmap, xedges, yedges = numpy.histogram2d(x, y, bins=50)
extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
plt.clf()
plt.scatter(x, y)
plt.savefig(outputfile, bbox_inches='tight')
def CHECK_EQ(a, b):
assert a == b, "a=%s, b=%s" % (a, b)
def copy_shared_parameters(src, dst):
'''
copy the parameters from src to dst
:param src: the source of the parameters
:type src: GradientMachine
:param dst: the destination of the parameters
:type dst: GradientMachine
'''
src_params = [src.getParameter(i)
for i in xrange(src.getParameterSize())]
src_params = dict([(p.getName(), p) for p in src_params])
for i in xrange(dst.getParameterSize()):
dst_param = dst.getParameter(i)
src_param = src_params.get(dst_param.getName(), None)
if src_param is None:
continue
src_value = src_param.getBuf(api.PARAMETER_VALUE)
dst_value = dst_param.getBuf(api.PARAMETER_VALUE)
CHECK_EQ(len(src_value), len(dst_value))
dst_value.copyFrom(src_value)
dst_param.setValueUpdated()
def print_parameters(src):
src_params = [src.getParameter(i)
for i in xrange(src.getParameterSize())]
print "***************"
for p in src_params:
print "Name is %s" % p.getName()
print "value is %s \n" % p.getBuf(api.PARAMETER_VALUE).copyToNumpyArray()
def load_mnist_data(imageFile):
f = open(imageFile, "rb")
f.read(16)
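    # the 16 bytes skipped above are the IDX image-file header: a 4-byte magic
    # number followed by three 4-byte big-endian integers (count, rows, cols)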
# Define number of samples for train/test
if "train" in imageFile:
n = 60000
else:
n = 10000
data = numpy.fromfile(f, 'ubyte', count=n*28*28).reshape((n, 28*28))
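    # scale pixels from [0, 255] to [-1, 1]; the image generator ends in a tanh
    # activation, so real samples are normalized to the same range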
data = data / 255.0 * 2.0 - 1.0
f.close()
return data.astype('float32')
def load_cifar_data(cifar_path):
batch_size = 10000
data = numpy.zeros((5*batch_size, 32*32*3), dtype = "float32")
for i in range(1, 6):
file = cifar_path + "/data_batch_" + str(i)
fo = open(file, 'rb')
dict = cPickle.load(fo)
fo.close()
data[(i - 1)*batch_size:(i*batch_size), :] = dict["data"]
data = data / 255.0 * 2.0 - 1.0
return data
# synthesize 2-D uniform data
def load_uniform_data():
data = numpy.random.rand(1000000, 2).astype('float32')
return data
def merge(images, size):
if images.shape[1] == 28*28:
h, w, c = 28, 28, 1
else:
h, w, c = 32, 32, 3
img = numpy.zeros((h * size[0], w * size[1], c))
for idx in xrange(size[0] * size[1]):
i = idx % size[1]
j = idx // size[1]
img[j*h:j*h+h, i*w:i*w+w, :] = \
((images[idx, :].reshape((h, w, c), order="F").transpose(1, 0, 2) + 1.0) / 2.0 * 255.0)
return img.astype('uint8')
def save_images(images, path):
merged_img = merge(images, [8, 8])
if merged_img.shape[2] == 1:
im = Image.fromarray(numpy.squeeze(merged_img)).convert('RGB')
else:
im = Image.fromarray(merged_img, mode="RGB")
im.save(path)
def get_real_samples(batch_size, data_np):
return data_np[numpy.random.choice(data_np.shape[0], batch_size,
replace=False),:]
def get_noise(batch_size, noise_dim):
return numpy.random.normal(size=(batch_size, noise_dim)).astype('float32')
def get_fake_samples(generator_machine, batch_size, noise):
gen_inputs = api.Arguments.createArguments(1)
gen_inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(noise))
gen_outputs = api.Arguments.createArguments(0)
generator_machine.forward(gen_inputs, gen_outputs, api.PASS_TEST)
fake_samples = gen_outputs.getSlotValue(0).copyToNumpyMat()
return fake_samples
def get_training_loss(training_machine, inputs):
outputs = api.Arguments.createArguments(0)
training_machine.forward(inputs, outputs, api.PASS_TEST)
loss = outputs.getSlotValue(0).copyToNumpyMat()
return numpy.mean(loss)
def prepare_discriminator_data_batch_pos(batch_size, data_np):
real_samples = get_real_samples(batch_size, data_np)
labels = numpy.ones(batch_size, dtype='int32')
inputs = api.Arguments.createArguments(2)
inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(real_samples))
inputs.setSlotIds(1, api.IVector.createVectorFromNumpy(labels))
return inputs
def prepare_discriminator_data_batch_neg(generator_machine, batch_size, noise):
fake_samples = get_fake_samples(generator_machine, batch_size, noise)
labels = numpy.zeros(batch_size, dtype='int32')
inputs = api.Arguments.createArguments(2)
inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(fake_samples))
inputs.setSlotIds(1, api.IVector.createVectorFromNumpy(labels))
return inputs
def prepare_generator_data_batch(batch_size, noise):
label = numpy.ones(batch_size, dtype='int32')
inputs = api.Arguments.createArguments(2)
inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(noise))
inputs.setSlotIds(1, api.IVector.createVectorFromNumpy(label))
return inputs
def find(iterable, cond):
for item in iterable:
if cond(item):
return item
return None
def get_layer_size(model_conf, layer_name):
layer_conf = find(model_conf.layers, lambda x: x.name == layer_name)
assert layer_conf is not None, "Cannot find '%s' layer" % layer_name
return layer_conf.size
def main():
parser = argparse.ArgumentParser()
parser.add_argument("-d", "--data_source", help="mnist or cifar or uniform")
parser.add_argument("--use_gpu", default="1",
help="1 means use gpu for training")
parser.add_argument("--gpu_id", default="0",
help="the gpu_id parameter")
args = parser.parse_args()
data_source = args.data_source
use_gpu = args.use_gpu
assert data_source in ["mnist", "cifar", "uniform"]
assert use_gpu in ["0", "1"]
if not os.path.exists("./%s_samples/" % data_source):
os.makedirs("./%s_samples/" % data_source)
if not os.path.exists("./%s_params/" % data_source):
os.makedirs("./%s_params/" % data_source)
api.initPaddle('--use_gpu=' + use_gpu, '--dot_period=10', '--log_period=100',
'--gpu_id=' + args.gpu_id, '--save_dir=' + "./%s_params/" % data_source)
if data_source == "uniform":
conf = "gan_conf.py"
num_iter = 10000
else:
conf = "gan_conf_image.py"
num_iter = 1000
gen_conf = parse_config(conf, "mode=generator_training,data=" + data_source)
dis_conf = parse_config(conf, "mode=discriminator_training,data=" + data_source)
generator_conf = parse_config(conf, "mode=generator,data=" + data_source)
batch_size = dis_conf.opt_config.batch_size
noise_dim = get_layer_size(gen_conf.model_config, "noise")
if data_source == "mnist":
data_np = load_mnist_data("./data/mnist_data/train-images-idx3-ubyte")
elif data_source == "cifar":
data_np = load_cifar_data("./data/cifar-10-batches-py/")
else:
data_np = load_uniform_data()
# this creates a gradient machine for discriminator
dis_training_machine = api.GradientMachine.createFromConfigProto(
dis_conf.model_config)
# this create a gradient machine for generator
gen_training_machine = api.GradientMachine.createFromConfigProto(
gen_conf.model_config)
# generator_machine is used to generate data only, which is used for
# training discriminator
logger.info(str(generator_conf.model_config))
generator_machine = api.GradientMachine.createFromConfigProto(
generator_conf.model_config)
dis_trainer = api.Trainer.create(
dis_conf, dis_training_machine)
gen_trainer = api.Trainer.create(
gen_conf, gen_training_machine)
dis_trainer.startTrain()
gen_trainer.startTrain()
# Sync parameters between networks (GradientMachine) at the beginning
copy_shared_parameters(gen_training_machine, dis_training_machine)
copy_shared_parameters(gen_training_machine, generator_machine)
    # Ensure that neither the discriminator nor the generator is trained
    # for more than MAX_strike consecutive batches.
curr_train = "dis"
curr_strike = 0
MAX_strike = 5
for train_pass in xrange(100):
dis_trainer.startTrainPass()
gen_trainer.startTrainPass()
for i in xrange(num_iter):
# Do forward pass in discriminator to get the dis_loss
noise = get_noise(batch_size, noise_dim)
data_batch_dis_pos = prepare_discriminator_data_batch_pos(
batch_size, data_np)
dis_loss_pos = get_training_loss(dis_training_machine, data_batch_dis_pos)
data_batch_dis_neg = prepare_discriminator_data_batch_neg(
generator_machine, batch_size, noise)
dis_loss_neg = get_training_loss(dis_training_machine, data_batch_dis_neg)
dis_loss = (dis_loss_pos + dis_loss_neg) / 2.0
# Do forward pass in generator to get the gen_loss
data_batch_gen = prepare_generator_data_batch(
batch_size, noise)
gen_loss = get_training_loss(gen_training_machine, data_batch_gen)
if i % 100 == 0:
print "d_pos_loss is %s d_neg_loss is %s" % (dis_loss_pos, dis_loss_neg)
print "d_loss is %s g_loss is %s" % (dis_loss, gen_loss)
# Decide which network to train based on the training history
# And the relative size of the loss
if (not (curr_train == "dis" and curr_strike == MAX_strike)) and \
((curr_train == "gen" and curr_strike == MAX_strike) or dis_loss > gen_loss):
if curr_train == "dis":
curr_strike += 1
else:
curr_train = "dis"
curr_strike = 1
dis_trainer.trainOneDataBatch(batch_size, data_batch_dis_neg)
dis_trainer.trainOneDataBatch(batch_size, data_batch_dis_pos)
copy_shared_parameters(dis_training_machine, gen_training_machine)
else:
if curr_train == "gen":
curr_strike += 1
else:
curr_train = "gen"
curr_strike = 1
gen_trainer.trainOneDataBatch(batch_size, data_batch_gen)
# TODO: add API for paddle to allow true parameter sharing between different GradientMachines
# so that we do not need to copy shared parameters.
copy_shared_parameters(gen_training_machine, dis_training_machine)
copy_shared_parameters(gen_training_machine, generator_machine)
dis_trainer.finishTrainPass()
gen_trainer.finishTrainPass()
# At the end of each pass, save the generated samples/images
fake_samples = get_fake_samples(generator_machine, batch_size, noise)
if data_source == "uniform":
plot2DScatter(fake_samples, "./%s_samples/train_pass%s.png" % (data_source, train_pass))
else:
save_images(fake_samples, "./%s_samples/train_pass%s.png" % (data_source, train_pass))
dis_trainer.finishTrain()
gen_trainer.finishTrain()
if __name__ == '__main__':
main()
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
......@@ -19,27 +19,44 @@ START = "<s>"
END = "<e>"
def hook(settings, src_dict, trg_dict, file_list, **kwargs):
def hook(settings, src_dict_path, trg_dict_path, is_generating, file_list,
**kwargs):
# job_mode = 1: training mode
# job_mode = 0: generating mode
settings.job_mode = trg_dict is not None
settings.src_dict = src_dict
settings.job_mode = not is_generating
settings.src_dict = dict()
with open(src_dict_path, "r") as fin:
settings.src_dict = {
line.strip(): line_count
for line_count, line in enumerate(fin)
}
settings.trg_dict = dict()
with open(trg_dict_path, "r") as fin:
settings.trg_dict = {
line.strip(): line_count
for line_count, line in enumerate(fin)
}
settings.logger.info("src dict len : %d" % (len(settings.src_dict)))
settings.sample_count = 0
if settings.job_mode:
settings.trg_dict = trg_dict
settings.slots = [
settings.slots = {
'source_language_word':
integer_value_sequence(len(settings.src_dict)),
'target_language_word':
integer_value_sequence(len(settings.trg_dict)),
'target_language_next_word':
integer_value_sequence(len(settings.trg_dict))
]
}
settings.logger.info("trg dict len : %d" % (len(settings.trg_dict)))
else:
settings.slots = [
settings.slots = {
'source_language_word':
integer_value_sequence(len(settings.src_dict)),
'sent_id':
integer_value_sequence(len(open(file_list[0], "r").readlines()))
]
}
def _get_ids(s, dictionary):
......@@ -69,6 +86,10 @@ def process(settings, file_name):
continue
trg_ids_next = trg_ids + [settings.trg_dict[END]]
trg_ids = [settings.trg_dict[START]] + trg_ids
yield src_ids, trg_ids, trg_ids_next
yield {
'source_language_word': src_ids,
'target_language_word': trg_ids,
'target_language_next_word': trg_ids_next
}
else:
yield src_ids, [line_count]
yield {'source_language_word': src_ids, 'sent_id': [line_count]}
......@@ -37,17 +37,10 @@ def seq_to_seq_data(data_dir,
"""
src_lang_dict = os.path.join(data_dir, 'src.dict')
trg_lang_dict = os.path.join(data_dir, 'trg.dict')
src_dict = dict()
for line_count, line in enumerate(open(src_lang_dict, "r")):
src_dict[line.strip()] = line_count
trg_dict = dict()
for line_count, line in enumerate(open(trg_lang_dict, "r")):
trg_dict[line.strip()] = line_count
if is_generating:
train_list = None
test_list = os.path.join(data_dir, gen_list)
trg_dict = None
else:
train_list = os.path.join(data_dir, train_list)
test_list = os.path.join(data_dir, test_list)
......@@ -57,8 +50,11 @@ def seq_to_seq_data(data_dir,
test_list,
module="dataprovider",
obj="process",
args={"src_dict": src_dict,
"trg_dict": trg_dict})
args={
"src_dict_path": src_lang_dict,
"trg_dict_path": trg_lang_dict,
"is_generating": is_generating
})
return {
"src_dict_path": src_lang_dict,
......
......@@ -15,25 +15,11 @@ set(SPHINX_CACHE_DIR "${CMAKE_CURRENT_BINARY_DIR}/_doctrees")
# HTML output directory
set(SPHINX_HTML_DIR "${CMAKE_CURRENT_BINARY_DIR}/html")
set(PADDLE_DOXYGEN_OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/doxygen_xml")
configure_file(
"${CMAKE_CURRENT_SOURCE_DIR}/conf.py.in"
"${BINARY_BUILD_DIR}/conf.py"
@ONLY)
configure_file(
"${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in"
"${CMAKE_CURRENT_BINARY_DIR}/Doxyfile"
@ONLY
)
add_custom_target(paddle_doxygen_docs ALL
${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
)
sphinx_add_target(paddle_docs
html
${BINARY_BUILD_DIR}
......@@ -41,6 +27,5 @@ sphinx_add_target(paddle_docs
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR})
add_dependencies(paddle_docs
gen_proto_py
paddle_doxygen_docs)
add_dependencies(paddle_docs
gen_proto_py)
# Doxyfile 1.8.10
# This file describes the settings to be used by the documentation system
# doxygen (www.doxygen.org) for a project.
#
# All text after a double hash (##) is considered a comment and is placed in
# front of the TAG it is preceding.
#
# All text after a single hash (#) is considered a comment and will be ignored.
# The format is:
# TAG = value [value, ...]
# For lists, items can also be appended using:
# TAG += value [value, ...]
# Values that contain spaces should be placed between quotes (\" \").
#---------------------------------------------------------------------------
# Project related configuration options
#---------------------------------------------------------------------------
# This tag specifies the encoding used for all characters in the config file
# that follow. The default is UTF-8 which is also the encoding used for all text
# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
# for the list of possible encodings.
# The default value is: UTF-8.
DOXYFILE_ENCODING = UTF-8
# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
# double-quotes, unless you are using Doxywizard) that should identify the
# project for which the documentation is generated. This name is used in the
# title of most generated pages and in a few other places.
# The default value is: My Project.
PROJECT_NAME = "paddle"
# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
# could be handy for archiving the generated documentation or if some version
# control system is used.
PROJECT_NUMBER = 1.0.0
# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a
# quick idea about the purpose of the project. Keep the description short.
PROJECT_BRIEF =
# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
# in the documentation. The maximum height of the logo should not exceed 55
# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
# the logo to the output directory.
PROJECT_LOGO =
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
# into which the generated documentation will be written. If a relative path is
# entered, it will be relative to the location where doxygen was started. If
# left blank the current directory will be used.
OUTPUT_DIRECTORY = @PADDLE_DOXYGEN_OUTPUT@
# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
# directories (in 2 levels) under the output directory of each output format and
# will distribute the generated files over these directories. Enabling this
# option can be useful when feeding doxygen a huge amount of source files, where
# putting all generated files in the same directory would otherwise causes
# performance problems for the file system.
# The default value is: NO.
CREATE_SUBDIRS = NO
# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII
# characters to appear in the names of generated files. If set to NO, non-ASCII
# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode
# U+3044.
# The default value is: NO.
ALLOW_UNICODE_NAMES = NO
# The OUTPUT_LANGUAGE tag is used to specify the language in which all
# documentation generated by doxygen is written. Doxygen will use this
# information to generate all constant output in the proper language.
# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
# Ukrainian and Vietnamese.
# The default value is: English.
OUTPUT_LANGUAGE = English
# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
# descriptions after the members that are listed in the file and class
# documentation (similar to Javadoc). Set to NO to disable this.
# The default value is: YES.
BRIEF_MEMBER_DESC = YES
# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief
# description of a member or function before the detailed description
#
# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
# brief descriptions will be completely suppressed.
# The default value is: YES.
REPEAT_BRIEF = YES
# This tag implements a quasi-intelligent brief description abbreviator that is
# used to form the text in various listings. Each string in this list, if found
# as the leading text of the brief description, will be stripped from the text
# and the result, after processing the whole list, is used as the annotated
# text. Otherwise, the brief description is used as-is. If left blank, the
# following values are used ($name is automatically replaced with the name of
# the entity):The $name class, The $name widget, The $name file, is, provides,
# specifies, contains, represents, a, an and the.
ABBREVIATE_BRIEF =
# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
# doxygen will generate a detailed section even if there is only a brief
# description.
# The default value is: NO.
ALWAYS_DETAILED_SEC = NO
# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
# inherited members of a class in the documentation of that class as if those
# members were ordinary class members. Constructors, destructors and assignment
# operators of the base classes will not be shown.
# The default value is: NO.
INLINE_INHERITED_MEMB = NO
# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path
# before files name in the file list and in the header files. If set to NO the
# shortest path that makes the file name unique will be used
# The default value is: YES.
FULL_PATH_NAMES = YES
# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
# Stripping is only done if one of the specified strings matches the left-hand
# part of the path. The tag can be used to show relative paths in the file list.
# If left blank the directory from which doxygen is run is used as the path to
# strip.
#
# Note that you can specify absolute paths here, but also relative paths, which
# will be relative from the directory where doxygen is started.
# This tag requires that the tag FULL_PATH_NAMES is set to YES.
STRIP_FROM_PATH =
# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
# path mentioned in the documentation of a class, which tells the reader which
# header file to include in order to use a class. If left blank only the name of
# the header file containing the class definition is used. Otherwise one should
# specify the list of include paths that are normally passed to the compiler
# using the -I flag.
STRIP_FROM_INC_PATH =
# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
# less readable) file names. This can be useful if your file system doesn't
# support long names like on DOS, Mac, or CD-ROM.
# The default value is: NO.
SHORT_NAMES = NO
# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
# first line (until the first dot) of a Javadoc-style comment as the brief
# description. If set to NO, the Javadoc-style will behave just like regular Qt-
# style comments (thus requiring an explicit @brief command for a brief
# description.)
# The default value is: NO.
JAVADOC_AUTOBRIEF = NO
# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
# line (until the first dot) of a Qt-style comment as the brief description. If
# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
# requiring an explicit \brief command for a brief description.)
# The default value is: NO.
QT_AUTOBRIEF = NO
# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
# a brief description. This used to be the default behavior. The new default is
# to treat a multi-line C++ comment block as a detailed description. Set this
# tag to YES if you prefer the old behavior instead.
#
# Note that setting this tag to YES also means that rational rose comments are
# not recognized any more.
# The default value is: NO.
MULTILINE_CPP_IS_BRIEF = NO
# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
# documentation from any documented member that it re-implements.
# The default value is: YES.
INHERIT_DOCS = YES
# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new
# page for each member. If set to NO, the documentation of a member will be part
# of the file/class/namespace that contains it.
# The default value is: NO.
SEPARATE_MEMBER_PAGES = NO
# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
# uses this value to replace tabs by spaces in code fragments.
# Minimum value: 1, maximum value: 16, default value: 4.
TAB_SIZE = 2
# This tag can be used to specify a number of aliases that act as commands in
# the documentation. An alias has the form:
# name=value
# For example adding
# "sideeffect=@par Side Effects:\n"
# will allow you to put the command \sideeffect (or @sideeffect) in the
# documentation, which will result in a user-defined paragraph with heading
# "Side Effects:". You can put \n's in the value part of an alias to insert
# newlines.
ALIASES =
# This tag can be used to specify a number of word-keyword mappings (TCL only).
# A mapping has the form "name=value". For example adding "class=itcl::class"
# will allow you to use the command class in the itcl::class meaning.
TCL_SUBST =
# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
# only. Doxygen will then generate output that is more tailored for C. For
# instance, some of the names that are used will be different. The list of all
# members will be omitted, etc.
# The default value is: NO.
OPTIMIZE_OUTPUT_FOR_C = NO
# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
# Python sources only. Doxygen will then generate output that is more tailored
# for that language. For instance, namespaces will be presented as packages,
# qualified scopes will look different, etc.
# The default value is: NO.
OPTIMIZE_OUTPUT_JAVA = NO
# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
# sources. Doxygen will then generate output that is tailored for Fortran.
# The default value is: NO.
OPTIMIZE_FOR_FORTRAN = NO
# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
# sources. Doxygen will then generate output that is tailored for VHDL.
# The default value is: NO.
OPTIMIZE_OUTPUT_VHDL = NO
# Doxygen selects the parser to use depending on the extension of the files it
# parses. With this tag you can assign which parser to use for a given
# extension. Doxygen has a built-in mapping, but you can override or extend it
# using this tag. The format is ext=language, where ext is a file extension, and
# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran:
# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran:
# Fortran. In the latter case the parser tries to guess whether the code is fixed
# or free formatted code, this is the default for Fortran type files), VHDL. For
# instance to make doxygen treat .inc files as Fortran files (default is PHP),
# and .f files as C (default is Fortran), use: inc=Fortran f=C.
#
# Note: For files without extension you can use no_extension as a placeholder.
#
# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
# the files are not read by doxygen.
EXTENSION_MAPPING =
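# Since FILE_PATTERNS below lists the custom extensions *.cu and *.cuh, one
# possible mapping (left commented out, so nothing changes) would be to parse
# the CUDA sources as C++:
# EXTENSION_MAPPING = cu=C++ cuh=C++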
# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
# according to the Markdown format, which allows for more readable
# documentation. See http://daringfireball.net/projects/markdown/ for details.
# The output of markdown processing is further processed by doxygen, so you can
# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
# case of backward compatibilities issues.
# The default value is: YES.
MARKDOWN_SUPPORT = YES
# When enabled doxygen tries to link words that correspond to documented
# classes, or namespaces to their corresponding documentation. Such a link can
# be prevented in individual cases by putting a % sign in front of the word or
# globally by setting AUTOLINK_SUPPORT to NO.
# The default value is: YES.
AUTOLINK_SUPPORT = YES
# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
# to include (a tag file for) the STL sources as input, then you should set this
# tag to YES in order to let doxygen match functions declarations and
# definitions whose arguments contain STL classes (e.g. func(std::string);
# versus func(std::string) {}). This also makes the inheritance and collaboration
# diagrams that involve STL classes more complete and accurate.
# The default value is: NO.
BUILTIN_STL_SUPPORT = YES
# If you use Microsoft's C++/CLI language, you should set this option to YES to
# enable parsing support.
# The default value is: NO.
CPP_CLI_SUPPORT = NO
# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
# will parse them like normal C++ but will assume all classes use public instead
# of private inheritance when no explicit protection keyword is present.
# The default value is: NO.
SIP_SUPPORT = NO
# For Microsoft's IDL there are propget and propput attributes to indicate
# getter and setter methods for a property. Setting this option to YES will make
# doxygen to replace the get and set methods by a property in the documentation.
# This will only work if the methods are indeed getting or setting a simple
# type. If this is not the case, or you want to show the methods anyway, you
# should set this option to NO.
# The default value is: YES.
IDL_PROPERTY_SUPPORT = YES
# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
# tag is set to YES then doxygen will reuse the documentation of the first
# member in the group (if any) for the other members of the group. By default
# all members of a group must be documented explicitly.
# The default value is: NO.
DISTRIBUTE_GROUP_DOC = NO
# If one adds a struct or class to a group and this option is enabled, then also
# any nested class or struct is added to the same group. By default this option
# is disabled and one has to add nested compounds explicitly via \ingroup.
# The default value is: NO.
GROUP_NESTED_COMPOUNDS = NO
# Set the SUBGROUPING tag to YES to allow class member groups of the same type
# (for instance a group of public functions) to be put as a subgroup of that
# type (e.g. under the Public Functions section). Set it to NO to prevent
# subgrouping. Alternatively, this can be done per class using the
# \nosubgrouping command.
# The default value is: YES.
SUBGROUPING = YES
# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
# are shown inside the group in which they are included (e.g. using \ingroup)
# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
# and RTF).
#
# Note that this feature does not work in combination with
# SEPARATE_MEMBER_PAGES.
# The default value is: NO.
INLINE_GROUPED_CLASSES = NO
# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
# with only public data fields or simple typedef fields will be shown inline in
# the documentation of the scope in which they are defined (i.e. file,
# namespace, or group documentation), provided this scope is documented. If set
# to NO, structs, classes, and unions are shown on a separate page (for HTML and
# Man pages) or section (for LaTeX and RTF).
# The default value is: NO.
INLINE_SIMPLE_STRUCTS = NO
# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
# enum is documented as struct, union, or enum with the name of the typedef. So
# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
# with name TypeT. When disabled the typedef will appear as a member of a file,
# namespace, or class. And the struct will be named TypeS. This can typically be
# useful for C code in case the coding convention dictates that all compound
# types are typedef'ed and only the typedef is referenced, never the tag name.
# The default value is: NO.
TYPEDEF_HIDES_STRUCT = NO
# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
# cache is used to resolve symbols given their name and scope. Since this can be
# an expensive process and often the same symbol appears multiple times in the
# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
# doxygen will become slower. If the cache is too large, memory is wasted. The
# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
# symbols. At the end of a run doxygen will report the cache usage and suggest
# the optimal cache size from a speed point of view.
# Minimum value: 0, maximum value: 9, default value: 0.
LOOKUP_CACHE_SIZE = 0
#---------------------------------------------------------------------------
# Build related configuration options
#---------------------------------------------------------------------------
# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in
# documentation are documented, even if no documentation was available. Private
# class members and static file members will be hidden unless the
# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
# Note: This will also disable the warnings about undocumented members that are
# normally produced when WARNINGS is set to YES.
# The default value is: NO.
EXTRACT_ALL = NO
# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will
# be included in the documentation.
# The default value is: NO.
EXTRACT_PRIVATE = NO
# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
# scope will be included in the documentation.
# The default value is: NO.
EXTRACT_PACKAGE = NO
# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be
# included in the documentation.
# The default value is: NO.
EXTRACT_STATIC = NO
# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined
# locally in source files will be included in the documentation. If set to NO,
# only classes defined in header files are included. Does not have any effect
# for Java sources.
# The default value is: YES.
EXTRACT_LOCAL_CLASSES = YES
# This flag is only useful for Objective-C code. If set to YES, local methods,
# which are defined in the implementation section but not in the interface are
# included in the documentation. If set to NO, only methods in the interface are
# included.
# The default value is: NO.
EXTRACT_LOCAL_METHODS = NO
# If this flag is set to YES, the members of anonymous namespaces will be
# extracted and appear in the documentation as a namespace called
# 'anonymous_namespace{file}', where file will be replaced with the base name of
# the file that contains the anonymous namespace. By default anonymous namespaces
# are hidden.
# The default value is: NO.
EXTRACT_ANON_NSPACES = NO
# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
# undocumented members inside documented classes or files. If set to NO these
# members will be included in the various overviews, but no documentation
# section is generated. This option has no effect if EXTRACT_ALL is enabled.
# The default value is: NO.
HIDE_UNDOC_MEMBERS = NO
# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
# undocumented classes that are normally visible in the class hierarchy. If set
# to NO, these classes will be included in the various overviews. This option
# has no effect if EXTRACT_ALL is enabled.
# The default value is: NO.
HIDE_UNDOC_CLASSES = NO
# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
# (class|struct|union) declarations. If set to NO, these declarations will be
# included in the documentation.
# The default value is: NO.
HIDE_FRIEND_COMPOUNDS = NO
# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
# documentation blocks found inside the body of a function. If set to NO, these
# blocks will be appended to the function's detailed documentation block.
# The default value is: NO.
HIDE_IN_BODY_DOCS = NO
# The INTERNAL_DOCS tag determines if documentation that is typed after a
# \internal command is included. If the tag is set to NO then the documentation
# will be excluded. Set it to YES to include the internal documentation.
# The default value is: NO.
INTERNAL_DOCS = NO
# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
# names in lower-case letters. If set to YES, upper-case letters are also
# allowed. This is useful if you have classes or files whose names only differ
# in case and if your file system supports case sensitive file names. Windows
# and Mac users are advised to set this option to NO.
# The default value is: system dependent.
CASE_SENSE_NAMES = YES
# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
# their full class and namespace scopes in the documentation. If set to YES, the
# scope will be hidden.
# The default value is: NO.
HIDE_SCOPE_NAMES = NO
# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will
# append additional text to a page's title, such as Class Reference. If set to
# YES the compound reference will be hidden.
# The default value is: NO.
HIDE_COMPOUND_REFERENCE= NO
# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
# the files that are included by a file in the documentation of that file.
# The default value is: YES.
SHOW_INCLUDE_FILES = NO
# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
# grouped member an include statement to the documentation, telling the reader
# which file to include in order to use the member.
# The default value is: NO.
SHOW_GROUPED_MEMB_INC = NO
# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
# files with double quotes in the documentation rather than with sharp brackets.
# The default value is: NO.
FORCE_LOCAL_INCLUDES = NO
# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
# documentation for inline members.
# The default value is: YES.
INLINE_INFO = YES
# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
# (detailed) documentation of file and class members alphabetically by member
# name. If set to NO, the members will appear in declaration order.
# The default value is: YES.
SORT_MEMBER_DOCS = YES
# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
# descriptions of file, namespace and class members alphabetically by member
# name. If set to NO, the members will appear in declaration order. Note that
# this will also influence the order of the classes in the class list.
# The default value is: NO.
SORT_BRIEF_DOCS = NO
# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
# (brief and detailed) documentation of class members so that constructors and
# destructors are listed first. If set to NO the constructors will appear in the
# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
# member documentation.
# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
# detailed member documentation.
# The default value is: NO.
SORT_MEMBERS_CTORS_1ST = NO
# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
# of group names into alphabetical order. If set to NO the group names will
# appear in their defined order.
# The default value is: NO.
SORT_GROUP_NAMES = NO
# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
# fully-qualified names, including namespaces. If set to NO, the class list will
# be sorted only by class name, not including the namespace part.
# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
# Note: This option applies only to the class list, not to the alphabetical
# list.
# The default value is: NO.
SORT_BY_SCOPE_NAME = NO
# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
# type resolution of all parameters of a function it will reject a match between
# the prototype and the implementation of a member function even if there is
# only one candidate or it is obvious which candidate to choose by doing a
# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
# accept a match between prototype and implementation in such cases.
# The default value is: NO.
STRICT_PROTO_MATCHING = NO
# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo
# list. This list is created by putting \todo commands in the documentation.
# The default value is: YES.
GENERATE_TODOLIST = YES
# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test
# list. This list is created by putting \test commands in the documentation.
# The default value is: YES.
GENERATE_TESTLIST = YES
# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug
# list. This list is created by putting \bug commands in the documentation.
# The default value is: YES.
GENERATE_BUGLIST = YES
# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO)
# the deprecated list. This list is created by putting \deprecated commands in
# the documentation.
# The default value is: YES.
GENERATE_DEPRECATEDLIST= YES
# The ENABLED_SECTIONS tag can be used to enable conditional documentation
# sections, marked by \if <section_label> ... \endif and \cond <section_label>
# ... \endcond blocks.
ENABLED_SECTIONS =
# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
# initial value of a variable or macro / define can have for it to appear in the
# documentation. If the initializer consists of more lines than specified here
# it will be hidden. Use a value of 0 to hide initializers completely. The
# appearance of the value of individual variables and macros / defines can be
# controlled using \showinitializer or \hideinitializer command in the
# documentation regardless of this setting.
# Minimum value: 0, maximum value: 10000, default value: 30.
MAX_INITIALIZER_LINES = 30
# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
# the bottom of the documentation of classes and structs. If set to YES, the
# list will mention the files that were used to generate the documentation.
# The default value is: YES.
SHOW_USED_FILES = YES
# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
# will remove the Files entry from the Quick Index and from the Folder Tree View
# (if specified).
# The default value is: YES.
SHOW_FILES = YES
# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
# page. This will remove the Namespaces entry from the Quick Index and from the
# Folder Tree View (if specified).
# The default value is: YES.
SHOW_NAMESPACES = YES
# The FILE_VERSION_FILTER tag can be used to specify a program or script that
# doxygen should invoke to get the current version for each file (typically from
# the version control system). Doxygen will invoke the program by executing (via
# popen()) the command <command> <input-file>, where <command> is the value of the
# FILE_VERSION_FILTER tag, and <input-file> is the name of an input file provided
# by doxygen. Whatever the program writes to standard output is used as the file
# version. For an example see the documentation.
FILE_VERSION_FILTER =
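# A minimal sketch, assuming the sources live in a git checkout: the filter
# below would print the abbreviated hash of the last commit that touched each
# file. It is left commented out, so no filter is actually run:
# FILE_VERSION_FILTER = "git log -n 1 --format=%h --"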
# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
# by doxygen. The layout file controls the global structure of the generated
# output files in an output format independent way. To create the layout file
# that represents doxygen's defaults, run doxygen with the -l option. You can
# optionally specify a file name after the option, if omitted DoxygenLayout.xml
# will be used as the name of the layout file.
#
# Note that if you run doxygen from a directory containing a file called
# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
# tag is left empty.
LAYOUT_FILE =
# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
# the reference definitions. This must be a list of .bib files. The .bib
# extension is automatically appended if omitted. This requires the bibtex tool
# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
# For LaTeX the style of the bibliography can be controlled using
# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
# search path. See also \cite for info how to create references.
CITE_BIB_FILES =
#---------------------------------------------------------------------------
# Configuration options related to warning and progress messages
#---------------------------------------------------------------------------
# The QUIET tag can be used to turn on/off the messages that are generated to
# standard output by doxygen. If QUIET is set to YES this implies that the
# messages are off.
# The default value is: NO.
QUIET = NO
# The WARNINGS tag can be used to turn on/off the warning messages that are
# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
# this implies that the warnings are on.
#
# Tip: Turn warnings on while writing the documentation.
# The default value is: YES.
WARNINGS = YES
# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate
# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
# will automatically be disabled.
# The default value is: YES.
WARN_IF_UNDOCUMENTED = NO
# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
# potential errors in the documentation, such as not documenting some parameters
# in a documented function, or documenting parameters that don't exist or using
# markup commands wrongly.
# The default value is: YES.
WARN_IF_DOC_ERROR = YES
# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
# are documented, but have no documentation for their parameters or return
# value. If set to NO, doxygen will only warn about wrong or incomplete
# parameter documentation, but not about the absence of documentation.
# The default value is: NO.
WARN_NO_PARAMDOC = NO
# The WARN_FORMAT tag determines the format of the warning messages that doxygen
# can produce. The string should contain the $file, $line, and $text tags, which
# will be replaced by the file and line number from which the warning originated
# and the warning text. Optionally the format may contain $version, which will
# be replaced by the version of the file (if it could be obtained via
# FILE_VERSION_FILTER)
# The default value is: $file:$line: $text.
WARN_FORMAT = "$file:$line: $text"
# The WARN_LOGFILE tag can be used to specify a file to which warning and error
# messages should be written. If left blank the output is written to standard
# error (stderr).
WARN_LOGFILE =
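# For example (hypothetical file name, left commented out so warnings keep
# going to stderr):
# WARN_LOGFILE = doxygen_warnings.log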
#---------------------------------------------------------------------------
# Configuration options related to the input files
#---------------------------------------------------------------------------
# The INPUT tag is used to specify the files and/or directories that contain
# documented source files. You may enter file names like myfile.cpp or
# directories like /usr/src/myproject. Separate the files or directories with
# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
# Note: If this tag is empty the current directory is searched.
INPUT = @PROJ_ROOT@/paddle
# This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
# documentation (see: http://www.gnu.org/software/libiconv) for the list of
# possible encodings.
# The default value is: UTF-8.
INPUT_ENCODING = UTF-8
# If the value of the INPUT tag contains directories, you can use the
# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
# *.h) to filter out the source-files in the directories.
#
# Note that for custom extensions or not directly supported extensions you also
# need to set EXTENSION_MAPPING for the extension otherwise the files are not
# read by doxygen.
#
# If left blank the following patterns are tested: *.c, *.cc, *.cxx, *.cpp,
# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd,
# *.vhdl, *.ucf, *.qsf, *.as and *.js.
FILE_PATTERNS = *.c *.cc *.cpp *.cu *.h *.hpp *.cuh *.ph
# The RECURSIVE tag can be used to specify whether or not subdirectories should
# be searched for input files as well.
# The default value is: NO.
RECURSIVE = YES
# The EXCLUDE tag can be used to specify files and/or directories that should be
# excluded from the INPUT source files. This way you can easily exclude a
# subdirectory from a directory tree whose root is specified with the INPUT tag.
#
# Note that relative paths are relative to the directory from which doxygen is
# run.
EXCLUDE =
# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
# directories that are symbolic links (a Unix file system feature) are excluded
# from the input.
# The default value is: NO.
EXCLUDE_SYMLINKS = NO
# If the value of the INPUT tag contains directories, you can use the
# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
# certain files from those directories.
#
# Note that the wildcards are matched against the file with absolute path, so to
# exclude all test directories for example use the pattern */test/*
EXCLUDE_PATTERNS = */x86_64-scm-linux-gnu/* */internals/* */mkl/* */test/* */tests/* */platform/*
# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
# (namespaces, classes, functions, etc.) that should be excluded from the
# output. The symbol name can be a fully qualified name, a word, or if the
# wildcard * is used, a substring. Examples: ANamespace, AClass,
# AClass::ANamespace, ANamespace::*Test
#
# Note that the wildcards are matched against the file with absolute path, so to
# exclude all test directories use the pattern */test/*
EXCLUDE_SYMBOLS =
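# A hypothetical exclusion (not enabled here) that would hide everything
# inside an internal detail namespace from the generated output:
# EXCLUDE_SYMBOLS = *::details::*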
# The EXAMPLE_PATH tag can be used to specify one or more files or directories
# that contain example code fragments that are included (see the \include
# command).
EXAMPLE_PATH =
# If the value of the EXAMPLE_PATH tag contains directories, you can use the
# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
# *.h) to filter out the source-files in the directories. If left blank all
# files are included.
EXAMPLE_PATTERNS =
# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
# searched for input files to be used with the \include or \dontinclude commands
# irrespective of the value of the RECURSIVE tag.
# The default value is: NO.
EXAMPLE_RECURSIVE = NO
# The IMAGE_PATH tag can be used to specify one or more files or directories
# that contain images that are to be included in the documentation (see the
# \image command).
IMAGE_PATH =
# The INPUT_FILTER tag can be used to specify a program that doxygen should
# invoke to filter for each input file. Doxygen will invoke the filter program
# by executing (via popen()) the command:
#
# <filter> <input-file>
#
# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
# name of an input file. Doxygen will then use the output that the filter
# program writes to standard output. If FILTER_PATTERNS is specified, this tag
# will be ignored.
#
# Note that the filter must not add or remove lines; it is applied before the
# code is scanned, but not when the output code is generated. If lines are added
# or removed, the anchors will not be placed correctly.
INPUT_FILTER =
# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
# basis. Doxygen will compare the file name with each pattern and apply the
# filter if there is a match. The filters are a list of the form: pattern=filter
# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
# patterns match the file name, INPUT_FILTER is applied.
FILTER_PATTERNS =
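# As an illustration only (the script name is hypothetical and not part of
# this project), a per-pattern filter for the CUDA sources could be declared
# as:
# FILTER_PATTERNS = *.cu=my_cuda_filter.sh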
# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
# INPUT_FILTER) will also be used to filter the input files that are used for
# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
# The default value is: NO.
FILTER_SOURCE_FILES = NO
# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
# it is also possible to disable source filtering for a specific pattern using
# *.ext= (so without naming a filter).
# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
FILTER_SOURCE_PATTERNS =
# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
# is part of the input, its contents will be placed on the main page
# (index.html). This can be useful if you have a project hosted on, for instance, GitHub
# and want to reuse the introduction page also for the doxygen output.
USE_MDFILE_AS_MAINPAGE =
#---------------------------------------------------------------------------
# Configuration options related to source browsing
#---------------------------------------------------------------------------
# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
# generated. Documented entities will be cross-referenced with these sources.
#
# Note: To get rid of all source code in the generated output, make sure that
# also VERBATIM_HEADERS is set to NO.
# The default value is: NO.
SOURCE_BROWSER = NO
# Setting the INLINE_SOURCES tag to YES will include the body of functions,
# classes and enums directly into the documentation.
# The default value is: NO.
INLINE_SOURCES = NO
# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
# special comment blocks from generated source code fragments. Normal C, C++ and
# Fortran comments will always remain visible.
# The default value is: YES.
STRIP_CODE_COMMENTS = YES
# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
# function all documented functions referencing it will be listed.
# The default value is: NO.
REFERENCED_BY_RELATION = NO
# If the REFERENCES_RELATION tag is set to YES then for each documented function
# all documented entities called/used by that function will be listed.
# The default value is: NO.
REFERENCES_RELATION = NO
# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
# to YES then the hyperlinks from functions in REFERENCES_RELATION and
# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
# link to the documentation.
# The default value is: YES.
REFERENCES_LINK_SOURCE = YES
# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
# source code will show a tooltip with additional information such as prototype,
# brief description and links to the definition and documentation. Since this
# will make the HTML file larger and loading of large files a bit slower, you
# can opt to disable this feature.
# The default value is: YES.
# This tag requires that the tag SOURCE_BROWSER is set to YES.
SOURCE_TOOLTIPS = YES
# If the USE_HTAGS tag is set to YES then the references to source code will
# point to the HTML generated by the htags(1) tool instead of doxygen built-in
# source browser. The htags tool is part of GNU's global source tagging system
# (see http://www.gnu.org/software/global/global.html). You will need version
# 4.8.6 or higher.
#
# To use it do the following:
# - Install the latest version of global
# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
# - Make sure the INPUT points to the root of the source tree
# - Run doxygen as normal
#
# Doxygen will invoke htags (and that will in turn invoke gtags), so these
# tools must be available from the command line (i.e. in the search path).
#
# The result: instead of the source browser generated by doxygen, the links to
# source code will now point to the output of htags.
# The default value is: NO.
# This tag requires that the tag SOURCE_BROWSER is set to YES.
USE_HTAGS = NO
# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a
# verbatim copy of the header file for each class for which an include is
# specified. Set to NO to disable this.
# See also: Section \class.
# The default value is: YES.
VERBATIM_HEADERS = YES
#---------------------------------------------------------------------------
# Configuration options related to the alphabetical class index
#---------------------------------------------------------------------------
# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
# compounds will be generated. Enable this if the project contains a lot of
# classes, structs, unions or interfaces.
# The default value is: YES.
ALPHABETICAL_INDEX = YES
# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
# which the alphabetical index list will be split.
# Minimum value: 1, maximum value: 20, default value: 5.
# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
COLS_IN_ALPHA_INDEX = 5
# In case all classes in a project start with a common prefix, all classes will
# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
# can be used to specify a prefix (or a list of prefixes) that should be ignored
# while generating the index headers.
# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
IGNORE_PREFIX =
#---------------------------------------------------------------------------
# Configuration options related to the HTML output
#---------------------------------------------------------------------------
# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output
# The default value is: YES.
GENERATE_HTML = NO
# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
# it.
# The default directory is: html.
# This tag requires that the tag GENERATE_HTML is set to YES.
HTML_OUTPUT = html
# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
# generated HTML page (for example: .htm, .php, .asp).
# The default value is: .html.
# This tag requires that the tag GENERATE_HTML is set to YES.
HTML_FILE_EXTENSION = .html
# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
# each generated HTML page. If the tag is left blank doxygen will generate a
# standard header.
#
# For valid HTML the header file must include any scripts and style sheets
# that doxygen needs, which depend on the configuration options used (e.g.
# the setting GENERATE_TREEVIEW). It is highly recommended to start with a
# default header using
# doxygen -w html new_header.html new_footer.html new_stylesheet.css
# YourConfigFile
# and then modify the file new_header.html. See also section "Doxygen usage"
# for information on how to generate the default header that doxygen normally
# uses.
# Note: The header is subject to change so you typically have to regenerate the
# default header when upgrading to a newer version of doxygen. For a description
# of the possible markers and block names see the documentation.
# This tag requires that the tag GENERATE_HTML is set to YES.
HTML_HEADER =
# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
# generated HTML page. If the tag is left blank doxygen will generate a standard
# footer. See HTML_HEADER for more information on how to generate a default
# footer and what special commands can be used inside the footer. See also
# section "Doxygen usage" for information on how to generate the default footer
# that doxygen normally uses.
# This tag requires that the tag GENERATE_HTML is set to YES.
HTML_FOOTER =
# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
# sheet that is used by each HTML page. It can be used to fine-tune the look of
# the HTML output. If left blank doxygen will generate a default style sheet.
# See also section "Doxygen usage" for information on how to generate the style
# sheet that doxygen normally uses.
# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
# it is more robust and this tag (HTML_STYLESHEET) will in the future become
# obsolete.
# This tag requires that the tag GENERATE_HTML is set to YES.
HTML_STYLESHEET =
# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
# cascading style sheets that are included after the standard style sheets
# created by doxygen. Using this option one can overrule certain style aspects.
# This is preferred over using HTML_STYLESHEET since it does not replace the
# standard style sheet and is therefore more robust against future updates.
# Doxygen will copy the style sheet files to the output directory.
# Note: The order of the extra style sheet files is of importance (e.g. the last
# style sheet in the list overrules the setting of the previous ones in the
# list). For an example see the documentation.
# This tag requires that the tag GENERATE_HTML is set to YES.
HTML_EXTRA_STYLESHEET =
# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
# other source files which should be copied to the HTML output directory. Note
# that these files will be copied to the base HTML output directory. Use the
# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
# files will be copied as-is; there are no commands or markers available.
# This tag requires that the tag GENERATE_HTML is set to YES.
HTML_EXTRA_FILES =
# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
# will adjust the colors in the style sheet and background images according to
# this color. Hue is specified as an angle on a colorwheel, see
# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
# purple, and 360 is red again.
# Minimum value: 0, maximum value: 359, default value: 220.
# This tag requires that the tag GENERATE_HTML is set to YES.
HTML_COLORSTYLE_HUE = 220
# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
# in the HTML output. For a value of 0 the output will use grayscales only. A
# value of 255 will produce the most vivid colors.
# Minimum value: 0, maximum value: 255, default value: 100.
# This tag requires that the tag GENERATE_HTML is set to YES.
HTML_COLORSTYLE_SAT = 100
# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
# luminance component of the colors in the HTML output. Values below 100
# gradually make the output lighter, whereas values above 100 make the output
# darker. The value divided by 100 is the actual gamma applied, so 80 represents
# a gamma of 0.8; the value 220 represents a gamma of 2.2, and 100 does not
# change the gamma.
# Minimum value: 40, maximum value: 240, default value: 80.
# This tag requires that the tag GENERATE_HTML is set to YES.
HTML_COLORSTYLE_GAMMA = 80
# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
# page will contain the date and time when the page was generated. Setting this
# to YES can help to show when doxygen was last run and thus if the
# documentation is up to date.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.
HTML_TIMESTAMP = NO
# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
# documentation will contain sections that can be hidden and shown after the
# page has loaded.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.
HTML_DYNAMIC_SECTIONS = NO
# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
# shown in the various tree structured indices initially; the user can expand
# and collapse entries dynamically later on. Doxygen will expand the tree to
# such a level that at most the specified number of entries are visible (unless
# a fully collapsed tree already exceeds this amount). So setting the number of
# entries to 1 will produce a fully collapsed tree by default. 0 is a special
# value representing an infinite number of entries and will result in a fully expanded
# tree by default.
# Minimum value: 0, maximum value: 9999, default value: 100.
# This tag requires that the tag GENERATE_HTML is set to YES.
HTML_INDEX_NUM_ENTRIES = 100
# If the GENERATE_DOCSET tag is set to YES, additional index files will be
# generated that can be used as input for Apple's Xcode 3 integrated development
# environment (see: http://developer.apple.com/tools/xcode/), introduced with
# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
# Makefile in the HTML output directory. Running make will produce the docset in
# that directory and running make install will install the docset in
# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
# for more information.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.
GENERATE_DOCSET = NO
# This tag determines the name of the docset feed. A documentation feed provides
# an umbrella under which multiple documentation sets from a single provider
# (such as a company or product suite) can be grouped.
# The default value is: Doxygen generated docs.
# This tag requires that the tag GENERATE_DOCSET is set to YES.
DOCSET_FEEDNAME = "Doxygen generated docs"
# This tag specifies a string that should uniquely identify the documentation
# set bundle. This should be a reverse domain-name style string, e.g.
# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
# The default value is: org.doxygen.Project.
# This tag requires that the tag GENERATE_DOCSET is set to YES.
DOCSET_BUNDLE_ID = org.doxygen.Project
# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
# the documentation publisher. This should be a reverse domain-name style
# string, e.g. com.mycompany.MyDocSet.documentation.
# The default value is: org.doxygen.Publisher.
# This tag requires that the tag GENERATE_DOCSET is set to YES.
DOCSET_PUBLISHER_ID = org.doxygen.Publisher
# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
# The default value is: Publisher.
# This tag requires that the tag GENERATE_DOCSET is set to YES.
DOCSET_PUBLISHER_NAME = Publisher
# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
# Windows.
#
# The HTML Help Workshop contains a compiler that can convert all HTML output
# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
# files are now used as the Windows 98 help format, and will replace the old
# Windows help format (.hlp) on all Windows platforms in the future. Compressed
# HTML files also contain an index, a table of contents, and you can search for
# words in the documentation. The HTML workshop also contains a viewer for
# compressed HTML files.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.
GENERATE_HTMLHELP = NO
# The CHM_FILE tag can be used to specify the file name of the resulting .chm
# file. You can add a path in front of the file if the result should not be
# written to the html output directory.
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
CHM_FILE =
# The HHC_LOCATION tag can be used to specify the location (absolute path
# including file name) of the HTML help compiler (hhc.exe). If non-empty,
# doxygen will try to run the HTML help compiler on the generated index.hhp.
# The file has to be specified with full path.
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
HHC_LOCATION =
# The GENERATE_CHI flag controls if a separate .chi index file is generated
# (YES) or whether it should be included in the master .chm file (NO).
# The default value is: NO.
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
GENERATE_CHI = NO
# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
# and project file content.
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
CHM_INDEX_ENCODING =
# The BINARY_TOC flag controls whether a binary table of contents is generated
# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
# enables the Previous and Next buttons.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
BINARY_TOC = NO
# The TOC_EXPAND flag can be set to YES to add extra items for group members to
# the table of contents of the HTML help documentation and to the tree view.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
TOC_EXPAND = NO
# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
# (.qch) of the generated HTML documentation.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.
GENERATE_QHP = NO
# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
# the file name of the resulting .qch file. The path specified is relative to
# the HTML output folder.
# This tag requires that the tag GENERATE_QHP is set to YES.
QCH_FILE =
# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
# Project output. For more information please see Qt Help Project / Namespace
# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
# The default value is: org.doxygen.Project.
# This tag requires that the tag GENERATE_QHP is set to YES.
QHP_NAMESPACE = org.doxygen.Project
# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
# Help Project output. For more information please see Qt Help Project / Virtual
# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
# folders).
# The default value is: doc.
# This tag requires that the tag GENERATE_QHP is set to YES.
QHP_VIRTUAL_FOLDER = doc
# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
# filter to add. For more information please see Qt Help Project / Custom
# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
# filters).
# This tag requires that the tag GENERATE_QHP is set to YES.
QHP_CUST_FILTER_NAME =
# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
# custom filter to add. For more information please see Qt Help Project / Custom
# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
# filters).
# This tag requires that the tag GENERATE_QHP is set to YES.
QHP_CUST_FILTER_ATTRS =
# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
# project's filter section matches. Qt Help Project / Filter Attributes (see:
# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
# This tag requires that the tag GENERATE_QHP is set to YES.
QHP_SECT_FILTER_ATTRS =
# The QHG_LOCATION tag can be used to specify the location of Qt's
# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
# generated .qhp file.
# This tag requires that the tag GENERATE_QHP is set to YES.
QHG_LOCATION =
# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
# generated, together with the HTML files, they form an Eclipse help plugin. To
# install this plugin and make it available under the help contents menu in
# Eclipse, the contents of the directory containing the HTML and XML files needs
# to be copied into the plugins directory of eclipse. The name of the directory
# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
# After copying Eclipse needs to be restarted before the help appears.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.
GENERATE_ECLIPSEHELP = NO
# A unique identifier for the Eclipse help plugin. When installing the plugin
# the directory name containing the HTML and XML files should also have this
# name. Each documentation set should have its own identifier.
# The default value is: org.doxygen.Project.
# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
ECLIPSE_DOC_ID = org.doxygen.Project
# If you want full control over the layout of the generated HTML pages it might
# be necessary to disable the index and replace it with your own. The
# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
# of each HTML page. A value of NO enables the index and the value YES disables
# it. Since the tabs in the index contain the same information as the navigation
# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.
DISABLE_INDEX = NO
# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
# structure should be generated to display hierarchical information. If the tag
# value is set to YES, a side panel will be generated containing a tree-like
# index structure (just like the one that is generated for HTML Help). For this
# to work a browser that supports JavaScript, DHTML, CSS and frames is required
# (i.e. any modern browser). Windows users are probably better off using the
# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
# further fine-tune the look of the index. As an example, the default style
# sheet generated by doxygen has an example that shows how to put an image at
# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
# the same information as the tab index, you could consider setting
# DISABLE_INDEX to YES when enabling this option.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.
GENERATE_TREEVIEW = NO
# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
# doxygen will group on one line in the generated HTML documentation.
#
# Note that a value of 0 will completely suppress the enum values from appearing
# in the overview section.
# Minimum value: 0, maximum value: 20, default value: 4.
# This tag requires that the tag GENERATE_HTML is set to YES.
ENUM_VALUES_PER_LINE = 4
# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
# to set the initial width (in pixels) of the frame in which the tree is shown.
# Minimum value: 0, maximum value: 1500, default value: 250.
# This tag requires that the tag GENERATE_HTML is set to YES.
TREEVIEW_WIDTH = 250
# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
# external symbols imported via tag files in a separate window.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.
EXT_LINKS_IN_WINDOW = NO
# Use this tag to change the font size of LaTeX formulas included as images in
# the HTML documentation. When you change the font size after a successful
# doxygen run you need to manually remove any form_*.png images from the HTML
# output directory to force them to be regenerated.
# Minimum value: 8, maximum value: 50, default value: 10.
# This tag requires that the tag GENERATE_HTML is set to YES.
FORMULA_FONTSIZE = 10
# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
# generated for formulas are transparent PNGs. Transparent PNGs are not
# supported properly for IE 6.0, but are supported on all modern browsers.
#
# Note that when changing this option you need to delete any form_*.png files in
# the HTML output directory before the changes have effect.
# The default value is: YES.
# This tag requires that the tag GENERATE_HTML is set to YES.
FORMULA_TRANSPARENT = YES
# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
# http://www.mathjax.org) which uses client side Javascript for the rendering
# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
# installed or if you want the formulas to look prettier in the HTML output. When
# enabled you may also need to install MathJax separately and configure the path
# to it using the MATHJAX_RELPATH option.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.
USE_MATHJAX = NO
# When MathJax is enabled you can set the default output format to be used for
# the MathJax output. See the MathJax site (see:
# http://docs.mathjax.org/en/latest/output.html) for more details.
# Possible values are: HTML-CSS (which is slower, but has the best
# compatibility), NativeMML (i.e. MathML) and SVG.
# The default value is: HTML-CSS.
# This tag requires that the tag USE_MATHJAX is set to YES.
MATHJAX_FORMAT = HTML-CSS
# When MathJax is enabled you need to specify the location relative to the HTML
# output directory using the MATHJAX_RELPATH option. The destination directory
# should contain the MathJax.js script. For instance, if the mathjax directory
# is located at the same level as the HTML output directory, then
# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
# Content Delivery Network so you can quickly see the result without installing
# MathJax. However, it is strongly recommended to install a local copy of
# MathJax from http://www.mathjax.org before deployment.
# The default value is: http://cdn.mathjax.org/mathjax/latest.
# This tag requires that the tag USE_MATHJAX is set to YES.
MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest
# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
# extension names that should be enabled during MathJax rendering. For example
# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
# This tag requires that the tag USE_MATHJAX is set to YES.
MATHJAX_EXTENSIONS =
# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
# of code that will be used on startup of the MathJax code. See the MathJax site
# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
# example see the documentation.
# This tag requires that the tag USE_MATHJAX is set to YES.
MATHJAX_CODEFILE =
# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
# the HTML output. The underlying search engine uses javascript and DHTML and
# should work on any modern browser. Note that when using HTML help
# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
# there is already a search function so this one should typically be disabled.
# For large projects the javascript based search engine can be slow, then
# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
# search using the keyboard; to jump to the search box use <access key> + S
# (what the <access key> is depends on the OS and browser, but it is typically
# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down
# key> to jump into the search results window, the results can be navigated
# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel
# the search. The filter options can be selected when the cursor is inside the
# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys>
# to select a filter and <Enter> or <escape> to activate or cancel the filter
# option.
# The default value is: YES.
# This tag requires that the tag GENERATE_HTML is set to YES.
SEARCHENGINE = YES
# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
# implemented using a web server instead of a web client using Javascript. There
# are two flavors of web server based searching depending on the EXTERNAL_SEARCH
# setting. When disabled, doxygen will generate a PHP script for searching and
# an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
# and searching needs to be provided by external tools. See the section
# "External Indexing and Searching" for details.
# The default value is: NO.
# This tag requires that the tag SEARCHENGINE is set to YES.
SERVER_BASED_SEARCH = NO
# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
# script for searching. Instead the search results are written to an XML file
# which needs to be processed by an external indexer. Doxygen will invoke an
# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
# search results.
#
# Doxygen ships with an example indexer (doxyindexer) and search engine
# (doxysearch.cgi) which are based on the open source search engine library
# Xapian (see: http://xapian.org/).
#
# See the section "External Indexing and Searching" for details.
# The default value is: NO.
# This tag requires that the tag SEARCHENGINE is set to YES.
EXTERNAL_SEARCH = NO
# The SEARCHENGINE_URL should point to a search engine hosted by a web server
# which will return the search results when EXTERNAL_SEARCH is enabled.
#
# Doxygen ships with an example indexer (doxyindexer) and search engine
# (doxysearch.cgi) which are based on the open source search engine library
# Xapian (see: http://xapian.org/). See the section "External Indexing and
# Searching" for details.
# This tag requires that the tag SEARCHENGINE is set to YES.
SEARCHENGINE_URL =
# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
# search data is written to a file for indexing by an external tool. With the
# SEARCHDATA_FILE tag the name of this file can be specified.
# The default file is: searchdata.xml.
# This tag requires that the tag SEARCHENGINE is set to YES.
SEARCHDATA_FILE = searchdata.xml
# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
# projects and redirect the results back to the right project.
# This tag requires that the tag SEARCHENGINE is set to YES.
EXTERNAL_SEARCH_ID =
# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
# projects other than the one defined by this configuration file, but that are
# all added to the same external search index. Each project needs to have a
# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id of
# to a relative location where the documentation can be found. The format is:
# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
# This tag requires that the tag SEARCHENGINE is set to YES.
EXTRA_SEARCH_MAPPINGS =
#---------------------------------------------------------------------------
# Configuration options related to the LaTeX output
#---------------------------------------------------------------------------
# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output.
# The default value is: YES.
GENERATE_LATEX = NO
# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
# it.
# The default directory is: latex.
# This tag requires that the tag GENERATE_LATEX is set to YES.
LATEX_OUTPUT = latex
# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
# invoked.
#
# Note that when enabling USE_PDFLATEX this option is only used for generating
# bitmaps for formulas in the HTML output, but not in the Makefile that is
# written to the output directory.
# The default file is: latex.
# This tag requires that the tag GENERATE_LATEX is set to YES.
LATEX_CMD_NAME = latex
# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
# index for LaTeX.
# The default file is: makeindex.
# This tag requires that the tag GENERATE_LATEX is set to YES.
MAKEINDEX_CMD_NAME = makeindex
# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
# documents. This may be useful for small projects and may help to save some
# trees in general.
# The default value is: NO.
# This tag requires that the tag GENERATE_LATEX is set to YES.
COMPACT_LATEX = NO
# The PAPER_TYPE tag can be used to set the paper type that is used by the
# printer.
# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
# 14 inches) and executive (7.25 x 10.5 inches).
# The default value is: a4.
# This tag requires that the tag GENERATE_LATEX is set to YES.
PAPER_TYPE = a4
# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
# that should be included in the LaTeX output. The package can be specified just
# by its name or with the correct syntax as to be used with the LaTeX
# \usepackage command. To get the times font for instance you can specify :
# EXTRA_PACKAGES=times or EXTRA_PACKAGES={times}
# To use the option intlimits with the amsmath package you can specify:
# EXTRA_PACKAGES=[intlimits]{amsmath}
# If left blank no extra packages will be included.
# This tag requires that the tag GENERATE_LATEX is set to YES.
EXTRA_PACKAGES =
# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
# generated LaTeX document. The header should contain everything until the first
# chapter. If it is left blank doxygen will generate a standard header. See
# section "Doxygen usage" for information on how to let doxygen write the
# default header to a separate file.
#
# Note: Only use a user-defined header if you know what you are doing! The
# following commands have a special meaning inside the header: $title,
# $datetime, $date, $doxygenversion, $projectname, $projectnumber,
# $projectbrief, $projectlogo. Doxygen will replace $title with the empty
# string, for the replacement values of the other commands the user is referred
# to HTML_HEADER.
# This tag requires that the tag GENERATE_LATEX is set to YES.
LATEX_HEADER =
# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
# generated LaTeX document. The footer should contain everything after the last
# chapter. If it is left blank doxygen will generate a standard footer. See
# LATEX_HEADER for more information on how to generate a default footer and what
# special commands can be used inside the footer.
#
# Note: Only use a user-defined footer if you know what you are doing!
# This tag requires that the tag GENERATE_LATEX is set to YES.
LATEX_FOOTER =
# The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined
# LaTeX style sheets that are included after the standard style sheets created
# by doxygen. Using this option one can overrule certain style aspects. Doxygen
# will copy the style sheet files to the output directory.
# Note: The order of the extra style sheet files is of importance (e.g. the last
# style sheet in the list overrules the setting of the previous ones in the
# list).
# This tag requires that the tag GENERATE_LATEX is set to YES.
LATEX_EXTRA_STYLESHEET =
# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
# other source files which should be copied to the LATEX_OUTPUT output
# directory. Note that the files will be copied as-is; there are no commands or
# markers available.
# This tag requires that the tag GENERATE_LATEX is set to YES.
LATEX_EXTRA_FILES =
# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
# contain links (just like the HTML output) instead of page references. This
# makes the output suitable for online browsing using a PDF viewer.
# The default value is: YES.
# This tag requires that the tag GENERATE_LATEX is set to YES.
PDF_HYPERLINKS = YES
# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
# the PDF file directly from the LaTeX files. Set this option to YES, to get a
# higher quality PDF documentation.
# The default value is: YES.
# This tag requires that the tag GENERATE_LATEX is set to YES.
USE_PDFLATEX = YES
# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
# command to the generated LaTeX files. This will instruct LaTeX to keep running
# if errors occur, instead of asking the user for help. This option is also used
# when generating formulas in HTML.
# The default value is: NO.
# This tag requires that the tag GENERATE_LATEX is set to YES.
LATEX_BATCHMODE = NO
# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
# index chapters (such as File Index, Compound Index, etc.) in the output.
# The default value is: NO.
# This tag requires that the tag GENERATE_LATEX is set to YES.
LATEX_HIDE_INDICES = NO
# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
# code with syntax highlighting in the LaTeX output.
#
# Note that which sources are shown also depends on other settings such as
# SOURCE_BROWSER.
# The default value is: NO.
# This tag requires that the tag GENERATE_LATEX is set to YES.
LATEX_SOURCE_CODE = NO
# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
# bibliography, e.g. plainnat, or ieeetr. See
# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
# The default value is: plain.
# This tag requires that the tag GENERATE_LATEX is set to YES.
LATEX_BIB_STYLE = plain
#---------------------------------------------------------------------------
# Configuration options related to the RTF output
#---------------------------------------------------------------------------
# If the GENERATE_RTF tag is set to YES, doxygen will generate RTF output. The
# RTF output is optimized for Word 97 and may not look too pretty with other RTF
# readers/editors.
# The default value is: NO.
GENERATE_RTF = NO
# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
# it.
# The default directory is: rtf.
# This tag requires that the tag GENERATE_RTF is set to YES.
RTF_OUTPUT = rtf
# If the COMPACT_RTF tag is set to YES, doxygen generates more compact RTF
# documents. This may be useful for small projects and may help to save some
# trees in general.
# The default value is: NO.
# This tag requires that the tag GENERATE_RTF is set to YES.
COMPACT_RTF = NO
# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
# contain hyperlink fields. The RTF file will contain links (just like the HTML
# output) instead of page references. This makes the output suitable for online
# browsing using Word or some other Word compatible readers that support those
# fields.
#
# Note: WordPad (write) and others do not support links.
# The default value is: NO.
# This tag requires that the tag GENERATE_RTF is set to YES.
RTF_HYPERLINKS = NO
# Load stylesheet definitions from file. Syntax is similar to doxygen's config
# file, i.e. a series of assignments. You only have to provide replacements,
# missing definitions are set to their default value.
#
# See also section "Doxygen usage" for information on how to generate the
# default style sheet that doxygen normally uses.
# This tag requires that the tag GENERATE_RTF is set to YES.
RTF_STYLESHEET_FILE =
# Set optional variables used in the generation of an RTF document. Syntax is
# similar to doxygen's config file. A template extensions file can be generated
# using doxygen -e rtf extensionFile.
# This tag requires that the tag GENERATE_RTF is set to YES.
RTF_EXTENSIONS_FILE =
# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
# with syntax highlighting in the RTF output.
#
# Note that which sources are shown also depends on other settings such as
# SOURCE_BROWSER.
# The default value is: NO.
# This tag requires that the tag GENERATE_RTF is set to YES.
RTF_SOURCE_CODE = NO
#---------------------------------------------------------------------------
# Configuration options related to the man page output
#---------------------------------------------------------------------------
# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for
# classes and files.
# The default value is: NO.
GENERATE_MAN = NO
# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
# it. A directory man3 will be created inside the directory specified by
# MAN_OUTPUT.
# The default directory is: man.
# This tag requires that the tag GENERATE_MAN is set to YES.
MAN_OUTPUT = man
# The MAN_EXTENSION tag determines the extension that is added to the generated
# man pages. In case the manual section does not start with a number, the number
# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
# optional.
# The default value is: .3.
# This tag requires that the tag GENERATE_MAN is set to YES.
MAN_EXTENSION = .3
# The MAN_SUBDIR tag determines the name of the directory created within
# MAN_OUTPUT in which the man pages are placed. It defaults to man followed by
# MAN_EXTENSION with the initial . removed.
# This tag requires that the tag GENERATE_MAN is set to YES.
MAN_SUBDIR =
# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
# will generate one additional man file for each entity documented in the real
# man page(s). These additional files only source the real man page, but without
# them the man command would be unable to find the correct page.
# The default value is: NO.
# This tag requires that the tag GENERATE_MAN is set to YES.
MAN_LINKS = NO
#---------------------------------------------------------------------------
# Configuration options related to the XML output
#---------------------------------------------------------------------------
# If the GENERATE_XML tag is set to YES, doxygen will generate an XML file that
# captures the structure of the code including all documentation.
# The default value is: NO.
GENERATE_XML = YES
# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
# it.
# The default directory is: xml.
# This tag requires that the tag GENERATE_XML is set to YES.
XML_OUTPUT = xml
# If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program
# listings (including syntax highlighting and cross-referencing information) to
# the XML output. Note that enabling this will significantly increase the size
# of the XML output.
# The default value is: YES.
# This tag requires that the tag GENERATE_XML is set to YES.
XML_PROGRAMLISTING = YES
#---------------------------------------------------------------------------
# Configuration options related to the DOCBOOK output
#---------------------------------------------------------------------------
# If the GENERATE_DOCBOOK tag is set to YES, doxygen will generate Docbook files
# that can be used to generate PDF.
# The default value is: NO.
GENERATE_DOCBOOK = NO
# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
# front of it.
# The default directory is: docbook.
# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
DOCBOOK_OUTPUT = docbook
# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the
# program listings (including syntax highlighting and cross-referencing
# information) to the DOCBOOK output. Note that enabling this will significantly
# increase the size of the DOCBOOK output.
# The default value is: NO.
# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
DOCBOOK_PROGRAMLISTING = NO
#---------------------------------------------------------------------------
# Configuration options for the AutoGen Definitions output
#---------------------------------------------------------------------------
# If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
# AutoGen Definitions (see http://autogen.sf.net) file that captures the
# structure of the code including all documentation. Note that this feature is
# still experimental and incomplete at the moment.
# The default value is: NO.
GENERATE_AUTOGEN_DEF = NO
#---------------------------------------------------------------------------
# Configuration options related to the Perl module output
#---------------------------------------------------------------------------
# If the GENERATE_PERLMOD tag is set to YES, doxygen will generate a Perl module
# file that captures the structure of the code including all documentation.
#
# Note that this feature is still experimental and incomplete at the moment.
# The default value is: NO.
GENERATE_PERLMOD = NO
# If the PERLMOD_LATEX tag is set to YES, doxygen will generate the necessary
# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
# output from the Perl module output.
# The default value is: NO.
# This tag requires that the tag GENERATE_PERLMOD is set to YES.
PERLMOD_LATEX = NO
# If the PERLMOD_PRETTY tag is set to YES, the Perl module output will be nicely
# formatted so it can be parsed by a human reader. This is useful if you want to
# understand what is going on. On the other hand, if this tag is set to NO, the
# size of the Perl module output will be much smaller and Perl will parse it
# just the same.
# The default value is: YES.
# This tag requires that the tag GENERATE_PERLMOD is set to YES.
PERLMOD_PRETTY = YES
# The names of the make variables in the generated doxyrules.make file are
# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
# so different doxyrules.make files included by the same Makefile don't
# overwrite each other's variables.
# This tag requires that the tag GENERATE_PERLMOD is set to YES.
PERLMOD_MAKEVAR_PREFIX =
#---------------------------------------------------------------------------
# Configuration options related to the preprocessor
#---------------------------------------------------------------------------
# If the ENABLE_PREPROCESSING tag is set to YES, doxygen will evaluate all
# C-preprocessor directives found in the sources and include files.
# The default value is: YES.
ENABLE_PREPROCESSING = YES
# If the MACRO_EXPANSION tag is set to YES, doxygen will expand all macro names
# in the source code. If set to NO, only conditional compilation will be
# performed. Macro expansion can be done in a controlled way by setting
# EXPAND_ONLY_PREDEF to YES.
# The default value is: NO.
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
MACRO_EXPANSION = NO
# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
# the macro expansion is limited to the macros specified with the PREDEFINED and
# EXPAND_AS_DEFINED tags.
# The default value is: NO.
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
EXPAND_ONLY_PREDEF = NO
# If the SEARCH_INCLUDES tag is set to YES, the include files in the
# INCLUDE_PATH will be searched if a #include is found.
# The default value is: YES.
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
SEARCH_INCLUDES = YES
# The INCLUDE_PATH tag can be used to specify one or more directories that
# contain include files that are not input files but should be processed by the
# preprocessor.
# This tag requires that the tag SEARCH_INCLUDES is set to YES.
INCLUDE_PATH =
# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
# patterns (like *.h and *.hpp) to filter out the header-files in the
# directories. If left blank, the patterns specified with FILE_PATTERNS will be
# used.
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
INCLUDE_FILE_PATTERNS =
# The PREDEFINED tag can be used to specify one or more macro names that are
# defined before the preprocessor is started (similar to the -D option of e.g.
# gcc). The argument of the tag is a list of macros of the form: name or
# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
# is assumed. To prevent a macro definition from being undefined via #undef or
# recursively expanded use the := operator instead of the = operator.
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
PREDEFINED =
# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
# tag can be used to specify a list of macro names that should be expanded. The
# macro definition that is found in the sources will be used. Use the PREDEFINED
# tag if you want to use a different macro definition that overrules the
# definition found in the source code.
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
EXPAND_AS_DEFINED =
# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
# remove all references to function-like macros that are alone on a line, have
# an all uppercase name, and do not end with a semicolon. Such function macros
# are typically used for boiler-plate code, and will confuse the parser if not
# removed.
# The default value is: YES.
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
SKIP_FUNCTION_MACROS = YES
#---------------------------------------------------------------------------
# Configuration options related to external references
#---------------------------------------------------------------------------
# The TAGFILES tag can be used to specify one or more tag files. For each tag
# file the location of the external documentation should be added. The format of
# a tag file without this location is as follows:
# TAGFILES = file1 file2 ...
# Adding location for the tag files is done as follows:
# TAGFILES = file1=loc1 "file2 = loc2" ...
# where loc1 and loc2 can be relative or absolute paths or URLs. See the
# section "Linking to external documentation" for more information about the use
# of tag files.
# Note: Each tag file must have a unique name (where the name does NOT include
# the path). If a tag file is not located in the directory in which doxygen is
# run, you must also specify the path to the tagfile here.
TAGFILES =
# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
# tag file that is based on the input files it reads. See section "Linking to
# external documentation" for more information about the usage of tag files.
GENERATE_TAGFILE =
# If the ALLEXTERNALS tag is set to YES, all external class will be listed in
# the class index. If set to NO, only the inherited external classes will be
# listed.
# The default value is: NO.
ALLEXTERNALS = NO
# If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed
# in the modules index. If set to NO, only the current project's groups will be
# listed.
# The default value is: YES.
EXTERNAL_GROUPS = YES
# If the EXTERNAL_PAGES tag is set to YES, all external pages will be listed in
# the related pages index. If set to NO, only the current project's pages will
# be listed.
# The default value is: YES.
EXTERNAL_PAGES = YES
# The PERL_PATH should be the absolute path and name of the perl script
# interpreter (i.e. the result of 'which perl').
# The default file (with absolute path) is: /usr/bin/perl.
PERL_PATH = /usr/bin/perl
#---------------------------------------------------------------------------
# Configuration options related to the dot tool
#---------------------------------------------------------------------------
# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram
# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
# NO turns the diagrams off. Note that this option also works with HAVE_DOT
# disabled, but it is recommended to install and use dot, since it yields more
# powerful graphs.
# The default value is: YES.
CLASS_DIAGRAMS = YES
# You can define message sequence charts within doxygen comments using the \msc
# command. Doxygen will then run the mscgen tool (see:
# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
# documentation. The MSCGEN_PATH tag allows you to specify the directory where
# the mscgen tool resides. If left empty the tool is assumed to be found in the
# default search path.
MSCGEN_PATH =
# You can include diagrams made with dia in doxygen documentation. Doxygen will
# then run dia to produce the diagram and insert it in the documentation. The
# DIA_PATH tag allows you to specify the directory where the dia binary resides.
# If left empty dia is assumed to be found in the default search path.
DIA_PATH =
# If set to YES the inheritance and collaboration graphs will hide inheritance
# and usage relations if the target is undocumented or is not a class.
# The default value is: YES.
HIDE_UNDOC_RELATIONS = YES
# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
# available from the path. This tool is part of Graphviz (see:
# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
# Bell Labs. The other options in this section have no effect if this option is
# set to NO
# The default value is: NO.
HAVE_DOT = NO
# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
# to run in parallel. When set to 0 doxygen will base this on the number of
# processors available in the system. You can set it explicitly to a value
# larger than 0 to get control over the balance between CPU load and processing
# speed.
# Minimum value: 0, maximum value: 32, default value: 0.
# This tag requires that the tag HAVE_DOT is set to YES.
DOT_NUM_THREADS = 0
# When you want a differently looking font in the dot files that doxygen
# generates you can specify the font name using DOT_FONTNAME. You need to make
# sure dot is able to find the font, which can be done by putting it in a
# standard location or by setting the DOTFONTPATH environment variable or by
# setting DOT_FONTPATH to the directory containing the font.
# The default value is: Helvetica.
# This tag requires that the tag HAVE_DOT is set to YES.
DOT_FONTNAME = Helvetica
# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
# dot graphs.
# Minimum value: 4, maximum value: 24, default value: 10.
# This tag requires that the tag HAVE_DOT is set to YES.
DOT_FONTSIZE = 10
# By default doxygen will tell dot to use the default font as specified with
# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
# the path where dot can find it using this tag.
# This tag requires that the tag HAVE_DOT is set to YES.
DOT_FONTPATH =
# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
# each documented class showing the direct and indirect inheritance relations.
# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
# The default value is: YES.
# This tag requires that the tag HAVE_DOT is set to YES.
CLASS_GRAPH = YES
# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
# graph for each documented class showing the direct and indirect implementation
# dependencies (inheritance, containment, and class references variables) of the
# class with other documented classes.
# The default value is: YES.
# This tag requires that the tag HAVE_DOT is set to YES.
COLLABORATION_GRAPH = YES
# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
# groups, showing the direct groups dependencies.
# The default value is: YES.
# This tag requires that the tag HAVE_DOT is set to YES.
GROUP_GRAPHS = YES
# If the UML_LOOK tag is set to YES, doxygen will generate inheritance and
# collaboration diagrams in a style similar to the OMG's Unified Modeling
# Language.
# The default value is: NO.
# This tag requires that the tag HAVE_DOT is set to YES.
UML_LOOK = NO
# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
# class node. If there are many fields or methods and many nodes the graph may
# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
# number of items for each type to make the size more manageable. Set this to 0
# for no limit. Note that the threshold may be exceeded by 50% before the limit
# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
# but if the number exceeds 15, the total amount of fields shown is limited to
# 10.
# Minimum value: 0, maximum value: 100, default value: 10.
# This tag requires that the tag HAVE_DOT is set to YES.
UML_LIMIT_NUM_FIELDS = 10
# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
# collaboration graphs will show the relations between templates and their
# instances.
# The default value is: NO.
# This tag requires that the tag HAVE_DOT is set to YES.
TEMPLATE_RELATIONS = NO
# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
# YES then doxygen will generate a graph for each documented file showing the
# direct and indirect include dependencies of the file with other documented
# files.
# The default value is: YES.
# This tag requires that the tag HAVE_DOT is set to YES.
INCLUDE_GRAPH = YES
# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
# set to YES then doxygen will generate a graph for each documented file showing
# the direct and indirect include dependencies of the file with other documented
# files.
# The default value is: YES.
# This tag requires that the tag HAVE_DOT is set to YES.
INCLUDED_BY_GRAPH = YES
# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
# dependency graph for every global function or class method.
#
# Note that enabling this option will significantly increase the time of a run.
# So in most cases it will be better to enable call graphs for selected
# functions only using the \callgraph command. Disabling a call graph can be
# accomplished by means of the command \hidecallgraph.
# The default value is: NO.
# This tag requires that the tag HAVE_DOT is set to YES.
CALL_GRAPH = NO
# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
# dependency graph for every global function or class method.
#
# Note that enabling this option will significantly increase the time of a run.
# So in most cases it will be better to enable caller graphs for selected
# functions only using the \callergraph command. Disabling a caller graph can be
# accomplished by means of the command \hidecallergraph.
# The default value is: NO.
# This tag requires that the tag HAVE_DOT is set to YES.
CALLER_GRAPH = NO
# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will show a
# graphical hierarchy of all classes instead of a textual one.
# The default value is: YES.
# This tag requires that the tag HAVE_DOT is set to YES.
GRAPHICAL_HIERARCHY = YES
# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
# dependencies a directory has on other directories in a graphical way. The
# dependency relations are determined by the #include relations between the
# files in the directories.
# The default value is: YES.
# This tag requires that the tag HAVE_DOT is set to YES.
DIRECTORY_GRAPH = YES
# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
# generated by dot. For an explanation of the image formats see the section
# output formats in the documentation of the dot tool (Graphviz (see:
# http://www.graphviz.org/)).
# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
# to make the SVG files visible in IE 9+ (other browsers do not have this
# requirement).
# Possible values are: png, jpg, gif, svg, png:gd, png:gd:gd, png:cairo,
# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
# png:gdiplus:gdiplus.
# The default value is: png.
# This tag requires that the tag HAVE_DOT is set to YES.
DOT_IMAGE_FORMAT = png
# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
# enable generation of interactive SVG images that allow zooming and panning.
#
# Note that this requires a modern browser other than Internet Explorer. Tested
# and working are Firefox, Chrome, Safari, and Opera.
# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
# the SVG files visible. Older versions of IE do not have SVG support.
# The default value is: NO.
# This tag requires that the tag HAVE_DOT is set to YES.
INTERACTIVE_SVG = NO
# The DOT_PATH tag can be used to specify the path where the dot tool can be
# found. If left blank, it is assumed the dot tool can be found in the path.
# This tag requires that the tag HAVE_DOT is set to YES.
DOT_PATH =
# The DOTFILE_DIRS tag can be used to specify one or more directories that
# contain dot files that are included in the documentation (see the \dotfile
# command).
# This tag requires that the tag HAVE_DOT is set to YES.
DOTFILE_DIRS =
# The MSCFILE_DIRS tag can be used to specify one or more directories that
# contain msc files that are included in the documentation (see the \mscfile
# command).
MSCFILE_DIRS =
# The DIAFILE_DIRS tag can be used to specify one or more directories that
# contain dia files that are included in the documentation (see the \diafile
# command).
DIAFILE_DIRS =
# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
# path where java can find the plantuml.jar file. If left blank, it is assumed
# PlantUML is not used or called during a preprocessing step. Doxygen will
# generate a warning when it encounters a \startuml command in this case and
# will not generate output for the diagram.
PLANTUML_JAR_PATH =
# When using plantuml, the specified paths are searched for files specified by
# the !include statement in a plantuml block.
PLANTUML_INCLUDE_PATH =
# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
# that will be shown in the graph. If the number of nodes in a graph becomes
# larger than this value, doxygen will truncate the graph, which is visualized
# by representing a node as a red box. Note that if the number of direct
# children of the root node in a graph is already larger than
# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that
# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
# Minimum value: 0, maximum value: 10000, default value: 50.
# This tag requires that the tag HAVE_DOT is set to YES.
DOT_GRAPH_MAX_NODES = 50
# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
# generated by dot. A depth value of 3 means that only nodes reachable from the
# root by following a path via at most 3 edges will be shown. Nodes that lay
# further from the root node will be omitted. Note that setting this option to 1
# or 2 may greatly reduce the computation time needed for large code bases. Also
# note that the size of a graph can be further restricted by
# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
# Minimum value: 0, maximum value: 1000, default value: 0.
# This tag requires that the tag HAVE_DOT is set to YES.
MAX_DOT_GRAPH_DEPTH = 0
# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
# background. This is disabled by default, because dot on Windows does not seem
# to support this out of the box.
#
# Warning: Depending on the platform used, enabling this option may lead to
# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
# read).
# The default value is: NO.
# This tag requires that the tag HAVE_DOT is set to YES.
DOT_TRANSPARENT = NO
# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
# files in one run (i.e. multiple -o and -T options on the command line). This
# makes dot run faster, but since only newer versions of dot (>1.8.10) support
# this, this feature is disabled by default.
# The default value is: NO.
# This tag requires that the tag HAVE_DOT is set to YES.
DOT_MULTI_TARGETS = NO
# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
# explaining the meaning of the various boxes and arrows in the dot generated
# graphs.
# The default value is: YES.
# This tag requires that the tag HAVE_DOT is set to YES.
GENERATE_LEGEND = YES
# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
# files that are used to generate the various graphs.
# The default value is: YES.
# This tag requires that the tag HAVE_DOT is set to YES.
DOT_CLEANUP = YES
......@@ -23,21 +23,7 @@ AutoStructify = transform.AutoStructify
# documentation root, use os.path.abspath to make it absolute, like shown here.
sys.path.insert(0, '@PROJ_ROOT@/python')
templates_path = ["@PROJ_ROOT@/doc/templates"]
# -- Doxygen Settings
breathe_projects = {
'paddle': '@PADDLE_DOXYGEN_OUTPUT@/xml'
}
breathe_default_project = 'paddle'
breathe_domain_by_extension = {
'h': 'cpp', # mapping XXX.h XXX.cpp together
}
breathe_default_members = {
'protected-members','undoc-members'
}
templates_path = ["@PROJ_ROOT@/doc_theme/templates"]
# -- General configuration ------------------------------------------------
......@@ -62,7 +48,6 @@ extensions = [
'sphinx.ext.autosummary',
'sphinx.ext.mathjax',
'sphinx.ext.napoleon',
]
......@@ -128,13 +113,12 @@ todo_include_todos = False
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = 'sphinx_rtd_theme'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['@PROJ_ROOT@/doc_theme/static']
# Output file base name for HTML help builder.
htmlhelp_basename = project + 'doc'
......
PaddlePaddle in Docker Containers
=================================

Docker containers are currently the only officially-supported way to
run PaddlePaddle. This is reasonable as Docker now runs on all major
operating systems including Linux, Mac OS X, and Windows. Please be
aware that you will need to change `Docker's settings
<https://github.com/PaddlePaddle/Paddle/issues/627>`_ to make full use
of your hardware resources on Mac OS X and Windows.

CPU-only and GPU Images
-----------------------

For each version of PaddlePaddle, we release two Docker images, a
CPU-only one and a CUDA GPU one. We do so by configuring
`dockerhub.com <https://hub.docker.com/r/paddledev/paddle/>`_ to
automatically run the following commands:

.. code-block:: bash

   docker build -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile .
   docker build -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu .

To run the CPU-only image as an interactive container:

.. code-block:: bash

    docker run -it --rm paddledev/paddle:cpu-latest /bin/bash

or, we can run it as a daemon container:

.. code-block:: bash

    docker run -d -p 2202:22 paddledev/paddle:cpu-latest

and SSH to this container using password :code:`root`:

.. code-block:: bash

    ssh -p 2202 root@localhost

An advantage of using SSH is that we can connect to PaddlePaddle from
more than one terminal. For example, one terminal can run vi while
another runs a Python interpreter. Another advantage is that we can
run the PaddlePaddle container on a remote server and SSH to it from
a laptop.

The above methods work with the GPU image too -- just don't forget to
install the CUDA driver and let Docker know about it:

.. code-block:: bash

    export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
    export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
    docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:gpu-latest

Non-AVX Images
--------------

Please be aware that the CPU-only and the GPU images both use the AVX
instruction set, but old computers produced before 2008 do not support
AVX. The following command checks if your Linux computer supports
AVX:

.. code-block:: bash

   if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi

If it doesn't, we will need to build non-AVX images manually from
source code:

.. code-block:: bash

   cd ~
   git clone https://github.com/PaddlePaddle/Paddle
   cd Paddle
   docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile .
   docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu .
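After building, a quick sanity check is to start a shell in the freshly built image (a usage sketch; the :code:`paddle:cpu-noavx` tag comes from the build commands above):

.. code-block:: bash

    # Open an interactive shell in the locally-built non-AVX CPU image;
    # --rm removes the container on exit.
    docker run -it --rm paddle:cpu-noavx /bin/bash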
......@@ -143,7 +143,7 @@ It looks like there are a lot of arguments. However, most of them are for develo
</tr>
<tr>
<td class="left" rowspan = "2">testing during training</td><td class="left">test_all_data_in_one_period</td>
<td class="left" rowspan = "2">testing during training</td><td class="left">test_period</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
......
......@@ -31,7 +31,7 @@
- type: string (default: null).
* `--version`
- Whether to print version information.
- type: bool (default: 0).
* `--show_layer_stat`
......@@ -110,8 +110,8 @@
- type: int32 (default: -1).
* `--test_period`
- If 0, run testing on all test data at the end of each pass; otherwise, run testing on all test data every test_period batches.
- type: int32 (default: 0).
* `--test_wait`
- Whether to wait for the parameters of each pass if they do not exist yet. If test_data_path is set in the cluster submitting environment, one process will be launched to perform testing, so test_wait=1 is needed. Note that in the cluster submitting environment, this argument is set to true by default.
......@@ -121,10 +121,6 @@
- File that saves the model list for testing. It is set automatically in the cluster submitting environment once model_path has been set.
- type: string (default: "", null).
* `--predict_output_dir`
- Directory that saves the layer outputs, configured via Outputs() in the network config. By default this argument is null, meaning nothing is saved. Specify this directory if you want to save the feature maps of some layers in testing mode. Note that layer outputs are the values after the activation function.
- type: string (default: "", null).
......
......@@ -10,9 +10,8 @@ paddle train \
--config=network_config \
--save_dir=output \
--trainer_count=COUNT \ #(default:1)
--test_period=M \ #(default:0)
--num_passes=N \ #(default:100)
--log_period=K \ #(default:100)
--dot_period=1000 \ #(default:1)
#[--show_parameter_stats_period=100] \ #(default:0)
......
Building the PaddlePaddle Docker Image
======================================

The sources for building the PaddlePaddle Docker image live in the :code:`${source_root}/paddle/scripts/docker/` directory. The image is based on Ubuntu 14.04. The directory contains two files, Dockerfile and build.sh:

* Dockerfile is the main description file of the Docker image; it describes the image's build steps, build arguments, maintainers, and so on.
* build.sh carries out the main build steps of the Docker image.

Building this image has been verified with Docker 1.12; versions below 1.12 are untested, mainly because older versions of Docker may lack the :code:`--build-arg` option and therefore cannot accept arguments when running the build command.

The build script also takes unstable networks into account: for the CUDA Toolkit it supports resuming interrupted downloads and restarting downloads whose transfer speed drops too low.

Building the PaddlePaddle Docker Image with the Script
------------------------------------------------------

To use the script, enter the source directory and run the :code:`docker build .` command. The configuration parameters that can be passed via :code:`--build-arg` include:

* LOWEST\_DL\_SPEED\: the minimum download speed of a thread during multi-threaded downloads (the default unit is bytes; units such as 10K, 10M, or 10G may also be given). A download thread slower than this speed is closed; when all download threads are closed, the download process restarts.
* WITH\_GPU\: ON or OFF, whether to enable GPU support. Note that building the GPU version of PaddlePaddle does not have to be done on a machine with a GPU, but running the GPU version of PaddlePaddle must be done on a machine with CUDA.

A simple usage example:

.. code-block:: bash

    cd ${source_root}/paddle/scripts/docker/
    docker build --build-arg LOWEST_DL_SPEED=50K \
                 --build-arg WITH_GPU=ON \
                 --tag paddle_gpu:latest .

This builds the PaddlePaddle image locally.
......@@ -22,7 +22,7 @@ AutoStructify = transform.AutoStructify
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
sys.path.insert(0, '@PROJ_ROOT@/python')
templates_path = ["@PROJ_ROOT@/doc/templates"]
templates_path = ["@PROJ_ROOT@/doc_theme/templates"]
# -- General configuration ------------------------------------------------
......@@ -112,12 +112,12 @@ todo_include_todos = False
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = 'sphinx_rtd_theme'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['@PROJ_ROOT@/doc_theme/static']
# Output file base name for HTML help builder.
htmlhelp_basename = project + 'doc'
......
......@@ -202,3 +202,53 @@ PaddlePaddle uses the name :code:`name` as a parameter's ID; parameters with the same name
The solution is:

* Uninstall the PaddlePaddle package with :code:`pip uninstall paddle` to clean out stale installations, so that the unit tests run in a clean environment. If a PaddlePaddle package is already in Python's site-packages, the unit tests will import the package from site-packages rather than the one under the :code:`/python` directory of the source tree. Setting :code:`PYTHONPATH` to :code:`/python` does not help either, because Python's search path gives priority to already-installed packages.
9. CMake source build: the PythonLibs and PythonInterp versions found do not match
----------------------------------------------------------------------------------

This is a flaw in CMake's current logic for locating Python: if multiple Python versions are installed on the system, the Python library and the Python interpreter that CMake finds may not match, which makes the PaddlePaddle build fail. The correct fix is to force a specific Python version explicitly, as follows:
.. code-block:: bash
cmake .. -DPYTHON_EXECUTABLE=<exc_path> -DPYTHON_LIBRARY=<lib_path> -DPYTHON_INCLUDE_DIR=<inc_path>
Here you need to supply the Python paths on your machine: ``<exc_path>``, ``<lib_path>``, and ``<inc_path>``.
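For example, on a typical Ubuntu machine with a system Python 2.7 the call might look like the block below; the concrete paths are illustrative assumptions, so verify the actual locations on your machine first:

.. code-block:: bash

    # Hypothetical paths for a system Python 2.7; check them with
    # `which python2.7` and `python2.7-config --includes` before use.
    cmake .. -DPYTHON_EXECUTABLE=/usr/bin/python2.7 \
             -DPYTHON_LIBRARY=/usr/lib/x86_64-linux-gnu/libpython2.7.so \
             -DPYTHON_INCLUDE_DIR=/usr/include/python2.7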
10. A protocol message was rejected because it was too big
----------------------------------------------------------
If the following error appears while training NLP-related models:
.. code-block:: bash
[libprotobuf ERROR google/protobuf/io/coded_stream.cc:171] A protocol message was rejected because it was too big (more than 67108864 bytes). To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h.
F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr)
A likely cause is that one of the args passed to the data provider is too large, usually because a big dictionary is passed directly. An incorrect define_py_data_sources2 call looks like:
.. code-block:: python
    src_dict = dict()
    for line_count, line in enumerate(open(src_dict_path, "r")):
        src_dict[line.strip()] = line_count

    define_py_data_sources2(
        train_list,
        test_list,
        module="dataprovider",
        obj="process",
        args={"src_dict": src_dict})
The solution is to pass the dictionary's file path as args to the data provider, and load the dictionary inside the data provider from that path. That is, define_py_data_sources2 should be changed to:
.. code-block:: python
    define_py_data_sources2(
        train_list,
        test_list,
        module="dataprovider",
        obj="process",
        args={"src_dict_path": src_dict_path})
See the `seqToseq <https://github.com/PaddlePaddle/Paddle/tree/develop/demo/seqToseq>`_ demo for the complete source code; a minimal sketch of this pattern follows.
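The sketch below shows how the data provider itself might load the dictionary; it assumes the PyDataProvider2 :code:`init_hook` mechanism, and the name :code:`on_init` and the unknown-word handling are illustrative rather than taken from the demo:

.. code-block:: python

    # dataprovider.py -- an illustrative sketch, not the actual seqToseq code.
    from paddle.trainer.PyDataProvider2 import provider, integer_value_sequence

    def on_init(settings, src_dict_path, **kwargs):
        # Build the dictionary here, from the path received via `args`, so
        # only a short string travels through the serialized trainer config.
        settings.src_dict = {}
        for line_count, line in enumerate(open(src_dict_path, "r")):
            settings.src_dict[line.strip()] = line_count
        settings.input_types = [integer_value_sequence(len(settings.src_dict))]

    @provider(init_hook=on_init)
    def process(settings, file_name):
        with open(file_name, "r") as f:
            for line in f:
                # Map unknown words to id 0 in this sketch.
                yield [settings.src_dict.get(w, 0) for w in line.strip().split()]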
Building PaddlePaddle Docker Images
===================================

The sources for building PaddlePaddle Docker images live in the ``${source_root}/paddle/scripts/docker/`` directory, which contains three kinds of files:

- Dockerfile: the description file of a Docker image, covering the build steps, build arguments, maintainers, and so on.

  - Twelve Dockerfiles are maintained in total; Dockerfile.m4 is the template they are generated from.
  - All PaddlePaddle images are based on Ubuntu 14.04.

- build.sh: the build script for the Docker images; its usage is described in the next section.
- generate.sh: generates the different Dockerfiles from the Dockerfile.m4 template.

Building a Docker Image with the Script
---------------------------------------

Enter the source directory and run the ``docker build`` command to build a PaddlePaddle image locally. A simple example:

.. code-block:: bash

    cd ${source_root}/paddle/scripts/docker/
    docker build --build-arg LOWEST_DL_SPEED=50K \
                 --build-arg WITH_GPU=ON \
                 --tag paddle_gpu:latest .

The configuration parameters passed via ``--build-arg`` include:

- LOWEST\_DL\_SPEED\: the minimum speed of a download thread during multi-threaded downloads.

  - The default unit is bytes, but units such as 10K, 10M, or 10G may also be given.
  - A thread slower than this speed is closed; once all threads are closed, the download process restarts.

- WITH\_GPU\: ON or OFF, whether to enable GPU support. Note that

  - **building** the GPU version of PaddlePaddle does **not** have to be done on a machine with a GPU;
  - **running** the GPU version of PaddlePaddle **must** be done on a machine with a GPU.

Note: building all of the images has been verified with Docker 1.12; versions below 1.12 are untested, since older versions may lack the ``--build-arg`` option and therefore cannot accept arguments when running the build command.
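After a successful build, a quick sanity check is to start a shell in the freshly built image (a usage sketch; the ``paddle_gpu:latest`` tag comes from the example above):

.. code-block:: bash

    # Open an interactive shell in the image built above; --rm removes the
    # container on exit.
    docker run -it --rm paddle_gpu:latest /bin/bash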
......@@ -16,6 +16,7 @@ PaddlePaddle Documentation
--------
* `Writing New Layers <../doc/dev/new_layer/index.html>`_
* `How to Contribute Documentation <howto/how_to_write_docs/index.html>`_
* `How to Build Docker Images <howto/build_docker_image.html>`_
Algorithm Tutorials
--------
......
body {
padding-top: 80px;
background-image: none !important;
font-family: Roboto;
}
a, a:focus, a:hover, a:visited {
color: #597cf1;
}
.site-header {
position: fixed;
top: 0;
width: 100%;
left: 0;
z-index: 99;
background: #333;
height: 80px;
display: -webkit-flex;
display: -ms-flex;
display: -o-flex;
display: flex;
flex-flow: row nowrap;
justify-content: space-between;
box-shadow: #ccc 0 3px 3px;
}
.site-header > div {
height: 80px;
display: inline-block;
background-color: #2f323a;
padding: 0 30px;
}
.site-header .site-logo {
line-height: 80px;
width: 290px;
flex: 0 1 290px;
}
.site-header .site-logo > a {
display: inline-block;
width: 230px;
}
.site-header .site-nav-links {
flex: 0 1 100%;
}
.site-header .site-nav-links .site-menu {
height: 30px;
line-height: 30px;
font-size: 12px;
background: -webkit-linear-gradient(#282b33, #2f323a);
background: -o-linear-gradient(#282b33, #2f323a);
background: -moz-linear-gradient(#282b33, #2f323a);
background: linear-gradient(to left, #282b33, #2f323a);
margin-right: -30px;
padding-right: 30px;
}
.site-header .site-nav-links .site-menu .site-page-links {
display: inline-block;
float: right;
margin-right: 20px;
}
.site-header .site-nav-links .site-menu .site-page-links> li {
display: inline-block;
float: left;
}
.site-header .site-nav-links .site-menu .site-page-links > li > a {
color: #a7adbd;
display: inline-block;
height: 30px;
padding: 0 20px;
font-size: 12px;
}
.site-header .site-nav-links .site-menu .site-page-links > li:hover > a,
.site-header .site-nav-links .site-menu .site-page-links > li.active > a {
background-color: #2f323a;
color: #bcc1d0;
}
.site-header .site-nav-links .site-menu .site-page-links > li.active > a {
font-weight: bold;
}
.site-header .site-nav-links .site-menu .fork-on-github {
color: #597cf1;
line-height: 30px;
display: inline-block;
padding: 0 0 0 20px;
float: right;
position: relative;
}
.site-header .site-nav-links .site-menu .fork-on-github .fa {
margin-right: 5px;
font-size: 16px;
vertical-align: middle;
}
.site-header .site-nav-links .site-menu .language-switcher {
height: 30px;
display: inline-block;
float: right;
line-height: 30px;
padding: 0 20px;
position: relative;
}
.site-header .site-nav-links .site-menu .language-switcher > a {
color: #a7adbd;
}
.site-header .site-nav-links .site-menu .language-switcher.open > a {
background-color: #24272f;
color: #bcc1d0;
}
.site-header .site-nav-links .site-menu .language-switcher .fa {
margin-left: 5px;
}
.site-header .site-nav-links .site-menu .language-switcher .fa-angle-down {
display: inline;
}
.site-header .site-nav-links .site-menu .language-switcher.open .fa-angle-down {
display: none;
}
.site-header .site-nav-links .site-menu .language-switcher .fa-angle-up {
display: none;
}
.site-header .site-nav-links .site-menu .language-switcher.open .fa-angle-up {
display: inline;
}
.site-header .site-nav-links .site-menu .fork-on-github:before,
.site-header .site-nav-links .site-menu .language-switcher:before {
width: 1px;
height: 12px;
top: 9px;
background-color: #3a3d47;
left: 0;
display: inline-block;
position: absolute;
content: "";
}
.site-header .site-nav-links .site-menu .language-switcher .dropdown-menu {
display: none;
position: absolute;
box-shadow: #ccc 0 0 5px;
background-color: #fff;
width: 100%;
left: 0;
top: 30px;
}
.site-header .site-nav-links .site-menu .language-switcher .dropdown-menu > li {
line-height: 30px;
padding: 0 20px;
}
.site-header .site-nav-links .site-menu .language-switcher .dropdown-menu > li:hover {
background-color: #f7f8fe;
}
.site-header .site-nav-links .site-menu .language-switcher .dropdown-menu > li + li {
border-top: 1px solid #dedfe5;
}
.site-header .site-nav-links .site-menu .language-switcher .dropdown-menu > li > a {
color: #2f323a;
}
.site-header .site-nav-links .site-menu .language-switcher.open .dropdown-menu {
display: inline-block;
}
.site-header .site-nav-links .doc-module {
display: block;
height: 50px;
line-height: 50px;
}
.site-header .site-nav-links .doc-module > ul > li {
display: inline-block;
float: left;
}
.site-header .site-nav-links .doc-module > ul > li > a {
color: #c9cbd0;
font-size: 14px;
display: inline-block;
height: 50px;
line-height: 50px;
border-bottom: 2px solid transparent;
padding: 0 20px;
}
.site-header .site-nav-links .doc-module > ul > li:hover > a {
color: #fff;
}
.site-header .site-nav-links .doc-module > ul > li.current > a {
border-bottom-color: #fff;
color: #fff;
}
.site-header .site-nav-links .doc-module [role="search"]{
float: right;
}
.site-header .site-nav-links .doc-module [role="search"] input {
background-color: #3a3d47;
border-radius: 15px;
color: #a7adbd;
border: 1px solid transparent;
padding: 6px 15px;
width: 180px;
box-shadow: none;
transition: all .2s;
-webkit-transition: all .2s;
-moz-transition: all .2s;
-o-transition: all .2s;
background-repeat: no-repeat;
background-position: 145px center;
background-image: url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAACXBIWXMAAA7EAAAOxAGVKw4bAAAKTWlDQ1BQaG90b3Nob3AgSUNDIHByb2ZpbGUAAHjanVN3WJP3Fj7f92UPVkLY8LGXbIEAIiOsCMgQWaIQkgBhhBASQMWFiApWFBURnEhVxILVCkidiOKgKLhnQYqIWotVXDjuH9yntX167+3t+9f7vOec5/zOec8PgBESJpHmomoAOVKFPDrYH49PSMTJvYACFUjgBCAQ5svCZwXFAADwA3l4fnSwP/wBr28AAgBw1S4kEsfh/4O6UCZXACCRAOAiEucLAZBSAMguVMgUAMgYALBTs2QKAJQAAGx5fEIiAKoNAOz0ST4FANipk9wXANiiHKkIAI0BAJkoRyQCQLsAYFWBUiwCwMIAoKxAIi4EwK4BgFm2MkcCgL0FAHaOWJAPQGAAgJlCLMwAIDgCAEMeE80DIEwDoDDSv+CpX3CFuEgBAMDLlc2XS9IzFLiV0Bp38vDg4iHiwmyxQmEXKRBmCeQinJebIxNI5wNMzgwAABr50cH+OD+Q5+bk4eZm52zv9MWi/mvwbyI+IfHf/ryMAgQAEE7P79pf5eXWA3DHAbB1v2upWwDaVgBo3/ldM9sJoFoK0Hr5i3k4/EAenqFQyDwdHAoLC+0lYqG9MOOLPv8z4W/gi372/EAe/tt68ABxmkCZrcCjg/1xYW52rlKO58sEQjFu9+cj/seFf/2OKdHiNLFcLBWK8ViJuFAiTcd5uVKRRCHJleIS6X8y8R+W/QmTdw0ArIZPwE62B7XLbMB+7gECiw5Y0nYAQH7zLYwaC5EAEGc0Mnn3AACTv/mPQCsBAM2XpOMAALzoGFyolBdMxggAAESggSqwQQcMwRSswA6cwR28wBcCYQZEQAwkwDwQQgbkgBwKoRiWQRlUwDrYBLWwAxqgEZrhELTBMTgN5+ASXIHrcBcGYBiewhi8hgkEQcgIE2EhOogRYo7YIs4IF5mOBCJhSDSSgKQg6YgUUSLFyHKkAqlCapFdSCPyLXIUOY1cQPqQ28ggMor8irxHMZSBslED1AJ1QLmoHxqKxqBz0XQ0D12AlqJr0Rq0Hj2AtqKn0UvodXQAfYqOY4DRMQ5mjNlhXIyHRWCJWBomxxZj5Vg1Vo81Yx1YN3YVG8CeYe8IJAKLgBPsCF6EEMJsgpCQR1hMWEOoJewjtBK6CFcJg4Qxwicik6hPtCV6EvnEeGI6sZBYRqwm7iEeIZ4lXicOE1+TSCQOyZLkTgohJZAySQtJa0jbSC2kU6Q+0hBpnEwm65Btyd7kCLKArCCXkbeQD5BPkvvJw+S3FDrFiOJMCaIkUqSUEko1ZT/lBKWfMkKZoKpRzame1AiqiDqfWkltoHZQL1OHqRM0dZolzZsWQ8ukLaPV0JppZ2n3aC/pdLoJ3YMeRZfQl9Jr6Afp5+mD9HcMDYYNg8dIYigZaxl7GacYtxkvmUymBdOXmchUMNcyG5lnmA+Yb1VYKvYqfBWRyhKVOpVWlX6V56pUVXNVP9V5qgtUq1UPq15WfaZGVbNQ46kJ1Bar1akdVbupNq7OUndSj1DPUV+jvl/9gvpjDbKGhUaghkijVGO3xhmNIRbGMmXxWELWclYD6yxrmE1iW7L57Ex2Bfsbdi97TFNDc6pmrGaRZp3mcc0BDsax4PA52ZxKziHODc57LQMtPy2x1mqtZq1+rTfaetq+2mLtcu0W7eva73VwnUCdLJ31Om0693UJuja6UbqFutt1z+o+02PreekJ9cr1Dund0Uf1bfSj9Rfq79bv0R83MDQINpAZbDE4Y/DMkGPoa5hpuNHwhOGoEctoupHEaKPRSaMnuCbuh2fjNXgXPmasbxxirDTeZdxrPGFiaTLbpMSkxeS+Kc2Ua5pmutG003TMzMgs3KzYrMnsjjnVnGueYb7ZvNv8jYWlRZzFSos2i8eW2pZ8ywWWTZb3rJhWPlZ5VvVW16xJ1lzrLOtt1ldsUBtXmwybOpvLtqitm63Edptt3xTiFI8p0in1U27aMez87ArsmuwG7Tn2YfYl9m32zx3MHBId1jt0O3xydHXMdmxwvOuk4TTDqcSpw+lXZxtnoXOd8zUXpkuQyxKXdpcXU22niqdun3rLleUa7rrStdP1o5u7m9yt2W3U3cw9xX2r+00umxvJXcM970H08PdY4nHM452nm6fC85DnL152Xlle+70eT7OcJp7WMG3I28Rb4L3Le2A6Pj1l+s7pAz7GPgKfep+Hvqa+It89viN+1n6Zfgf8nvs7+sv9j/i/4XnyFvFOBWABwQHlAb2BGoGzA2sDHwSZBKUHNQWNBbsGLww+FUIMCQ1ZH3KTb8AX8hv5YzPcZyya0RXKCJ0VWhv6MMwmTB7WEY6GzwjfEH5vpvlM6cy2CIjgR2yIuB9pGZkX+X0UKSoyqi7qUbRTdHF09yzWrORZ+2e9jvGPqYy5O9tqtnJ2Z6xqbFJsY+ybuIC4qriBeIf4RfGXEnQTJAntieTE2MQ9ieNzAudsmjOc5JpUlnRjruXcorkX5unOy553PFk1WZB8OIWYEpeyP+WDIEJQLxhP5aduTR0T8oSbhU9FvqKNolGxt7hKPJLmnVaV9jjdO31D+miGT0Z1xjMJT1IreZEZkrkj801WRNberM/ZcdktOZSclJyjUg1plrQr1zC3KLdPZisrkw3keeZtyhuTh8r35CP5c/PbFWyFTNGjtFKuUA4WTC+oK3hbGFt4uEi9SFrUM99m/ur5IwuCFny9kLBQuLCz2Lh4WfHgIr9FuxYji1MXdy4xXVK6ZHhp8NJ9y2jLspb9UOJYUlXyannc8o5Sg9KlpUMrglc0lamUycturvRauWMVYZVkVe9ql9VbVn8qF5VfrHCsqK74sEa45uJXTl/VfPV5bdra3kq3yu3rSOuk626s91m/r0q9akHV0IbwDa0b8Y3lG19tSt50oXpq9Y7NtM3KzQM1YTXtW8y2rNvyoTaj9nqdf13LVv2tq7e+2Sba1r/dd3vzDoMdFTve75TsvLUreFdrvUV99W7S7oLdjxpiG7q/5n7duEd3T8Wej3ulewf2Re/ranRvbNyvv7+yCW1SNo0eSDpw5ZuAb9qb7Zp3tXBaKg7CQeXBJ9+mfHvjUOihzsPcw83fmX+39QjrSHkr0jq/dawto22gPaG97+iMo50dXh1Hvrf/fu8x42N1xzWPV56gnSg98fnkgpPjp2Snnp1OPz3Umdx590z8mWtdUV29Z0PPnj8XdO5Mt1/3yfPe549d8Lxw9CL3Ytslt0utPa49R35w/eFIr1tv62X3y+1XPK509E3rO9Hv03/6asDVc9f41y5dn3m978bsG7duJt0cuCW69fh29u0XdwruTNxdeo94r/y+2v3qB/oP6n+0/rFlwG3g+GDAYM/DWQ/vDgmHnv6U/9OH4dJHzEfVI0YjjY+dHx8bDRq98mTOk+GnsqcTz8p+Vv9563Or59/94vtLz1
j82PAL+YvPv655qfNy76uprzrHI8cfvM55PfGm/K3O233vuO+638e9H5ko/ED+UPPR+mPHp9BP9z7nfP78L/eE8/sl0p8zAAAAIGNIUk0AAHolAACAgwAA+f8AAIDpAAB1MAAA6mAAADqYAAAXb5JfxUYAAAEpSURBVHjanNO7K8dhFMfx1w8LBqVM5DLxF7hMTGSQpAwmJSkDizAZLSb5Ayi3clsMFgwWISGXkoSyGYRSym15fvr27duvH5/leTqd8+6c83ye1NLatohqMIgWVOEV+5jDAr7ElBO5j+IIH+hBJRqwjDHsoTQOyAvnCPpRi4tYziVmMY2dkPMc7aAG42hPKE7rAwMBNhEfYQgzOJNZ3xhGL4qigGasyk43OEdjFFCGe9nrNtT8Al5Q8AdAMd6jgFPU/QFwiN0oYD4sJzdLwBiuo4A5vGEKqQyF1ahPcuInOsJrrKMiwWx9OMAWWpOc+BD2MImr4Ik7FIb4AzqRH6zdhU1IxT4TlKAJ5XjCMU6CkaANi2lIXsKsj1jJsIsNdKc7yfE/pSGTPwMABBFCGflm+rsAAAAASUVORK5CYII=");
}
.site-header .site-nav-links .doc-module [role="search"] input:focus {
width: 300px;
}
.site-header .site-nav-links .doc-module [role="search"] input:focus {
background-position: 265px center;
}
.site-header .site-nav-links .doc-module [role="search"] input:hover,
.site-header .site-nav-links .doc-module [role="search"] input:focus {
color: #fff;
border-color: #597cf1;
background-image: url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAACXBIWXMAAA7EAAAOxAGVKw4bAAAKTWlDQ1BQaG90b3Nob3AgSUNDIHByb2ZpbGUAAHjanVN3WJP3Fj7f92UPVkLY8LGXbIEAIiOsCMgQWaIQkgBhhBASQMWFiApWFBURnEhVxILVCkidiOKgKLhnQYqIWotVXDjuH9yntX167+3t+9f7vOec5/zOec8PgBESJpHmomoAOVKFPDrYH49PSMTJvYACFUjgBCAQ5svCZwXFAADwA3l4fnSwP/wBr28AAgBw1S4kEsfh/4O6UCZXACCRAOAiEucLAZBSAMguVMgUAMgYALBTs2QKAJQAAGx5fEIiAKoNAOz0ST4FANipk9wXANiiHKkIAI0BAJkoRyQCQLsAYFWBUiwCwMIAoKxAIi4EwK4BgFm2MkcCgL0FAHaOWJAPQGAAgJlCLMwAIDgCAEMeE80DIEwDoDDSv+CpX3CFuEgBAMDLlc2XS9IzFLiV0Bp38vDg4iHiwmyxQmEXKRBmCeQinJebIxNI5wNMzgwAABr50cH+OD+Q5+bk4eZm52zv9MWi/mvwbyI+IfHf/ryMAgQAEE7P79pf5eXWA3DHAbB1v2upWwDaVgBo3/ldM9sJoFoK0Hr5i3k4/EAenqFQyDwdHAoLC+0lYqG9MOOLPv8z4W/gi372/EAe/tt68ABxmkCZrcCjg/1xYW52rlKO58sEQjFu9+cj/seFf/2OKdHiNLFcLBWK8ViJuFAiTcd5uVKRRCHJleIS6X8y8R+W/QmTdw0ArIZPwE62B7XLbMB+7gECiw5Y0nYAQH7zLYwaC5EAEGc0Mnn3AACTv/mPQCsBAM2XpOMAALzoGFyolBdMxggAAESggSqwQQcMwRSswA6cwR28wBcCYQZEQAwkwDwQQgbkgBwKoRiWQRlUwDrYBLWwAxqgEZrhELTBMTgN5+ASXIHrcBcGYBiewhi8hgkEQcgIE2EhOogRYo7YIs4IF5mOBCJhSDSSgKQg6YgUUSLFyHKkAqlCapFdSCPyLXIUOY1cQPqQ28ggMor8irxHMZSBslED1AJ1QLmoHxqKxqBz0XQ0D12AlqJr0Rq0Hj2AtqKn0UvodXQAfYqOY4DRMQ5mjNlhXIyHRWCJWBomxxZj5Vg1Vo81Yx1YN3YVG8CeYe8IJAKLgBPsCF6EEMJsgpCQR1hMWEOoJewjtBK6CFcJg4Qxwicik6hPtCV6EvnEeGI6sZBYRqwm7iEeIZ4lXicOE1+TSCQOyZLkTgohJZAySQtJa0jbSC2kU6Q+0hBpnEwm65Btyd7kCLKArCCXkbeQD5BPkvvJw+S3FDrFiOJMCaIkUqSUEko1ZT/lBKWfMkKZoKpRzame1AiqiDqfWkltoHZQL1OHqRM0dZolzZsWQ8ukLaPV0JppZ2n3aC/pdLoJ3YMeRZfQl9Jr6Afp5+mD9HcMDYYNg8dIYigZaxl7GacYtxkvmUymBdOXmchUMNcyG5lnmA+Yb1VYKvYqfBWRyhKVOpVWlX6V56pUVXNVP9V5qgtUq1UPq15WfaZGVbNQ46kJ1Bar1akdVbupNq7OUndSj1DPUV+jvl/9gvpjDbKGhUaghkijVGO3xhmNIRbGMmXxWELWclYD6yxrmE1iW7L57Ex2Bfsbdi97TFNDc6pmrGaRZp3mcc0BDsax4PA52ZxKziHODc57LQMtPy2x1mqtZq1+rTfaetq+2mLtcu0W7eva73VwnUCdLJ31Om0693UJuja6UbqFutt1z+o+02PreekJ9cr1Dund0Uf1bfSj9Rfq79bv0R83MDQINpAZbDE4Y/DMkGPoa5hpuNHwhOGoEctoupHEaKPRSaMnuCbuh2fjNXgXPmasbxxirDTeZdxrPGFiaTLbpMSkxeS+Kc2Ua5pmutG003TMzMgs3KzYrMnsjjnVnGueYb7ZvNv8jYWlRZzFSos2i8eW2pZ8ywWWTZb3rJhWPlZ5VvVW16xJ1lzrLOtt1ldsUBtXmwybOpvLtqitm63Edptt3xTiFI8p0in1U27aMez87ArsmuwG7Tn2YfYl9m32zx3MHBId1jt0O3xydHXMdmxwvOuk4TTDqcSpw+lXZxtnoXOd8zUXpkuQyxKXdpcXU22niqdun3rLleUa7rrStdP1o5u7m9yt2W3U3cw9xX2r+00umxvJXcM970H08PdY4nHM452nm6fC85DnL152Xlle+70eT7OcJp7WMG3I28Rb4L3Le2A6Pj1l+s7pAz7GPgKfep+Hvqa+It89viN+1n6Zfgf8nvs7+sv9j/i/4XnyFvFOBWABwQHlAb2BGoGzA2sDHwSZBKUHNQWNBbsGLww+FUIMCQ1ZH3KTb8AX8hv5YzPcZyya0RXKCJ0VWhv6MMwmTB7WEY6GzwjfEH5vpvlM6cy2CIjgR2yIuB9pGZkX+X0UKSoyqi7qUbRTdHF09yzWrORZ+2e9jvGPqYy5O9tqtnJ2Z6xqbFJsY+ybuIC4qriBeIf4RfGXEnQTJAntieTE2MQ9ieNzAudsmjOc5JpUlnRjruXcorkX5unOy553PFk1WZB8OIWYEpeyP+WDIEJQLxhP5aduTR0T8oSbhU9FvqKNolGxt7hKPJLmnVaV9jjdO31D+miGT0Z1xjMJT1IreZEZkrkj801WRNberM/ZcdktOZSclJyjUg1plrQr1zC3KLdPZisrkw3keeZtyhuTh8r35CP5c/PbFWyFTNGjtFKuUA4WTC+oK3hbGFt4uEi9SFrUM99m/ur5IwuCFny9kLBQuLCz2Lh4WfHgIr9FuxYji1MXdy4xXVK6ZHhp8NJ9y2jLspb9UOJYUlXyannc8o5Sg9KlpUMrglc0lamUycturvRauWMVYZVkVe9ql9VbVn8qF5VfrHCsqK74sEa45uJXTl/VfPV5bdra3kq3yu3rSOuk626s91m/r0q9akHV0IbwDa0b8Y3lG19tSt50oXpq9Y7NtM3KzQM1YTXtW8y2rNvyoTaj9nqdf13LVv2tq7e+2Sba1r/dd3vzDoMdFTve75TsvLUreFdrvUV99W7S7oLdjxpiG7q/5n7duEd3T8Wej3ulewf2Re/ranRvbNyvv7+yCW1SNo0eSDpw5ZuAb9qb7Zp3tXBaKg7CQeXBJ9+mfHvjUOihzsPcw83fmX+39QjrSHkr0jq/dawto22gPaG97+iMo50dXh1Hvrf/fu8x42N1xzWPV56gnSg98fnkgpPjp2Snnp1OPz3Umdx590z8mWtdUV29Z0PPnj8XdO5Mt1/3yfPe549d8Lxw9CL3Ytslt0utPa49R35w/eFIr1tv62X3y+1XPK509E3rO9Hv03/6asDVc9f41y5dn3m978bsG7duJt0cuCW69fh29u0XdwruTNxdeo94r/y+2v3qB/oP6n+0/rFlwG3g+GDAYM/DWQ/vDgmHnv6U/9OH4dJHzEfVI0YjjY+dHx8bDRq98mTOk+GnsqcTz8p+Vv9563Or59/94vtLz1
j82PAL+YvPv655qfNy76uprzrHI8cfvM55PfGm/K3O233vuO+638e9H5ko/ED+UPPR+mPHp9BP9z7nfP78L/eE8/sl0p8zAAAAIGNIUk0AAHolAACAgwAA+f8AAIDpAAB1MAAA6mAAADqYAAAXb5JfxUYAAAEpSURBVHjanNO9K4ZhFMfxz4MFg1Im8jJ5/gIvExMZJCnFpCRlYBEGGS0m+QMoLwOyGCwyWISEvJQklM0glFLeluvR3d3d08Nvua5O53w751y/K9Uz+SyiNIbRihq8Yh+LWMaXmPIi93Ec4QN9qEYjVjGBPZTHAQXhHMMg6nARy7nEAuawE3Keox2kMYWOhOKMPjAUYNPxEUYwjzPZ9Y1R9KMkCmjButx0g3M0RQEVuJe7bkPNL+AFRX8AlOI9CjhF/R8Ah9iNApbCcvJzBEzgOgpYxBtmkcpSWIuGJCd+ojO8xgaqEsw2gANsoy3JiQ9hDzO4Cp64Q3GIP6ALhcHa3diCVOwzQRmaUYknHOMkGAnasZKBFCTM+oi1LLvYRG+mkzz/UwYy8zMAmkpBg3fGpFUAAAAASUVORK5CYII=");
}
.doc-menu-vertical {
display: inline-block;
float: left;
width: 240px;
height: 100%;
background-color: #ecedee;
position: absolute;
left: 0;
top: 0;
overflow: hidden;
padding: 0;
border-right: 1px solid #dddfe3;
}
.doc-menu-vertical > ul {
display: none;
}
.doc-menu-vertical > ul.current {
display: block;
}
.doc-menu-vertical > ul.current > li.toctree-l1 {
display: none;
}
.doc-menu-vertical > ul.current > li.toctree-l1.current {
display: block;
}
.doc-menu-vertical > ul.current > li.toctree-l1.current > a {
display: none;
}
.doc-menu-vertical .toctree-l2 a {
width: 100%;
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
padding-right: 30px;
}
.doc-menu-vertical .toctree-l2 > a {
font-size: 14px;
color: #2f323a;
padding-left: 30px;
line-height: 50px;
display: block;
font-weight: bold;
border-bottom: 1px solid #dddfe3;
}
.doc-menu-vertical .toctree-l2.has-child > a:after {
font-family: "FontAwesome";
display: inline-block;
font-style: normal;
font-weight: normal;
text-decoration: inherit;
content: "";
float: right;
line-height: 50px;
color: #a7adbd;
position: absolute;
right: 15px;
}
.doc-menu-vertical .toctree-l2.has-child.current > a:after {
content: "";
}
.doc-menu-vertical .toctree-l2 > a + ul {
background-color: #e4e6e9;
height: 0;
overflow: hidden;
}
.doc-menu-vertical .toctree-l2.current > a + ul {
border-bottom: 1px solid #dddfe3;
height: auto;
}
.doc-menu-vertical .toctree-l2 li.active > a {
background-color: #597cf1;
color: #fff;
}
.doc-menu-vertical .toctree-l3 > a {
font-size: 12px;
color: #2f323a;
padding-left: 30px;
line-height: 40px;
display: block;
}
.doc-menu-vertical .toctree-l4 > a {
font-size: 12px;
color: #64697b;
padding-left: 50px;
line-height: 30px;
display: block;
}
.doc-menu-vertical .toctree-l5 > a {
font-size: 14px;
color: #ccc;
padding-left: 40px;
display: block;
}
.local-toc {
position: absolute;
height: 100%;
background-color: #f6f7f8;
top: 0;
left: 240px;
padding: 0;
z-index: 9;
}
.local-toc:after {
content: "";
position: absolute;
height: 100%;
width: 1px;
display: inline-block;
right: 0;
background-color: #dddfe3;
top: 0;
z-index: -1;
}
.local-toc:hover a {
width: auto;
}
.local-toc > ul > li a {
position: relative;
font-size: 12px;
overflow: hidden;
display: none;
}
.local-toc > ul > li > ul > li a {
display: block;
border-top: 1px solid transparent;
border-bottom: 1px solid transparent;
padding-right: 20px;
width: 50px;
}
.local-toc > ul > li > ul > li > ul > li > ul a {
display: none;
}
.local-toc > ul > li > ul li > a:after {
content: "";
display: inline-block;
width: 1px;
height: 100%;
background-color: transparent;
position: absolute;
right: 0;
top: 0;
}
.local-toc > ul > li > ul li a:hover {
background-color: #e6eaf7 !important;
}
.local-toc > ul > li > ul li a:hover:after {
background-color: #e6eaf7 !important;
}
.local-toc > ul > li > ul li.active > a {
color: #ff9711;
background-color: #fff;
border-top: 1px solid #dddfe3;
border-bottom: 1px solid #dddfe3;
}
.local-toc > ul > li > ul li.active > a:before {
background-color: #ff9711;
width: 10px;
height: 10px;
margin: 15px 20px;
border-radius: 5px;
}
.local-toc > ul > li > ul li.active > a:after {
background-color: #fff;
}
.local-toc > ul > li > ul > li {
position: relative;
line-height: 40px;
white-space: nowrap;
}
.local-toc > ul > li > ul > li > a {
color: #64697b;
}
.local-toc > ul > li > ul > li > a + ul {
display: none;
}
.local-toc > ul > li > ul > li > a:before {
display: inline-block;
content: "";
width: 6px;
height: 6px;
background-color: #ccc;
border-radius: 3px;
margin: 17px 22px;
float: left;
}
.local-toc > ul > li > ul > li > ul > li > a {
color: #a7adbd;
}
.local-toc > ul > li > ul > li > ul > li > a:before {
display: inline-block;
content: "";
width: 6px;
height: 6px;
background-color: #ccc;
border-radius: 3px;
margin: 17px 22px;
float: left;
}
.main-content-wrap {
position: absolute;
width: 100%;
top: 80px;
bottom: 0;
overflow: auto;
background-color: #f6f7f8;
}
.doc-content-wrap {
margin-left: 290px;
height: 100%;
position: relative;
padding-top: 60px;
background-color: #fff;
}
.doc-content-wrap > div[role='navigation'] {
position: absolute;
top: 0;
width: 100%;
left: 0;
padding: 0 30px;
height: 60px;
}
.wy-breadcrumbs {
line-height: 50px;
height: 60px;
background-image: url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAAUCAYAAABMDlehAAAAGXRFWHRTb2Z0d2FyZQBBZG9iZSBJbWFnZVJlYWR5ccllPAAAA4ZpVFh0WE1MOmNvbS5hZG9iZS54bXAAAAAAADw/eHBhY2tldCBiZWdpbj0i77u/IiBpZD0iVzVNME1wQ2VoaUh6cmVTek5UY3prYzlkIj8+IDx4OnhtcG1ldGEgeG1sbnM6eD0iYWRvYmU6bnM6bWV0YS8iIHg6eG1wdGs9IkFkb2JlIFhNUCBDb3JlIDUuNS1jMDIxIDc5LjE1NTc3MiwgMjAxNC8wMS8xMy0xOTo0NDowMCAgICAgICAgIj4gPHJkZjpSREYgeG1sbnM6cmRmPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5LzAyLzIyLXJkZi1zeW50YXgtbnMjIj4gPHJkZjpEZXNjcmlwdGlvbiByZGY6YWJvdXQ9IiIgeG1sbnM6eG1wTU09Imh0dHA6Ly9ucy5hZG9iZS5jb20veGFwLzEuMC9tbS8iIHhtbG5zOnN0UmVmPSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvc1R5cGUvUmVzb3VyY2VSZWYjIiB4bWxuczp4bXA9Imh0dHA6Ly9ucy5hZG9iZS5jb20veGFwLzEuMC8iIHhtcE1NOk9yaWdpbmFsRG9jdW1lbnRJRD0ieG1wLmRpZDpjMjhmMGQ3ZC0wODU3LTQ0ZTctOGRhZi00NGU3OTc1ZmM2MzkiIHhtcE1NOkRvY3VtZW50SUQ9InhtcC5kaWQ6NzRBN0NEODRBRTM2MTFFNjlGMDI4RUM3M0VDQzY4NTkiIHhtcE1NOkluc3RhbmNlSUQ9InhtcC5paWQ6NzRBN0NEODNBRTM2MTFFNjlGMDI4RUM3M0VDQzY4NTkiIHhtcDpDcmVhdG9yVG9vbD0iQWRvYmUgUGhvdG9zaG9wIENDIDIwMTQgKE1hY2ludG9zaCkiPiA8eG1wTU06RGVyaXZlZEZyb20gc3RSZWY6aW5zdGFuY2VJRD0ieG1wLmlpZDozNWQwMzI1ZC01ZDAyLTQ1YTYtODUxOS1lNWUzNjU5NGFhMzAiIHN0UmVmOmRvY3VtZW50SUQ9ImFkb2JlOmRvY2lkOnBob3Rvc2hvcDozZGVmZmY0OS1mNjA4LTExNzktYTRlZC1kZjJiNGY3N2YwNzMiLz4gPC9yZGY6RGVzY3JpcHRpb24+IDwvcmRmOlJERj4gPC94OnhtcG1ldGE+IDw/eHBhY2tldCBlbmQ9InIiPz7FGmP1AAAAKUlEQVR42mK4/+DpfwY9Q0tBJgYGhv8g4h8uFoKLEGOAc9FYSARAgAEAUgMQYBNmQ7sAAAAASUVORK5CYII=");
background-repeat: repeat no-repeat;
background-position: center 50px;
}
.wy-breadcrumbs > li {
color: #ccc;
}
.wy-breadcrumbs > li a {
color: #ff9711;
padding: 0;
}
.wy-breadcrumbs > li:first-child a {
color: #597cf1;
}
.wy-nav-content {
max-width: none;
overflow: auto;
position: relative;
padding: 30px;
background-color: #fff;
}
.wy-nav-content h1 {
font-size: 24px;
color: #2f323a;
margin-bottom: 30px;
}
.wy-nav-content h2 {
font-size: 20px;
color: #2f323a;
margin-bottom: 30px;
}
.wy-nav-content h3 {
font-size: 18px;
color: #2f323a;
margin-bottom: 30px;
}
.wy-nav-content h4 {
font-size: 16px;
color: #2f323a;
margin-bottom: 30px;
}
.wy-nav-content p + h1,
.wy-nav-content p + h2,
.wy-nav-content p + h3,
.wy-nav-content p + h4 {
margin-top: 20px;
}
.wy-nav-content p {
color: #2f323a;
margin-bottom: 20px;
font-size: 14px;
}
#search-results h2 {
font-size: 24px;
margin: 20px 0 10px 0;
}
#search-results p {
color: #a7adbd;
}
#search-results ul.search > li {
border-bottom: none;
}
#search-results ul.search > li > a {
color: #597cf1;
}
.rst-content .highlighted {
background-color: transparent;
color: #ff9711;
padding: 0;
}
\ No newline at end of file
$(document).ready(function(){
$('.local-toc').on('click' ,'a.reference.internal', function (){
$('.local-toc li.active').removeClass('active');
$(this).parent('li').addClass('active');
});
if ($('.local-toc a:visible').length) {
$('.local-toc > ul').addClass('nav nav-stacked');
$('#doc-content').scrollspy({
target: '.local-toc'
});
$('.local-toc').perfectScrollbar();
} else {
$('.doc-content-wrap').css('margin-left', '-=50px');
$('.local-toc').remove();
}
if (!$('.doc-menu-vertical > ul > li.current > ul').length) {
$('.doc-content-wrap').css('margin-left', '-=240px');
$('.doc-menu-vertical').remove();
$('.local-toc').css('left', '0');
}
$('.doc-menu-vertical .toctree-l2').each(function (i, e){
$(e).toggleClass('has-child', !!$(e).find('ul').length);
});
$('.doc-menu-vertical').find('li.current').last().addClass('active');
$('.doc-menu-vertical').perfectScrollbar();
});
\ No newline at end of file
{# Support for Sphinx 1.3+ page_source_suffix, but don't break old builds. #}
{% if page_source_suffix %}
{% set suffix = page_source_suffix %}
{% else %}
{% set suffix = source_suffix %}
{% endif %}
{% if meta is defined and 'github_url' in meta %}
{% set display_github = True %}
{% endif %}
{% if meta is defined and 'bitbucket_url' in meta %}
{% set display_bitbucket = True %}
{% endif %}
<div role="navigation" aria-label="breadcrumbs navigation">
<ul class="wy-breadcrumbs">
{% for doc in parents %}
<li><a href="{{ doc.link|e }}">{{ doc.title }}</a> > </li>
{% endfor %}
<li>{{ title }}</li>
</ul>
</div>
{# TEMPLATE VAR SETTINGS #}
{%- set url_root = pathto('', 1) %}
{%- if url_root == '#' %}{% set url_root = '' %}{% endif %}
{%- if not embedded and docstitle %}
{%- set titlesuffix = " &mdash; "|safe + docstitle|e %}
{%- else %}
{%- set titlesuffix = "" %}
{%- endif %}
<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
<head>
<meta charset="utf-8">
{{ metatags }}
<meta name="viewport" content="width=device-width, initial-scale=1.0">
{% block htmltitle %}
<title>{{ title|striptags|e }}{{ titlesuffix }}</title>
{% endblock %}
{# FAVICON #}
{% if favicon %}
<link rel="shortcut icon" href="{{ pathto('_static/' + favicon, 1) }}"/>
{% endif %}
{# CSS #}
{# OPENSEARCH #}
{% if not embedded %}
{% if use_opensearch %}
<link rel="search" type="application/opensearchdescription+xml" title="{% trans docstitle=docstitle|e %}Search within {{ docstitle }}{% endtrans %}" href="{{ pathto('_static/opensearch.xml', 1) }}"/>
{% endif %}
{% endif %}
{# RTD hosts this file, so just load on non RTD builds #}
{% if not READTHEDOCS %}
<link rel="stylesheet" href="{{ pathto('_static/' + style, 1) }}" type="text/css" />
{% endif %}
{% for cssfile in css_files %}
<link rel="stylesheet" href="{{ pathto(cssfile, 1) }}" type="text/css" />
{% endfor %}
{% for cssfile in extra_css_files %}
<link rel="stylesheet" href="{{ pathto(cssfile, 1) }}" type="text/css" />
{% endfor %}
{%- block linktags %}
{%- if hasdoc('about') %}
<link rel="author" title="{{ _('About these documents') }}"
href="{{ pathto('about') }}"/>
{%- endif %}
{%- if hasdoc('genindex') %}
<link rel="index" title="{{ _('Index') }}"
href="{{ pathto('genindex') }}"/>
{%- endif %}
{%- if hasdoc('search') %}
<link rel="search" title="{{ _('Search') }}" href="{{ pathto('search') }}"/>
{%- endif %}
{%- if hasdoc('copyright') %}
<link rel="copyright" title="{{ _('Copyright') }}" href="{{ pathto('copyright') }}"/>
{%- endif %}
<link rel="top" title="{{ docstitle|e }}" href="{{ pathto('index') }}"/>
{%- if parents %}
<link rel="up" title="{{ parents[-1].title|striptags|e }}" href="{{ parents[-1].link|e }}"/>
{%- endif %}
{%- if next %}
<link rel="next" title="{{ next.title|striptags|e }}" href="{{ next.link|e }}"/>
{%- endif %}
{%- if prev %}
<link rel="prev" title="{{ prev.title|striptags|e }}" href="{{ prev.link|e }}"/>
{%- endif %}
{%- endblock %}
{%- block extrahead %}
<link rel="stylesheet" href="https://cdn.jsdelivr.net/perfect-scrollbar/0.6.14/css/perfect-scrollbar.min.css" type="text/css" />
<link rel="stylesheet" href="{{pathto('_static/css/override.css', 1)}}" type="text/css" />
<script>
var _hmt = _hmt || [];
(function() {
var hm = document.createElement("script");
hm.src = "//hm.baidu.com/hm.js?b9a314ab40d04d805655aab1deee08ba";
var s = document.getElementsByTagName("script")[0];
s.parentNode.insertBefore(hm, s);
})();
</script>
{% endblock %}
{# Keep modernizr in head - http://modernizr.com/docs/#installing #}
<script src="{{ pathto('_static/js/modernizr.min.js', 1) }}"></script>
</head>
<body class="wy-body-for-nav" role="document">
{% block extrabody %}
<header class="site-header">
<div class="site-logo">
<a href="/"><img src="{{pathto('_static/images/PP_w.png', 1)}}"></a>
</div>
<div class="site-nav-links">
<div class="site-menu">
<a class="fork-on-github" href="https://github.com/PaddlePaddle/Paddle" target="_blank"><i class="fa fa-github"></i>Folk me on Github</a>
<div class="language-switcher dropdown">
<a type="button" data-toggle="dropdown">
<span>English</span>
<i class="fa fa-angle-up"></i>
<i class="fa fa-angle-down"></i>
</a>
<ul class="dropdown-menu">
<li><a href="/doc_cn">中文</a></li>
<li><a href="/doc">English</a></li>
</ul>
</div>
<ul class="site-page-links">
<li><a>Home</a></li>
<li><a>Get Started</a></li>
<li class="active"><a>Documentation</a></li>
<li><a>About Us</a></li>
</ul>
</div>
<div class="doc-module">
{% set modules = toctree(maxdepth=0, collapse=False, titles_only=True) %}
{{ modules }}
{% include "searchbox.html" %}
</div>
</div>
</header>
{% endblock %}
<div class="main-content-wrap">
{# SIDE NAV, TOGGLES ON MOBILE #}
<nav class="doc-menu-vertical" role="navigation">
{% block menu %}
{% set toctree = toctree(maxdepth=-1, collapse=False, titles_only=True, includehidden=True) %}
{{ toctree }}
{% endblock %}
</nav>
{% if toc %}
<nav class="local-toc">{{ toc }}</nav>
{% endif %}
<section class="doc-content-wrap">
{% include "breadcrumbs.html" %}
{# PAGE CONTENT #}
<div class="wy-nav-content" id="doc-content">
<div class="rst-content">
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
{% block body %}{% endblock %}
</div>
</div>
{% include "footer.html" %}
</div>
</div>
</section>
</div>
{% include "versions.html" %}
{% if not embedded %}
<script type="text/javascript">
var DOCUMENTATION_OPTIONS = {
URL_ROOT:'{{ url_root }}',
VERSION:'{{ release|e }}',
COLLAPSE_INDEX:false,
FILE_SUFFIX:'{{ '' if no_search_suffix else file_suffix }}',
HAS_SOURCE: {{ has_source|lower }}
};
</script>
{%- for scriptfile in script_files %}
<script type="text/javascript" src="{{ pathto(scriptfile, 1) }}"></script>
{%- endfor %}
{% endif %}
{# RTD hosts this file, so just load on non RTD builds #}
{% if not READTHEDOCS %}
<script type="text/javascript" src="{{ pathto('_static/js/theme.js', 1) }}"></script>
{% endif %}
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js" integrity="sha384-Tc5IQib027qvyjSMfHjOMaLkfuWVxZxUPnCJA7l2mCWNIpG9mGCD8wGNIcPD7Txa" crossorigin="anonymous"></script>
<script src="https://cdn.jsdelivr.net/perfect-scrollbar/0.6.14/js/perfect-scrollbar.jquery.min.js"></script>
<script src="{{ pathto('_static/js/paddle_doc_init.js', 1) }}"></script>
{%- block footer %} {% endblock %}
</body>
</html>
{#
basic/search.html
~~~~~~~~~~~~~~~~~
Template for the search page.
:copyright: Copyright 2007-2013 by the Sphinx team, see AUTHORS.
:license: BSD, see LICENSE for details.
#}
{%- extends "layout.html" %}
{% set title = _('Search') %}
{% set script_files = script_files + ['_static/searchtools.js'] %}
{% block footer %}
<script type="text/javascript">
jQuery(function() { Search.loadIndex("{{ pathto('searchindex.js', 1) }}"); });
jQuery('.doc-content-wrap > div[role="navigation"]').remove();
jQuery('.doc-content-wrap').css('padding-top', 0);
</script>
{# this is used when loading the search index using $.ajax fails,
such as on Chrome for documents on localhost #}
<script type="text/javascript" id="searchindexloader"></script>
{{ super() }}
{% endblock %}
{% block body %}
<noscript>
<div id="fallback" class="admonition warning">
<p class="last">
{% trans %}Please activate JavaScript to enable the search
functionality.{% endtrans %}
</p>
</div>
</noscript>
{% if search_performed %}
<h2>{{ _('Search Results') }}</h2>
{% if not search_results %}
<p>{{ _('Your search did not match any documents. Please make sure that all words are spelled correctly and that you\'ve selected enough categories.') }}</p>
{% endif %}
{% endif %}
<div id="search-results">
{% if search_results %}
<ul>
{% for href, caption, context in search_results %}
<li>
<a href="{{ pathto(item.href) }}">{{ caption }}</a>
<p class="context">{{ context|e }}</p>
</li>
{% endfor %}
</ul>
{% endif %}
</div>
{% endblock %}
......@@ -27,11 +27,6 @@ Arguments* Arguments::createArguments(size_t slotNum) {
void Arguments::resize(size_t slotNum) { m->outputs.resize(slotNum); }
Matrix* Arguments::getSlotValue(size_t idx) const throw(RangeError) {
auto& a = m->getArg(idx);
return Matrix::createByPaddleMatrixPtr(&a.value);
}
Arguments::Arguments() : m(new ArgumentsPrivate()) {}
Arguments::~Arguments() { delete m; }
......@@ -43,6 +38,16 @@ Arguments* Arguments::createByPaddleArgumentVector(void* ptr) {
return args;
}
Matrix* Arguments::getSlotValue(size_t idx) const throw(RangeError) {
auto& a = m->getArg(idx);
return Matrix::createByPaddleMatrixPtr(&a.value);
}
Matrix* Arguments::getSlotGrad(size_t idx) const throw(RangeError) {
auto& a = m->getArg(idx);
return Matrix::createByPaddleMatrixPtr(&a.grad);
}
IVector* Arguments::getSlotIds(size_t idx) const throw(RangeError) {
auto& a = m->getArg(idx);
return IVector::createByPaddleVectorPtr(&a.ids);
......@@ -58,6 +63,11 @@ void Arguments::setSlotValue(size_t idx, Matrix* mat) throw(RangeError) {
a.value = m->cast<paddle::Matrix>(mat->getSharedPtr());
}
void Arguments::setSlotGrad(size_t idx, Matrix* mat) throw(RangeError) {
auto& a = m->getArg(idx);
a.grad = m->cast<paddle::Matrix>(mat->getSharedPtr());
}
void Arguments::setSlotIn(size_t idx, Matrix* mat) throw(RangeError) {
auto& a = m->getArg(idx);
a.in = m->cast<paddle::Matrix>(mat->getSharedPtr());
......
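A brief usage sketch of the new grad-slot accessors from the Python side: a hypothetical snippet, assuming the py_paddle.swig_paddle bindings built from this API, and Matrix.createZero as an assumed factory.

from py_paddle import swig_paddle

args = swig_paddle.Arguments.createArguments(1)  # one slot
grad = swig_paddle.Matrix.createZero(2, 3)       # assumed zero-matrix factory
args.setSlotGrad(0, grad)                        # mirrors setSlotValue
g = args.getSlotGrad(0)                          # throws RangeError for a bad slot index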
......@@ -193,5 +193,4 @@ namespace std {
%ignore OptimizationConfigPrivate;
%ignore ParameterTraverseCallbackPrivate;
%include "utils/GlobalConstants.h"
%include "api/PaddleAPI.h"
%include "api/PaddleAPI.h"
\ No newline at end of file
......@@ -156,12 +156,15 @@ public:
* @param dim1 dimension of data (number of rows / height).
* @param dim2 dimension of data (number of columns / width).
* @param copy true if copy into a new matrix, false will create
* matrix inplace.
* matrix inplace. copy = false should be used with extreme
* care, because the Matrix will share memory with the given
* numpy array. If the numpy array object is no longer valid,
* the shared memory space will no longer be usable.
*/
static Matrix* createCpuDenseFromNumpy(float* data,
int dim1,
int dim2,
bool copy = false);
bool copy = true);
/// Create Gpu Dense Matrix from numpy matrix, dtype=float32
static Matrix* createGpuDenseFromNumpy(float* data, int dim1, int dim2);
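A minimal sketch of the changed copy default from the Python side, assuming the py_paddle.swig_paddle bindings built from this header: copy=True (now the default) gives the Matrix its own buffer, while copy=False keeps sharing the numpy array's memory, as the updated tests below exercise.

import numpy as np
from py_paddle import swig_paddle

arr = np.array([[1, 2], [3, 4]], dtype="float32")

m_copy = swig_paddle.Matrix.createCpuDenseFromNumpy(arr)              # copies by default now
m_view = swig_paddle.Matrix.createCpuDenseFromNumpy(arr, copy=False)  # shares arr's memory

arr[0, 0] = 100  # visible through m_view only; arr must outlive m_view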
......@@ -271,11 +274,18 @@ public:
*/
static Vector* createCpuVectorFromNumpy(float* data,
int dim,
bool copy = false);
bool copy = true);
/// Create Gpu Vector from numpy array, which dtype=float32
static Vector* createGpuVectorFromNumpy(float* data, int dim);
/**
* Copy from another vector.
* throw(RangeError) if the size of the src vector differs from the size
* of this vector.
*/
void copyFrom(Vector* src) throw(RangeError);
/// Cast to numpy array inplace.
void toNumpyArrayInplace(float** view_data, int* dim1) throw(UnsupportError);
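And a hedged sketch of the new copyFrom binding (same module assumption): the copy succeeds only when both vectors have the same size; a mismatched source raises the wrapped RangeError.

import numpy as np
from py_paddle import swig_paddle

src = swig_paddle.Vector.createCpuVectorFromNumpy(
    np.array([1.0, 2.0, 3.0], dtype="float32"))
dst = swig_paddle.Vector.createCpuVectorFromNumpy(
    np.zeros(3, dtype="float32"))
dst.copyFrom(src)  # ok: both vectors have size 3

bad = swig_paddle.Vector.createCpuVectorFromNumpy(
    np.zeros(4, dtype="float32"))
try:
    bad.copyFrom(src)  # size 4 vs. 3 -> RangeError
except Exception as e:  # the exact Python type depends on the SWIG exception mapping
    print("copyFrom rejected mismatched sizes:", e)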
......@@ -339,7 +349,7 @@ public:
*/
static IVector* createCpuVectorFromNumpy(int* data,
int dim,
bool copy = false);
bool copy = true);
/**
* Create Gpu IVector from numpy array, which dtype=int32
*/
......@@ -418,6 +428,7 @@ public:
* the param idx is the slot id
*/
Matrix* getSlotValue(size_t idx) const throw(RangeError);
Matrix* getSlotGrad(size_t idx) const throw(RangeError);
IVector* getSlotIds(size_t idx) const throw(RangeError);
Matrix* getSlotIn(size_t idx) const throw(RangeError);
IVector* getSlotSequenceStartPositions(size_t idx) const throw(RangeError);
......@@ -434,6 +445,7 @@ public:
* The other param is the input Matrix or vector.
*/
void setSlotValue(size_t idx, Matrix* mat) throw(RangeError);
void setSlotGrad(size_t idx, Matrix* mat) throw(RangeError);
void setSlotIn(size_t idx, Matrix* mat) throw(RangeError);
void setSlotIds(size_t idx, IVector* vec) throw(RangeError);
void setSlotSequenceStartPositions(size_t idx,
......@@ -535,6 +547,7 @@ public:
size_t getID() const;
ParameterConfig* getConfig();
void setValueUpdated();
private:
static Parameter* createFromRawPtr(void* ptr);
......
......@@ -68,3 +68,5 @@ ParameterConfig* Parameter::getConfig() {
}
size_t Parameter::getID() const { return m->getPtr()->getID(); }
void Parameter::setValueUpdated() { m->getPtr()->setValueUpdated(); }
......@@ -281,6 +281,13 @@ FloatArray Vector::getData() const {
}
}
void Vector::copyFrom(Vector* src) throw(RangeError) {
if (src->m->vec->getSize() != m->vec->getSize()) {
throw RangeError();
}
m->vec->copyFrom(*src->m->vec);
}
bool Vector::isGpu() const {
return std::dynamic_pointer_cast<paddle::GpuVector>(m->vec) != nullptr;
}
......
......@@ -68,7 +68,7 @@ class TestMatrix(unittest.TestCase):
def test_numpyCpu(self):
numpy_mat = np.matrix([[1, 2], [3, 4], [5, 6]], dtype="float32")
m = swig_paddle.Matrix.createCpuDenseFromNumpy(numpy_mat)
m = swig_paddle.Matrix.createCpuDenseFromNumpy(numpy_mat, copy=False)
self.assertEqual((int(m.getHeight()), int(m.getWidth())),
numpy_mat.shape)
......
......@@ -43,7 +43,7 @@ class TestIVector(unittest.TestCase):
def test_cpu_numpy(self):
vec = np.array([1, 3, 4, 65, 78, 1, 4], dtype="int32")
iv = swig_paddle.IVector.createCpuVectorFromNumpy(vec)
iv = swig_paddle.IVector.createCpuVectorFromNumpy(vec, copy=False)
self.assertEqual(vec.shape[0], int(iv.__len__()))
vec[4] = 832
for i in xrange(len(iv)):
......@@ -107,7 +107,7 @@ class TestVector(unittest.TestCase):
def testCpuNumpy(self):
numpy_arr = np.array([1.2, 2.3, 3.4, 4.5], dtype="float32")
vec = swig_paddle.Vector.createCpuVectorFromNumpy(numpy_arr)
vec = swig_paddle.Vector.createCpuVectorFromNumpy(numpy_arr, copy=False)
assert isinstance(vec, swig_paddle.Vector)
numpy_arr[0] = 0.1
for n, v in zip(numpy_arr, vec):
......@@ -152,4 +152,4 @@ if __name__ == '__main__':
unittest.TextTestRunner().run(suite)
if swig_paddle.isGpuVersion():
swig_paddle.setUseGpu(True)
unittest.main()
\ No newline at end of file
unittest.main()
......@@ -24,7 +24,9 @@ def doubleEqual(a, b):
def __readFromFile():
for i in xrange(10002):
yield np.random.rand(784), random.randint(0, 9)
label = np.random.randint(0, 10)
sample = np.random.rand(784) + 0.1 * label
yield sample, label
def loadMNISTTrainData(batch_size=100):
......
......@@ -271,7 +271,9 @@ public:
void finishAsyncLoad() {
stopping_ = true;
taskReadySem_.post();
asyncLoader_->join();
if (asyncLoader_) {
asyncLoader_->join();
}
}
void setPending(bool pending) { pending_ = pending; }
......
......@@ -60,18 +60,16 @@ bool BatchNormBaseLayer::init(const LayerMap& layerMap,
void BatchNormBaseLayer::calFeatureMapSize() {
const ImageConfig& conf = config_.inputs(0).image_conf();
if (inputLayers_[0]->getOutput().getFrameHeight() == 0 &&
inputLayers_[0]->getOutput().getFrameWidth() == 0) {
imgSize_ = conf.img_size();
imageH_ = imgSize_;
imageW_ = imgSize_;
imageH_ = inputLayers_[0]->getOutput().getFrameHeight();
imageW_ = inputLayers_[0]->getOutput().getFrameWidth();
if (imageH_ == 0 && imageW_ == 0) {
imageH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
imageW_ = conf.img_size();
} else {
imageH_ = inputLayers_[0]->getOutput().getFrameHeight();
imageW_ = inputLayers_[0]->getOutput().getFrameWidth();
getOutput().setFrameHeight(imageH_);
getOutput().setFrameWidth(imageW_);
}
imgPixels_ = imageH_ * imageW_;
getOutput().setFrameHeight(imageH_);
getOutput().setFrameWidth(imageW_);
}
} // namespace paddle
......@@ -77,9 +77,8 @@ protected:
MatrixPtr savedMean_;
MatrixPtr savedInvVar_;
/// Height or width of input image feature, now height is equal to width.
/// imgSize is 1 if the input is fully-connected layer.
int imgSize_;
/// Height or width of input image feature.
/// Both of them are 1 if the input is fully-connected layer.
int imageH_;
int imageW_;
/// Height * Width.
......
......@@ -26,15 +26,15 @@ size_t BilinearInterpLayer::getSize() {
const BilinearInterpConfig& conf = config_.inputs(0).bilinear_interp_conf();
if (inImgH_ == 0) {
inImgH_ = conf.img_size_y();
inImgH_ = conf.image_conf().img_size_y();
}
if (inImgW_ == 0) {
inImgW_ = conf.img_size_x();
inImgW_ = conf.image_conf().img_size();
}
outImgH_ = conf.out_size_y();
outImgW_ = conf.out_size_x();
numChannels_ = conf.num_channels();
numChannels_ = conf.image_conf().channels();
CHECK(outImgH_ > 0 && outImgW_ > 0);
CHECK(inImgH_ > 0 && inImgW_ > 0);
......
......@@ -38,11 +38,12 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
filterSizeY_.push_back(conf.filter_size_y());
filterPixels_.push_back(filterSize_.back() * filterSizeY_.back());
channels_.push_back(conf.channels());
imgSizeH_.push_back(conf.img_size());
imgSizeH_.push_back(conf.has_img_size_y() ? conf.img_size_y()
: conf.img_size());
imgSizeW_.push_back(conf.img_size());
groups_.push_back(conf.groups());
filterChannels_.push_back(conf.filter_channels());
outputH_.push_back(conf.output_x());
outputH_.push_back(conf.has_output_y() ? conf.output_y() : conf.output_x());
outputW_.push_back(conf.output_x());
}
......@@ -91,16 +92,19 @@ size_t ConvBaseLayer::calOutputSize() {
for (size_t i = 0; i < inputLayers_.size(); i++) {
inH.push_back(inputLayers_[i]->getOutput().getFrameHeight());
inW.push_back(inputLayers_[i]->getOutput().getFrameWidth());
const ConvConfig& conf = config_.inputs(i).conv_conf();
if (isDeconv_) {
if (inH[i] == 0) inH[i] = config_.inputs(i).conv_conf().output_x();
if (inW[i] == 0) inW[i] = config_.inputs(i).conv_conf().output_x();
if (inH[i] == 0)
inH[i] = conf.has_output_y() ? conf.output_y() : conf.output_x();
if (inW[i] == 0) inW[i] = conf.output_x();
outH.push_back(imageSize(
inH[i], filterSizeY_[i], paddingY_[i], strideY_[i], caffeMode_));
outW.push_back(imageSize(
inW[i], filterSize_[i], padding_[i], stride_[i], caffeMode_));
} else {
if (inH[i] == 0) inH[i] = config_.inputs(i).conv_conf().img_size();
if (inW[i] == 0) inW[i] = config_.inputs(i).conv_conf().img_size();
if (inH[i] == 0)
inH[i] = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
if (inW[i] == 0) inW[i] = conf.img_size();
outH.push_back(outputSize(
inH[i], filterSizeY_[i], paddingY_[i], strideY_[i], caffeMode_));
outW.push_back(outputSize(
......
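For reference, the spatial extent returned by outputSize in caffe mode, and its inverse imageSize used on the deconv branch above, follow the standard convolution arithmetic (integer division):

$$\text{out} = \left\lfloor \frac{\text{in} + 2p - f}{s} \right\rfloor + 1, \qquad \text{in} = (\text{out} - 1)\,s + f - 2p,$$

where f, p, and s are the filter size, padding, and stride along the corresponding axis.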
......@@ -93,9 +93,9 @@ private:
bool caffeMode_;
int inputOffset_, outputOffset_, weightOffset_;
int numFilters_;
int padding_, stride_, filterSize_, channels_, imgSize_;
int padding_, stride_, filterSize_, channels_, imgSize_, imgSizeY_;
int paddingY_, strideY_, filterSizeY_;
int imgPixels_, filterPixels_, filterChannels_, outputX_, outputs_;
int imgPixels_, filterPixels_, filterChannels_, outputX_, outputY_, outputs_;
/// Following member variables are same with CudnnConvLayer.
/// There is no explanation here.
......@@ -144,7 +144,7 @@ void ConvOperator::allocConvWorkSpace(size_t maxWorkSpace) {
void ConvOperator::reshape(int batchSize) {
imageH_ = ins_[0]->getFrameHeight();
imageW_ = ins_[0]->getFrameWidth();
if (imageH_ == 0) imageH_ = imgSize_;
if (imageH_ == 0) imageH_ = imgSizeY_;
if (imageW_ == 0) imageW_ = imgSize_;
outputH_ = outputSize(imageH_, filterSizeY_, paddingY_, strideY_, caffeMode_);
outputW_ = outputSize(imageW_, filterSize_, padding_, stride_, caffeMode_);
......@@ -182,7 +182,10 @@ void ConvOperator::computeConvSizes() {
hl_create_tensor_descriptor(&inputDesc_);
int outputX =
outputSize(imgSize_, filterSize_, padding_, stride_, caffeMode_);
int outputY =
outputSize(imgSizeY_, filterSizeY_, paddingY_, strideY_, caffeMode_);
CHECK_EQ(outputX, outputX_);
CHECK_EQ(outputY, outputY_);
hl_create_tensor_descriptor(&outputDesc_);
hl_create_convolution_descriptor(&convDesc_,
inputDesc_,
......@@ -236,10 +239,12 @@ void ConvOperator::getConvParams() {
filterPixels_ = filterSize_ * filterSizeY_;
channels_ = conf.channels();
imgSize_ = conf.img_size();
imgPixels_ = imgSize_ * imgSize_;
imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
imgPixels_ = imgSize_ * imgSizeY_;
CHECK_EQ(conf.groups(), 1U);
filterChannels_ = conf.filter_channels();
outputX_ = conf.output_x();
outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
outputs_ = outputX_ * outputY_;
}
......
......@@ -46,7 +46,7 @@ void ConvProjection::getConvParams() {
filterH_ = conf.filter_size_y();
filterW_ = conf.filter_size();
configImgH_ = conf.img_size();
configImgH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
configImgW_ = conf.img_size();
channels_ = conf.channels();
......@@ -58,8 +58,11 @@ void ConvProjection::getConvParams() {
}
void ConvProjection::initCudnn() {
hl_create_filter_descriptor(
&filterDesc_, channels_, numFilters_, filterH_, filterW_);
hl_create_filter_descriptor(&filterDesc_,
channels_ / groups_,
numFilters_ / groups_,
filterH_,
filterW_);
hl_create_tensor_descriptor(&inputDesc_);
hl_create_tensor_descriptor(&outputDesc_);
hl_create_convolution_descriptor(&convDesc_,
......@@ -86,7 +89,7 @@ void ConvProjection::initCudnn() {
void ConvProjection::reshapeTensorDesc(int batchSize) {
hl_tensor_reshape(inputDesc_,
batchSize,
channels_,
channels_ / groups_,
imageH_,
imageW_,
channels_ * imageH_ * imageW_,
......@@ -115,7 +118,7 @@ void ConvProjection::reshapeTensorDesc(int batchSize) {
hl_tensor_reshape(outputDesc_,
batchSize,
numFilters_,
numFilters_ / groups_,
outputH_,
outputW_,
nStride,
......
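A worked size check for the grouped descriptors above (illustrative numbers, not from the source): with channels = 8, numFilters = 16, groups = 2, each group's filter descriptor covers

$$\frac{\text{channels}}{\text{groups}} \times \frac{\text{numFilters}}{\text{groups}} \times f_h \times f_w = 4 \times 8 \times f_h \times f_w,$$

and the input and output tensors are likewise reshaped to 4 and 8 channels per group, so each group behaves like an independent 4-to-8-channel convolution.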
......@@ -49,8 +49,13 @@ void DataLayer::copyDataToOutput(Argument& output) {
output.ids->copyFrom(*data_.ids);
}
}
output.setFrameHeight(data_.getFrameHeight());
output.setFrameWidth(data_.getFrameWidth());
if (config_.height() && config_.width()) {
output.setFrameHeight(config_.height());
output.setFrameWidth(config_.width());
} else {
output.setFrameHeight(data_.getFrameHeight());
output.setFrameWidth(data_.getFrameWidth());
}
output.cpuSequenceDims = data_.cpuSequenceDims;
output.sequenceStartPositions = data_.sequenceStartPositions;
output.subSequenceStartPositions = data_.subSequenceStartPositions;
......
......@@ -29,17 +29,19 @@ bool ExpandConvBaseLayer::init(const LayerMap &layerMap,
* meaning as in conv, we need to swap channels_ and numFilters here for
* convTrans, and in other functions too.
* */
int channel;
int numFilters;
/* Initialize the projection */
for (auto &inputConfig : config_.inputs()) {
const ConvConfig &conf = inputConfig.conv_conf();
numFilters = isDeconv_ ? conf.channels() : numFilters_;
int numFilters = isDeconv_ ? conf.channels() : numFilters_;
subM_.push_back(numFilters / conf.groups());
subN_.push_back(conf.output_x() * conf.output_x());
channel = isDeconv_ ? numFilters_ : conf.channels();
subK_.push_back(channel * conf.filter_size() * conf.filter_size() /
conf.groups());
subN_.push_back(conf.output_x() *
(conf.has_output_y() ? conf.output_y() : conf.output_x()));
int channel = isDeconv_ ? numFilters_ : conf.channels();
subK_.push_back(
channel * conf.filter_size() *
(conf.has_filter_size_y() ? conf.filter_size_y() : conf.filter_size()) /
conf.groups());
/* Consistent caffe mode for multiple input */
caffeMode_ = conf.caffe_mode();
}
......@@ -116,11 +118,11 @@ void ExpandConvBaseLayer::expandOneFrame(MatrixPtr image,
imgSizeH_[inIdx],
imgSizeW_[inIdx],
channel,
filterSizeY_[inIdx],
filterSize_[inIdx],
filterSize_[inIdx],
strideY_[inIdx],
stride_[inIdx],
stride_[inIdx],
padding_[inIdx],
paddingY_[inIdx],
padding_[inIdx],
outputH_[inIdx],
outputW_[inIdx]);
......@@ -145,7 +147,7 @@ void ExpandConvBaseLayer::expandFwdOnce(MatrixPtr image,
real *expInData = expandInput_->getData();
for (int g = 0; g < groups_[inIdx]; ++g) {
MatrixPtr A =
Matrix::create(wgtData, subK, subM, true, useGpu_); // mark transpose
Matrix::create(wgtData, subM, subK, false, useGpu_); // weight: subM x subK, not transposed
MatrixPtr B = Matrix::create(expInData, subK, subN, false, useGpu_);
MatrixPtr C = Matrix::create(outData, subM, subN, false, useGpu_);
C->mul(A, B, 1, 1);
......@@ -182,7 +184,7 @@ void ExpandConvBaseLayer::bpropActs(MatrixPtr out,
// create temporary matrix
MatrixPtr C = Matrix::create(expandInData, subK, subN, false, useGpu_);
MatrixPtr B = Matrix::create(localGradData, subM, subN, false, useGpu_);
MatrixPtr A = Matrix::create(wgtData, subK, subM, false, useGpu_);
MatrixPtr A = Matrix::create(wgtData, subM, subK, true, useGpu_);
C->mul(A, B); // mul
// clear the temporary matrix
......@@ -208,11 +210,11 @@ void ExpandConvBaseLayer::bpropActs(MatrixPtr out,
imgSizeH_[inpIdx],
imgSizeW_[inpIdx],
channel,
filterSizeY_[inpIdx],
filterSize_[inpIdx],
filterSize_[inpIdx],
stride_[inpIdx],
strideY_[inpIdx],
stride_[inpIdx],
padding_[inpIdx],
paddingY_[inpIdx],
padding_[inpIdx],
outputH_[inpIdx],
outputW_[inpIdx],
......@@ -247,10 +249,10 @@ void ExpandConvBaseLayer::bpropWeights(MatrixPtr image,
// expand-mul one-group by one
for (int g = 0; g < groups_[inpIdx]; g++) {
MatrixPtr A = Matrix::create(expandInData, subK, subN, false, useGpu_);
MatrixPtr B = Matrix::create(gradData, subM, subN, true, useGpu_);
MatrixPtr C = Matrix::create(wGradData, subK, subM, false, useGpu_);
C->mul(A, B, 1, 1);
MatrixPtr A = Matrix::create(expandInData, subK, subN, true, useGpu_);
MatrixPtr B = Matrix::create(gradData, subM, subN, false, useGpu_);
MatrixPtr C = Matrix::create(wGradData, subM, subK, false, useGpu_);
C->mul(B, A, 1, 1);
A->clear();
B->clear();
......
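The flipped dimensions and transpose flags above follow from the conv-as-GEMM layout of the expand path. Per group, with m = numFilters/groups, k = channels * f_h * f_w / groups, and n = out_h * out_w (the subM_, subK_, subN_ set in init), the three passes are

$$C \mathrel{+}= A\,B \;(\text{forward}),\qquad \partial B = A^{\top}\,\partial C \;(\text{bpropActs}),\qquad \partial A \mathrel{+}= \partial C\,B^{\top} \;(\text{bpropWeights}),$$

with A the m-by-k weight block, B the k-by-n expanded input, and C the m-by-n output; hence the weight matrix is now created as subM x subK without the transpose flag in the forward pass, and with it in the backward passes.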
......@@ -25,10 +25,10 @@ size_t MaxOutLayer::getSize() {
imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
if (imgSizeH_ == 0) {
imgSizeH_ = maxoutConf.img_size_y();
imgSizeH_ = maxoutConf.image_conf().img_size_y();
}
if (imgSizeW_ == 0) {
imgSizeW_ = maxoutConf.img_size_x();
imgSizeW_ = maxoutConf.image_conf().img_size();
}
featLen_ = imgSizeH_ * imgSizeW_;
......@@ -50,7 +50,7 @@ bool MaxOutLayer::init(const LayerMap& layerMap,
const MaxOutConfig& conf = config_.inputs(0).maxout_conf();
groups_ = conf.groups();
channels_ = conf.channels();
channels_ = conf.image_conf().channels();
CHECK_EQ(channels_ % groups_, 0UL);
outputChannels_ = channels_ / groups_;
......
......@@ -48,6 +48,9 @@ bool ResponseNormLayer::init(const LayerMap& layerMap,
outputX_ = conf.output_x();
imgSize_ = conf.img_size();
denoms_ = NULL;
outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
return true;
}
......
......@@ -49,7 +49,7 @@ public:
*/
class ResponseNormLayer : public NormLayer {
protected:
size_t channels_, size_, outputX_, imgSize_;
size_t channels_, size_, outputX_, imgSize_, outputY_, imgSizeY_;
float scale_, pow_;
MatrixPtr denoms_;
......
......@@ -23,7 +23,7 @@ size_t CMRProjectionNormLayer::getSize() {
imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
if (imgSizeH_ == 0) {
imgSizeH_ = imgSize_;
imgSizeH_ = imgSizeY_;
}
if (imgSizeW_ == 0) {
imgSizeW_ = imgSize_;
......
......@@ -56,14 +56,14 @@ ProjectionConfig SpatialPyramidPoolLayer::getConfig(size_t imgSizeW,
size_t SpatialPyramidPoolLayer::getSize() {
CHECK_EQ(inputLayers_.size(), 1UL);
size_t layerSize = 0;
const SppConfig& sppConf = config_.inputs(0).spp_conf();
const ImageConfig& conf = config_.inputs(0).spp_conf().image_conf();
imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
if (imgSizeH_ == 0) {
imgSizeH_ = sppConf.has_img_size_y() ? sppConf.img_size_y() : imgSizeW_;
imgSizeH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
}
if (imgSizeW_ == 0) {
imgSizeW_ = sppConf.img_size();
imgSizeW_ = conf.img_size();
}
size_t outputH = 1;
......@@ -82,9 +82,10 @@ bool SpatialPyramidPoolLayer::init(const LayerMap& layerMap,
pyramidHeight_ = sppConf.pyramid_height();
poolType_ = sppConf.pool_type();
channels_ = sppConf.channels();
imgSizeW_ = sppConf.img_size();
imgSizeH_ = sppConf.has_img_size_y() ? sppConf.img_size_y() : imgSizeW_;
const ImageConfig& imageConf = sppConf.image_conf();
channels_ = imageConf.channels();
imgSizeW_ = imageConf.img_size();
imgSizeH_ = imageConf.has_img_size_y() ? imageConf.img_size_y() : imgSizeW_;
poolProjections_.reserve(pyramidHeight_);
projCol_.reserve(pyramidHeight_);
projOutput_.resize(pyramidHeight_);
......
......@@ -34,7 +34,22 @@ add_unittest_without_exec(test_ConvTrans
add_test(NAME test_ConvTrans
COMMAND test_ConvTrans)
################# test_ConvUnify #######################
add_unittest_without_exec(test_ConvUnify
test_ConvUnify.cpp
LayerGradUtil.cpp
TestUtil.cpp)
add_test(NAME test_ConvUnify
COMMAND test_ConvUnify)
################# test_BatchNorm #######################
add_unittest_without_exec(test_BatchNorm
test_BatchNorm.cpp
LayerGradUtil.cpp
TestUtil.cpp)
add_test(NAME test_BatchNorm
COMMAND test_BatchNorm)
################## test_Evaluator #######################
add_unittest(test_Evaluator
test_Evaluator.cpp
......
......@@ -34,6 +34,7 @@ conv = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
num_channels=8,
num_filters=16, stride=1,
bias_attr=True,
act=LinearActivation())
act=LinearActivation(),
groups=2)
outputs(concat, conv)
......@@ -24,7 +24,7 @@ proj2 = conv_projection(input=data, filter_size=1, filter_size_y=1,
concat = concat_layer(input=[proj1, proj2], bias_attr=False, act=ReluActivation())
proj = conv_projection(input=data, filter_size=1, filter_size_y=1,
num_channels=8, num_filters=16, stride=1)
num_channels=8, num_filters=16, stride=1, groups=2)
with mixed_layer(bias_attr=True, act=LinearActivation()) as conv:
conv += proj
......
#edit-mode: -*- python -*-
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
settings(batch_size=10)
data = data_layer(name ="input", size=8*16*16)
conv1 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
num_channels=8,
num_filters=16, stride=1,
bias_attr=False,
act=ReluActivation(),
layer_type="exconv")
conv2 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
num_channels=8,
num_filters=16, stride=1,
bias_attr=False,
act=ReluActivation(),
layer_type="exconv")
concat = concat_layer(input=[conv1, conv2])
conv = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
num_channels=8,
num_filters=16, stride=1,
bias_attr=True,
act=LinearActivation(),
groups=2,
layer_type="exconv")
outputs(concat, conv)
......@@ -28,7 +28,6 @@ maxpool = img_pool_layer(input=conv,
stride_y=2,
padding=1,
padding_y=2,
img_width=16,
pool_type=MaxPooling(),
)
avgpool = img_pool_layer(input=conv,
......@@ -39,7 +38,6 @@ avgpool = img_pool_layer(input=conv,
stride_y=2,
padding=1,
padding_y=2,
img_width=16,
pool_type=AvgPooling(),
)
......
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <vector>
#include <string>
#include "paddle/gserver/layers/DataLayer.h"
#include "ModelConfig.pb.h"
#include "paddle/trainer/Trainer.h"
#include "paddle/utils/GlobalConstants.h"
#include "paddle/gserver/layers/ExpandConvTransLayer.h"
#include "TestUtil.h"
#include "LayerGradUtil.h"
using namespace paddle; // NOLINT
using namespace std; // NOLINT
P_DECLARE_bool(use_gpu);
P_DECLARE_int32(gpu_id);
P_DECLARE_double(checkgrad_eps);
P_DECLARE_bool(thread_local_rand_use_global_seed);
P_DECLARE_bool(prev_batch_state);
// Test that the batchNormLayer can be followed by a ConvLayer
TEST(Layer, batchNorm) {
FLAGS_use_gpu = false;
TestConfig configBN;
const int CHANNELS = 6272;
const int IMG_SIZE = 1;
configBN.layerConfig.set_type("batch_norm");
configBN.layerConfig.set_name("bn");
configBN.layerConfig.set_size(CHANNELS * IMG_SIZE * IMG_SIZE);
configBN.layerConfig.set_active_type("relu");
configBN.biasSize = CHANNELS;
configBN.inputDefs.push_back({INPUT_DATA, "layer_0",
/* dim= */ IMG_SIZE * IMG_SIZE * CHANNELS,
/* paraSize= */ CHANNELS});
configBN.inputDefs.push_back({INPUT_DATA, "layer_1_running_mean",
1, CHANNELS});
configBN.inputDefs.back().isStatic = true;
configBN.inputDefs.push_back({INPUT_DATA, "layer_2_running_var",
1, CHANNELS});
configBN.inputDefs.back().isStatic = true;
LayerInputConfig* input = configBN.layerConfig.add_inputs();
configBN.layerConfig.add_inputs();
configBN.layerConfig.add_inputs();
ImageConfig* img_conf = input->mutable_image_conf();
img_conf->set_channels(CHANNELS);
img_conf->set_img_size(IMG_SIZE);
// Setting up conv-layer config
TestConfig config;
config.biasSize = 64;
config.layerConfig.set_type("exconv");
config.layerConfig.set_num_filters(64);
config.layerConfig.set_partial_sum(1);
config.layerConfig.set_shared_biases(true);
config.inputDefs.push_back({INPUT_DATA, "bn", 6272, 204800});
input = config.layerConfig.add_inputs();
ConvConfig* conv = input->mutable_conv_conf();
conv->set_filter_size(5);
conv->set_filter_size_y(5);
conv->set_channels(128);
conv->set_padding(1);
conv->set_padding_y(1);
conv->set_stride(2);
conv->set_stride_y(2);
conv->set_groups(1);
conv->set_filter_channels(conv->channels() / conv->groups());
conv->set_img_size(7);
conv->set_output_x(3);
config.layerConfig.set_size(conv->output_x() * conv->output_x() *
config.layerConfig.num_filters());
config.layerConfig.set_name("conv");
// data layer initialize
std::vector<DataLayerPtr> dataLayers;
LayerMap layerMap;
vector<Argument> datas;
initDataLayer(configBN, &dataLayers, &datas, &layerMap, "batch_norm",
100, false, false);
// test layer initialize
std::vector<ParameterPtr> parameters;
LayerPtr bnLayer;
initTestLayer(configBN, &layerMap, &parameters, &bnLayer);
std::vector<ParameterPtr> parameters2;
LayerPtr convLayer;
initTestLayer(config, &layerMap, &parameters2, &convLayer);
bnLayer->forward(PASS_GC);
convLayer->forward(PASS_GC);
CHECK_EQ(convLayer->getOutputValue()->getHeight(), 100);
CHECK_EQ(convLayer->getOutputValue()->getWidth(), 576);
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
FLAGS_thread_local_rand_use_global_seed = true;
srand(1);
return RUN_ALL_TESTS();
}
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <vector>
#include <string>
#include "paddle/gserver/layers/DataLayer.h"
#include "ModelConfig.pb.h"
#include "paddle/trainer/Trainer.h"
#include "paddle/utils/GlobalConstants.h"
#include "paddle/gserver/layers/ExpandConvTransLayer.h"
#include "paddle/math/MathUtils.h"
#include "TestUtil.h"
#include "LayerGradUtil.h"
using namespace paddle; // NOLINT
using namespace std; // NOLINT
P_DECLARE_bool(use_gpu);
P_DECLARE_int32(gpu_id);
P_DECLARE_double(checkgrad_eps);
P_DECLARE_bool(thread_local_rand_use_global_seed);
P_DECLARE_bool(prev_batch_state);
// Do one forward pass of the conv layer and check to see if its output
// matches the given result
MatrixPtr doOneConvTest(size_t imgSize, size_t output_x, size_t stride,
size_t padding, size_t filter_size, size_t channel,
size_t numfilters, size_t groups, MatrixPtr& inputData,
real* param, bool useGpu) {
TestConfig config;
config.biasSize = numfilters;
if (useGpu) {
config.layerConfig.set_type("cudnn_conv");
} else {
config.layerConfig.set_type("exconv");
}
config.layerConfig.set_num_filters(numfilters);
config.layerConfig.set_partial_sum(1);
config.layerConfig.set_shared_biases(true);
size_t weightSize = channel * filter_size * filter_size *
config.layerConfig.num_filters() / groups;
config.inputDefs.push_back({INPUT_DATA, "layer_0",
imgSize * imgSize * channel,
weightSize});
LayerInputConfig* input = config.layerConfig.add_inputs();
ConvConfig* conv = input->mutable_conv_conf();
conv->set_filter_size(filter_size);
conv->set_filter_size_y(filter_size);
conv->set_channels(channel);
conv->set_padding(padding);
conv->set_padding_y(padding);
conv->set_stride(stride);
conv->set_stride_y(stride);
conv->set_groups(groups);
conv->set_filter_channels(channel/groups);
conv->set_img_size(imgSize);
conv->set_output_x(output_x);
config.layerConfig.set_size(conv->output_x() * conv->output_x() *
config.layerConfig.num_filters());
config.layerConfig.set_name("conv");
std::vector<DataLayerPtr> dataLayers;
LayerMap layerMap;
vector<Argument> datas;
initDataLayer(config, &dataLayers, &datas, &layerMap, "conv",
1, false, useGpu);
dataLayers[0]->getOutputValue()->zeroMem();
dataLayers[0]->getOutputValue()->copyFrom(*inputData);
// test layer initialize
std::vector<ParameterPtr> parameters;
LayerPtr convLayer;
initTestLayer(config, &layerMap, &parameters, &convLayer);
convLayer->getBiasParameter()->zeroMem();
convLayer->getParameters()[0]->zeroMem();
convLayer->getParameters()[0]->getBuf(PARAMETER_VALUE)->copyFrom(param,
weightSize);
convLayer->forward(PASS_GC);
return convLayer->getOutputValue();
}
TEST(Layer, convParaUnified) {
#ifndef PADDLE_ONLY_CPU
MatrixPtr input, resultCpu, resultGpu;
input = Matrix::create(1, 4 * 4, false, false);
float inputData[] = {1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 14, 15, 16};
float param[] = {1, 2, 3, 4, 5, 6, 7, 8, 9,
9, 8, 7, 6, 5, 4, 3, 2, 1};
input->setData(inputData);
resultCpu = doOneConvTest(/* imgSize */ 4,
/* output_x */ 2,
/* stride */ 1,
/* padding */ 0,
/* filter_size */ 3,
/*channel*/ 1,
/*numfilters*/ 2,
/*groups*/ 1,
input, param, false);
resultGpu = doOneConvTest(/* imgSize */ 4,
/* output_x */ 2,
/* stride */ 1,
/* padding */ 0,
/* filter_size */ 3,
/*channel*/ 1,
/*numfilters*/ 2,
/*groups*/ 1,
input, param, true);
checkMatrixEqual(resultCpu, resultGpu);
input = Matrix::create(1, 3 * 3 * 2, false, false);
float inputData2[] = {1, 2, 3,
4, 5, 6,
7, 8, 9,
10, 11, 12,
13, 14, 15,
16, 17, 18};
float param2[] = {1, 2, 3, 4, 5, 6, 7, 8,
8, 7, 6, 5, 4, 3, 2, 1};
input->setData(inputData2);
resultCpu = doOneConvTest(/* imgSize */ 3,
/* output_x */ 2,
/* stride */ 1,
/* padding */ 0,
/* filter_size */ 2,
/*channel*/ 2,
/*numfilters*/ 2,
/*groups*/ 1,
input, param2, false);
resultGpu = doOneConvTest(/* imgSize */ 3,
/* output_x */ 2,
/* stride */ 1,
/* padding */ 0,
/* filter_size */ 2,
/*channel*/ 2,
/*numfilters*/ 2,
/*groups*/ 1,
input, param2, true);
checkMatrixEqual(resultCpu, resultGpu);
float param3[] = {1, 2, 3, 4,
4, 3, 2, 1};
resultCpu = doOneConvTest(/* imgSize */ 3,
/* output_x */ 2,
/* stride */ 1,
/* padding */ 0,
/* filter_size */ 2,
/*channel*/ 2,
/*numfilters*/ 2,
/*groups*/ 2,
input, param3, false);
resultGpu = doOneConvTest(/* imgSize */ 3,
/* output_x */ 2,
/* stride */ 1,
/* padding */ 0,
/* filter_size */ 2,
/*channel*/ 2,
/*numfilters*/ 2,
/*groups*/ 2,
input, param3, true);
checkMatrixEqual(resultCpu, resultGpu);
#endif
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
FLAGS_thread_local_rand_use_global_seed = true;
srand(1);
return RUN_ALL_TESTS();
}
......@@ -166,9 +166,8 @@ TEST(Projection, scaling) {
}
}
#ifndef PADDLE_ONLY_CPU
TEST(Projection, conv) {
const int NUM_FILTERS = 16;
void testProjectionConv(size_t groups) {
const int NUM_FILTERS = 18;
const int FILTER_SIZE = 2;
const int FILTER_SIZE_Y = 3;
const int CHANNELS = 3;
......@@ -186,7 +185,7 @@ TEST(Projection, conv) {
conv->set_padding_y(1);
conv->set_stride(2);
conv->set_stride_y(2);
conv->set_groups(1);
conv->set_groups(groups);
conv->set_filter_channels(conv->channels() / conv->groups());
conv->set_img_size(IMAGE_SIZE);
int output_x = outputSize(conv->img_size(),
......@@ -203,15 +202,21 @@ TEST(Projection, conv) {
conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS);
conf.set_output_size(output_x * output_y * NUM_FILTERS);
testProjectionGrad(
conf,
INPUT_DATA,
/* parameterSize */ NUM_FILTERS * CHANNELS * FILTER_SIZE * FILTER_SIZE_Y,
/* batchSize */ 100,
true,
false,
NUM_FILTERS,
true);
testProjectionGrad(conf,
INPUT_DATA,
/* parameterSize */ NUM_FILTERS * CHANNELS * FILTER_SIZE *
FILTER_SIZE_Y / groups,
/* batchSize */ 100,
true,
false,
NUM_FILTERS,
true);
}
#ifndef PADDLE_ONLY_CPU
TEST(Projection, conv) {
testProjectionConv(1);
testProjectionConv(3);
}
#endif
......@@ -223,9 +228,10 @@ TEST(Layer, BilinearInterpLayer) {
LayerInputConfig* input = config.layerConfig.add_inputs();
BilinearInterpConfig* bilinear = input->mutable_bilinear_interp_conf();
bilinear->set_img_size_x(32);
bilinear->set_img_size_y(32);
bilinear->set_num_channels(4);
ImageConfig* image = bilinear->mutable_image_conf();
image->set_img_size(32);
image->set_img_size_y(32);
image->set_channels(4);
for (auto useGpu : {false, true}) {
for (auto outSize : {32, 64}) {
......@@ -348,7 +354,7 @@ void testConvLayer(const string& type, bool trans, bool useGpu) {
config.layerConfig.set_partial_sum(1);
config.layerConfig.set_shared_biases(true);
config.inputDefs.push_back({INPUT_DATA, "layer_0", 768, 288});
config.inputDefs.push_back({INPUT_DATA, "layer_0", 384, 288});
LayerInputConfig* input = config.layerConfig.add_inputs();
ConvConfig* conv = input->mutable_conv_conf();
conv->set_filter_size(2);
......@@ -361,12 +367,18 @@ void testConvLayer(const string& type, bool trans, bool useGpu) {
conv->set_groups(1);
conv->set_filter_channels(conv->channels() / conv->groups());
conv->set_img_size(16);
conv->set_img_size_y(8);
conv->set_output_x(outputSize(conv->img_size(),
conv->filter_size(),
conv->padding(),
conv->stride(),
/* caffeMode */ true));
config.layerConfig.set_size(conv->output_x() * conv->output_x() *
conv->set_output_y(outputSize(conv->img_size_y(),
conv->filter_size_y(),
conv->padding_y(),
conv->stride_y(),
/* caffeMode */ true));
config.layerConfig.set_size(conv->output_x() * conv->output_y() *
config.layerConfig.num_filters());
testLayerGrad(config, "conv", 100, trans, useGpu);
......@@ -466,10 +478,11 @@ TEST(Layer, maxoutLayer) {
config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0});
LayerInputConfig* input = config.layerConfig.add_inputs();
MaxOutConfig* maxout = input->mutable_maxout_conf();
ImageConfig* image = maxout->mutable_image_conf();
maxout->set_img_size_x(32);
maxout->set_img_size_y(32);
maxout->set_channels(4);
image->set_img_size(32);
image->set_img_size_y(32);
image->set_channels(4);
maxout->set_groups(2);
for (auto useGpu : {false, true}) {
......@@ -981,7 +994,7 @@ void testNormLayer(const string& normType, bool trans, bool useGpu) {
config.layerConfig.set_type("norm");
config.layerConfig.set_active_type("relu");
config.inputDefs.push_back({INPUT_DATA, "layer_0", 3136, 0});
config.inputDefs.push_back({INPUT_DATA, "layer_0", 1568, 0});
LayerInputConfig* input = config.layerConfig.add_inputs();
NormConfig* norm = input->mutable_norm_conf();
norm->set_norm_type(normType);
......@@ -991,7 +1004,9 @@ void testNormLayer(const string& normType, bool trans, bool useGpu) {
norm->set_pow(0.75);
norm->set_blocked(0);
norm->set_img_size(14);
norm->set_img_size_y(7);
norm->set_output_x(norm->img_size());
norm->set_output_y(norm->img_size_y());
if (norm->norm_type() == "cmrnorm" ||
norm->norm_type() == "cmrnorm-projection") {
norm->set_scale(norm->scale() / norm->size());
......@@ -999,7 +1014,7 @@ void testNormLayer(const string& normType, bool trans, bool useGpu) {
norm->set_scale(norm->scale() / (norm->size() * norm->size()));
}
config.layerConfig.set_size(norm->output_x() * norm->output_x() *
config.layerConfig.set_size(norm->output_x() * norm->output_y() *
norm->channels());
config.biasSize = 0;
......@@ -1100,11 +1115,12 @@ void testSppLayer(const string& poolType,
SppConfig* sppConfig = input->mutable_spp_conf();
sppConfig->set_pool_type(poolType);
sppConfig->set_pyramid_height(pyramidHeight);
sppConfig->set_channels(16);
sppConfig->set_img_size(10);
sppConfig->set_img_size_y(20);
ImageConfig* imageConfig = sppConfig->mutable_image_conf();
imageConfig->set_channels(16);
imageConfig->set_img_size(10);
imageConfig->set_img_size_y(20);
int outputSize = (std::pow(4, sppConfig->pyramid_height()) - 1) / (4 - 1);
config.layerConfig.set_size(outputSize * sppConfig->channels());
config.layerConfig.set_size(outputSize * imageConfig->channels());
testLayerGrad(config, "spp", 100, trans, useGpu);
}
......@@ -1414,13 +1430,15 @@ void testBatchNormLayer(const string& type, bool trans, bool useGpu) {
TestConfig config;
const int CHANNELS = 10;
const int IMG_SIZE = 16;
const int IMG_SIZE_Y = 8;
size_t size = CHANNELS * IMG_SIZE * IMG_SIZE_Y;
config.layerConfig.set_type(type);
config.layerConfig.set_size(CHANNELS * IMG_SIZE * IMG_SIZE);
config.layerConfig.set_size(size);
config.layerConfig.set_active_type("sigmoid");
config.biasSize = CHANNELS;
config.inputDefs.push_back({INPUT_DATA,
"layer_0",
/* dim= */ IMG_SIZE * IMG_SIZE * CHANNELS,
/* dim= */ size,
/* paraSize= */ CHANNELS});
config.inputDefs.push_back({INPUT_DATA, "layer_1_running_mean", 1, CHANNELS});
......@@ -1435,6 +1453,7 @@ void testBatchNormLayer(const string& type, bool trans, bool useGpu) {
ImageConfig* img_conf = input->mutable_image_conf();
img_conf->set_channels(CHANNELS);
img_conf->set_img_size(IMG_SIZE);
img_conf->set_img_size_y(IMG_SIZE_Y);
testLayerGrad(config,
"batch_norm",
......@@ -1461,6 +1480,7 @@ TEST(Operator, conv) {
const int FILTER_SIZE_Y = 3;
const int CHANNELS = 3;
const int IMAGE_SIZE = 16;
const int IMAGE_SIZE_Y = 8;
OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs();
operatorConf.set_type("conv");
ConvConfig* conv = operatorConf.mutable_conv_conf();
......@@ -1475,19 +1495,22 @@ TEST(Operator, conv) {
conv->set_groups(1);
conv->set_filter_channels(conv->channels() / conv->groups());
conv->set_img_size(IMAGE_SIZE);
int output_x = outputSize(conv->img_size(),
conv->filter_size(),
conv->padding(),
conv->stride(),
/* caffeMode */ true);
conv->set_output_x(output_x);
config.layerConfig.set_size(output_x * output_x *
config.layerConfig.num_filters());
config.layerConfig.set_size(conv->output_x() * conv->output_x() *
conv->set_img_size_y(IMAGE_SIZE_Y);
conv->set_output_x(outputSize(conv->img_size(),
conv->filter_size(),
conv->padding(),
conv->stride(),
/* caffeMode */ true));
conv->set_output_y(outputSize(conv->img_size_y(),
conv->filter_size_y(),
conv->padding_y(),
conv->stride_y(),
/* caffeMode */ true));
config.layerConfig.set_size(conv->output_x() * conv->output_y() *
NUM_FILTERS);
config.inputDefs.push_back(
{INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE * CHANNELS, 0});
{INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE_Y * CHANNELS, 0});
config.inputDefs.push_back(
{INPUT_DATA,
"layer_1",
......
......@@ -255,6 +255,16 @@ TEST(Compare, img_conv) {
compareNetwork(config_file_a, config_file_b);
FLAGS_use_gpu = useGpu;
}
// Test that cudnn_conv and exconv give the same result
TEST(Compare, img_conv2) {
std::string config_file_a = "./gserver/tests/img_conv_a.conf";
std::string config_file_b = "./gserver/tests/img_conv_c.conf";
bool useGpu = FLAGS_use_gpu;
FLAGS_use_gpu = true;
compareNetwork(config_file_a, config_file_b);
FLAGS_use_gpu = useGpu;
}
#endif
P_DEFINE_string(config_file_a, "", "config of one network to compare");
......
......@@ -1584,11 +1584,6 @@ void BaseMatrixT<real>::minRows(BaseMatrixT& b) {
applyRow(aggregate::min(), b);
}
template<>
void BaseMatrixT<real>::sumCols(BaseMatrixT& b) {
applyCol(aggregate::sum(), b);
}
template<>
void BaseMatrixT<real>::maxCols(BaseMatrixT& b) {
applyCol(aggregate::max(), b);
......
......@@ -1018,8 +1018,6 @@ public:
/// calculate the minimum value of each row of the matrix b.
void minRows(BaseMatrixT& b);
/// calculate the sum of each column of the matrix b.
void sumCols(BaseMatrixT& b);
/// calculate the maximum value of each column of the matrix b.
void maxCols(BaseMatrixT& b);
/// calculate the minimum value of each column of the matrix b.
......
......@@ -2,7 +2,7 @@
add_simple_unittest(test_ExecViaCpu)
add_simple_unittest(test_SIMDFunctions)
add_simple_unittest(test_matrix)
add_simple_unittest(test_SparseMatrix)
# TODO(yuyang18): Refactor TestUtil.cpp. Remove this cross module reference.
add_unittest(test_matrixCompare
......@@ -14,4 +14,6 @@ add_simple_unittest(test_perturbation)
add_simple_unittest(test_CpuGpuVector)
add_simple_unittest(test_Allocator)
add_simple_unittest(test_FPException)
add_simple_unittest(test_GpuProfiler)
\ No newline at end of file
add_simple_unittest(test_GpuProfiler)
add_simple_unittest(test_BaseMatrix)
add_simple_unittest(test_Matrix)
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
/**
* This file provides a TensorCheck template function, which can be used to
* compare CpuMatrix and GpuMatrix, CpuVector and GpuVector, and so on.
*/
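//
// A minimal usage sketch (assumptions: both tensors have the same shape and
// hold the same data; TensorCheckEqual and TensorCheckErr are defined at the
// bottom of this file):
//
//   CpuMatrix cpu(16, 32);
//   GpuMatrix gpu(16, 32);
//   cpu.randomizeUniform();
//   gpu.copyFrom(cpu);
//   autotest::TensorCheckEqual(cpu, gpu);  // exact, element-wise comparison
//   autotest::TensorCheckErr(cpu, gpu);    // comparison within a tolerance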
#include <cmath>
#include "paddle/math/Matrix.h"
namespace autotest {
using paddle::Matrix;
using paddle::CpuMatrix;
using paddle::GpuMatrix;
using paddle::VectorT;
using paddle::CpuVectorT;
using paddle::GpuVectorT;
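// AssertEqual(err) compares two reals: with err == 0 it requires exact
// equality; otherwise a pair is rejected only when both the absolute
// difference exceeds err and the relative difference exceeds err / 10.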
class AssertEqual {
public:
AssertEqual(real err = 0) : err_(err) {}
inline bool operator()(real a, real b) {
if (err_ == 0) {
if (a != b) {
return false;
}
} else {
if (std::fabs(a - b) > err_) {
if ((std::fabs(a - b) / std::fabs(a)) > (err_ / 10.0f)) {
return false;
}
}
}
return true;
}
private:
real err_;
};
template <typename Tensor>
class CopyToCpu;
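// CopyToCpu normalizes an argument to a CPU-side tensor: CPU types are held
// by reference, while GPU types are copied into a freshly allocated CPU
// tensor, so the element-wise TensorCheck below always runs on host memory.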
template <>
class CopyToCpu<CpuMatrix> {
public:
explicit CopyToCpu(const CpuMatrix& arg) : arg_(arg) {}
const CpuMatrix& copiedArg() const { return arg_; }
private:
const CpuMatrix& arg_;
};
template <>
class CopyToCpu<GpuMatrix> {
public:
explicit CopyToCpu(const GpuMatrix& arg)
: arg_(arg.getHeight(), arg.getWidth()) {
arg_.copyFrom(arg);
}
CpuMatrix& copiedArg() { return arg_; }
private:
CpuMatrix arg_;
};
template <>
class CopyToCpu<Matrix> {
public:
explicit CopyToCpu(const Matrix& arg)
: arg_(arg.getHeight(), arg.getWidth()) {
arg_.copyFrom(arg);
}
CpuMatrix& copiedArg() { return arg_; }
private:
CpuMatrix arg_;
};
template <typename T>
class CopyToCpu<CpuVectorT<T>> {
public:
explicit CopyToCpu(const CpuVectorT<T>& arg) : arg_(arg) {}
const CpuVectorT<T>& copiedArg() const { return arg_; }
private:
const CpuVectorT<T>& arg_;
};
template <typename T>
class CopyToCpu<GpuVectorT<T>> {
public:
explicit CopyToCpu(const GpuVectorT<T>& arg) : arg_(arg.getSize()) {
arg_.copyFrom(arg);
}
CpuVectorT<T>& copiedArg() { return arg_; }
private:
CpuVectorT<T> arg_;
};
template <typename T>
class CopyToCpu<VectorT<T>> {
public:
explicit CopyToCpu(const VectorT<T>& arg) : arg_(arg.getSize()) {
arg_.copyFrom(arg);
}
CpuVectorT<T>& copiedArg() { return arg_; }
private:
CpuVectorT<T> arg_;
};
template <typename AssertEq>
void TensorCheck(AssertEq compare,
const CpuMatrix& matrix1,
const CpuMatrix& matrix2) {
CHECK(matrix1.getHeight() == matrix2.getHeight());
CHECK(matrix1.getWidth() == matrix2.getWidth());
int height = matrix1.getHeight();
int width = matrix1.getWidth();
const real* data1 = matrix1.getData();
const real* data2 = matrix2.getData();
int count = 0;
for (int i = 0; i < height; i++) {
for (int j = 0; j < width; j++) {
real a = data1[i * width + j];
real b = data2[i * width + j];
if (!compare(a, b)) {
count++;
}
}
}
EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
}
template <typename AssertEq, class T>
void TensorCheck(AssertEq compare,
const CpuVectorT<T>& vector1,
const CpuVectorT<T>& vector2) {
CHECK(vector1.getSize() == vector2.getSize());
const T* data1 = vector1.getData();
const T* data2 = vector2.getData();
size_t size = vector1.getSize();
int count = 0;
for (size_t i = 0; i < size; i++) {
real a = data1[i];
real b = data2[i];
if (!compare(a, b)) {
count++;
}
}
EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
}
template <typename AssertEq, typename Tensor1, typename Tensor2>
void TensorCheck(AssertEq compare,
const Tensor1& tensor1,
const Tensor2& tensor2) {
TensorCheck(compare,
CopyToCpu<Tensor1>(tensor1).copiedArg(),
CopyToCpu<Tensor2>(tensor2).copiedArg());
}
template <typename AssertEq>
void TensorCheck(AssertEq compare, real args1, real args2) {
EXPECT_EQ(compare(args1, args2), true) << "[Test error] args1 = " << args1
<< ", args2 = " << args2;
}
template <typename AssertEq>
void TensorCheck(AssertEq compare, size_t args1, size_t args2) {
EXPECT_EQ(args1, args2) << "[Test error] args1 = " << args1
<< ", args2 = " << args2;
}
template <typename Tensor1, typename Tensor2>
void TensorCheckEqual(const Tensor1& tensor1, const Tensor2& tensor2) {
AssertEqual compare(0);
TensorCheck(compare,
CopyToCpu<Tensor1>(tensor1).copiedArg(),
CopyToCpu<Tensor2>(tensor2).copiedArg());
}
template <typename Tensor1, typename Tensor2>
void TensorCheckErr(const Tensor1& tensor1, const Tensor2& tensor2) {
#ifndef PADDLE_TYPE_DOUBLE
AssertEqual compare(1e-3);
#else
AssertEqual compare(1e-10);
#endif
TensorCheck(compare,
CopyToCpu<Tensor1>(tensor1).copiedArg(),
CopyToCpu<Tensor2>(tensor2).copiedArg());
}
} // namespace autotest
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
/**
* This file provides an AutoCompare class to simplify the comparison
* of CPU and GPU member functions.
*
* Using it takes two steps:
* 1. Construct an AutoCompare object.
*    When constructing an AutoCompare object, you can set the err argument
*    to specify the maximum allowed error between the CPU and GPU results.
*
* 2. Call the template function cmpWithArg or cmpWithoutArg.
*    A. [cmpWithArg] The caller constructs the CPU arguments explicitly.
*
*      AutoCompare test;
*      // initialize arguments arg1, arg2, ...
*      test.cmpWithArg(function, arg1, arg2, ...);
*
*    B. [cmpWithoutArg] The caller does not need to construct arguments.
*       This works when every matrix argument of the function has the
*       same size, such as the element-wise and aggregate functions
*       defined in BaseMatrix.cpp.
*
*      AutoCompare test;
*      test.cmpWithoutArg<I...>(function, height, width);
*/
#include <gtest/gtest.h>
#include "paddle/math/Matrix.h"
#include "paddle/math/SparseMatrix.h"
#include "TensorCheck.h"
namespace autotest {
using paddle::BaseMatrix;
using paddle::CpuMatrix;
using paddle::GpuMatrix;
using paddle::CpuIVector;
using paddle::GpuIVector;
using paddle::CpuSparseMatrix;
using paddle::GpuSparseMatrix;
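// ReplaceType<T, M> maps the abstract parameter type T (BaseMatrix or Matrix)
// to the concrete matrix type M (CpuMatrix or GpuMatrix) used for one side of
// the comparison; all other types are passed through unchanged.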
template <typename T1, typename T2>
class ReplaceType {
public:
typedef T1 type;
};
template <>
class ReplaceType<BaseMatrix, CpuMatrix> {
public:
typedef CpuMatrix type;
};
template <>
class ReplaceType<BaseMatrix, GpuMatrix> {
public:
typedef GpuMatrix type;
};
template <>
class ReplaceType<Matrix, CpuMatrix> {
public:
typedef CpuMatrix type;
};
template <>
class ReplaceType<Matrix, GpuMatrix> {
public:
typedef GpuMatrix type;
};
// construct an argument
template <typename T>
T construct(int height, int width);
template <>
float construct(int height, int width) {
return 0.5;
}
template <>
double construct(int height, int width) {
return 0.5;
}
template <>
size_t construct(int height, int width) {
size_t offset = std::rand() % (height < width ? height : width);
return offset;
}
template <>
CpuMatrix construct(int height, int width) {
CpuMatrix a(height, width);
return a;
}
template <>
GpuMatrix construct(int height, int width) {
GpuMatrix a(height, width);
return a;
}
// init an argument
template <typename T>
void init(T& v) {
return;
}
template <>
void init(CpuMatrix& v) {
v.randomizeUniform();
}
template <>
void init(GpuMatrix& v) {
v.randomizeUniform();
}
// init a tuple which contains a set of arguments.
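// The two overloads below recurse over the tuple indices at compile time;
// the enable_if<I == sizeof...(Args)> overload terminates the recursion.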
template <std::size_t I = 0, typename... Args>
inline typename std::enable_if<I == sizeof...(Args), void>::type initTuple(
std::tuple<Args...>& t) {}
template <std::size_t I = 0, typename... Args>
inline typename std::enable_if <
I<sizeof...(Args), void>::type initTuple(std::tuple<Args...>& t) {
init(std::get<I>(t));
initTuple<I + 1>(t);
}
// copy an argument, from src to dest
template <typename T1, typename T2>
void copy(T1& dest, T2& src) {
dest = src;
}
template <>
void copy(GpuMatrix& dest, CpuMatrix& src) {
dest.copyFrom(src);
}
// copy a tuple, from src to dest
template <std::size_t I = 0, typename... Args1, typename... Args2>
inline typename std::enable_if<I == sizeof...(Args1), void>::type copyTuple(
std::tuple<Args1...>& dest, std::tuple<Args2...>& src) {}
template <std::size_t I = 0, typename... Args1, typename... Args2>
inline typename std::enable_if <
I<sizeof...(Args1), void>::type copyTuple(std::tuple<Args1...>& dest,
std::tuple<Args2...>& src) {
copy(std::get<I>(dest), std::get<I>(src));
copyTuple<I + 1>(dest, src);
}
// call member function
template <typename C,
typename FC,
typename R,
typename... FArgs,
typename... Args>
R call(C& obj, R (FC::*f)(FArgs...), Args&&... args) {
return (obj.*f)(args...);
}
template <typename T>
class ReturnType {
public:
typedef T type;
};
template <>
class ReturnType<CpuMatrix> {
public:
typedef GpuMatrix type;
};
template <>
class ReturnType<CpuIVector> {
public:
typedef GpuIVector type;
};
template <>
class ReturnType<CpuSparseMatrix> {
public:
typedef GpuSparseMatrix type;
};
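// autoArgs mirrors a CPU argument onto the GPU: plain values pass through,
// while CPU matrices and vectors are copied into freshly allocated GPU
// counterparts before the GPU member function is called.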
template <typename T>
typename ReturnType<T>::type autoArgs(T& v) {
return v;
}
template <>
GpuMatrix autoArgs(CpuMatrix& v) {
GpuMatrix a(v.getHeight(), v.getWidth());
a.copyFrom(v);
return a;
}
template <>
GpuIVector autoArgs(CpuIVector& v) {
GpuIVector a(v.getSize());
a.copyFrom(v);
return a;
}
template <>
GpuSparseMatrix autoArgs(CpuSparseMatrix& v) {
GpuSparseMatrix a(v.getHeight(),
v.getWidth(),
v.getElementCnt(),
v.getValueType(),
v.getFormat());
a.copyFrom(v, HPPL_STREAM_DEFAULT);
hl_stream_synchronize(HPPL_STREAM_DEFAULT);
return a;
}
class AutoCompare {
public:
/**
* err is the allowed calculation error.
* The smaller the value of err,
* the stricter the comparison is between CPU and GPU calculations.
*/
AutoCompare(size_t height, size_t width, real err = 1e-3)
: cpu(height, width), gpu(height, width), compare(err) {
init(cpu);
copy(gpu, cpu);
}
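  // Run f on the CPU matrix with the given CPU arguments, run it on the GPU
  // matrix with GPU copies of the same arguments, then compare the results.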
template <typename C, typename R, typename... FArgs, typename... Args>
void cmpWithArg(R (C::*f)(FArgs...), Args&&... args) {
static_assert(sizeof...(FArgs) == sizeof...(Args),
"size of parameter packs are not equal");
call(cpu, f, args...);
call(gpu, f, autoArgs(args)...);
TensorCheck(compare, cpu, gpu);
}
template <std::size_t... I, typename C, typename R, typename... Args>
void cmpWithoutArg(R (C::*f)(Args...), size_t height, size_t width) {
static_assert(sizeof...(I) == sizeof...(Args),
"size of parameter packs are not equal");
(void)height;
(void)width;
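    // Build one argument tuple per device: ReplaceType picks CpuMatrix for
    // tuple1 and GpuMatrix for tuple2, element I matching parameter I of f.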
auto tuple1 = std::make_tuple(
construct<typename ReplaceType<
typename std::decay<
typename std::tuple_element<I,
std::tuple<Args...>>::type>::type,
CpuMatrix>::type>(height, width)...);
auto tuple2 = std::make_tuple(
construct<typename ReplaceType<
typename std::decay<
typename std::tuple_element<I,
std::tuple<Args...>>::type>::type,
GpuMatrix>::type>(height, width)...);
initTuple(tuple1);
copyTuple(tuple2, tuple1);
call(cpu, f, std::get<I>(tuple1)...);
call(gpu, f, std::get<I>(tuple2)...);
TensorCheck(compare, cpu, gpu);
}
protected:
CpuMatrix cpu;
GpuMatrix gpu;
AssertEqual compare;
};
} // namespace autotest
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef PADDLE_ONLY_CPU
/**
* This test file uses autotest::AutoCompare and cmpWithoutArg to compare the
* CPU and GPU implementations of the member functions in
* BaseMatrix.cpp and Matrix.cpp.
*/
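// Each TEST below sweeps a grid of matrix shapes and, for every shape,
// compares the CPU and GPU results of one member-function signature.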
#include <gtest/gtest.h>
#include "paddle/math/BaseMatrix.h"
#include "TestUtils.h"
using paddle::BaseMatrix;
using paddle::Matrix;
using autotest::AutoCompare;
// Test all void (BaseMatrix::*)() functions
TEST(BaseMatrix, void) {
for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
auto compare = [height, width](void (BaseMatrix::*f)()) {
AutoCompare test(height, width, 1e-5);
test.cmpWithoutArg(f, height, width);
};
compare(&BaseMatrix::neg);
compare(&BaseMatrix::exp);
compare(&BaseMatrix::log);
compare(&BaseMatrix::sqrt);
compare(&BaseMatrix::square);
compare(&BaseMatrix::reciprocal);
compare(&BaseMatrix::abs);
compare(&BaseMatrix::sign);
compare(&BaseMatrix::zero);
compare(&BaseMatrix::one);
}
}
}
// Test all void (BaseMatrix::*)(real) functions
TEST(BaseMatrix, real) {
for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
auto compare = [height, width](void (BaseMatrix::*f)(real)) {
AutoCompare test(height, width, 1e-5);
test.cmpWithoutArg<0>(f, height, width);
};
compare(&BaseMatrix::pow);
compare(&BaseMatrix::subScalar);
compare(&BaseMatrix::mulScalar);
compare(&BaseMatrix::divScalar);
compare(&BaseMatrix::assign);
compare(&BaseMatrix::add);
compare(&BaseMatrix::biggerThanScalar);
compare(&BaseMatrix::downClip);
}
}
}
// Test all void (BaseMatrix::*)(BaseMatrix&) functions
TEST(BaseMatrix, BaseMatrix) {
for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
auto compare = [height, width](void (BaseMatrix::*f)(BaseMatrix&)) {
AutoCompare test(height, width, 1e-5);
test.cmpWithoutArg<0>(f, height, width);
};
compare(&BaseMatrix::assign);
compare(&BaseMatrix::add);
compare(&BaseMatrix::relu);
compare(&BaseMatrix::reluDerivative);
compare(&BaseMatrix::softrelu);
compare(&BaseMatrix::softreluDerivative);
compare(&BaseMatrix::brelu);
compare(&BaseMatrix::breluDerivative);
compare(&BaseMatrix::square);
compare(&BaseMatrix::squareDerivative);
compare(&BaseMatrix::tanh);
compare(&BaseMatrix::tanhDerivative);
compare(&BaseMatrix::reciprocal);
compare(&BaseMatrix::reciprocalDerivative);
compare(&BaseMatrix::abs);
compare(&BaseMatrix::absDerivative);
compare(&BaseMatrix::sigmoid);
compare(&BaseMatrix::sigmoidDerivative);
compare(&BaseMatrix::expDerivative);
compare(&BaseMatrix::sign);
compare(&BaseMatrix::exp);
compare(&BaseMatrix::log);
compare(&BaseMatrix::sqrt);
compare(&BaseMatrix::dotMul);
compare(&BaseMatrix::dotMulSquare);
compare(&BaseMatrix::dotSquareMul);
compare(&BaseMatrix::addColVector);
compare(&BaseMatrix::addRowVector);
compare(&BaseMatrix::mulRowVector);
compare(&BaseMatrix::divRowVector);
compare(&BaseMatrix::addP2P);
compare(&BaseMatrix::invSqrt);
}
}
}
// Test all void (BaseMatrix::*)(real, real) functions
TEST(BaseMatrix, real_real) {
for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
auto compare = [height, width](void (BaseMatrix::*f)(real, real)) {
AutoCompare test(height, width, 1e-5);
test.cmpWithoutArg<0, 1>(f, height, width);
};
compare(&BaseMatrix::add);
compare(&BaseMatrix::clip);
}
}
}
// Test all void (BaseMatrix::*)(BaseMatrix&, real) functions
TEST(BaseMatrix, BaseMatrix_real) {
for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
auto compare = [height, width](void (BaseMatrix::*f)(BaseMatrix&, real)) {
AutoCompare test(height, width, 1e-5);
test.cmpWithoutArg<0, 1>(f, height, width);
};
compare(&BaseMatrix::addBias);
compare(&BaseMatrix::add);
compare(&BaseMatrix::sub);
compare(&BaseMatrix::pow);
compare(&BaseMatrix::addScalar);
compare(&BaseMatrix::subScalar);
compare(&BaseMatrix::mulScalar);
compare(&BaseMatrix::divScalar);
compare(&BaseMatrix::scalarDiv);
compare(&BaseMatrix::addSquare);
compare(&BaseMatrix::isEqualTo);
}
}
}
// Test all void (BaseMatrix::*)(BaseMatrix&, BaseMatrix&) functions
TEST(BaseMatrix, BaseMatrix_BaseMatrix) {
for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
auto compare = [height,
width](void (BaseMatrix::*f)(BaseMatrix&, BaseMatrix&)) {
AutoCompare test(height, width, 1e-5);
test.cmpWithoutArg<0, 1>(f, height, width);
};
compare(&BaseMatrix::softCrossEntropy);
compare(&BaseMatrix::softCrossEntropyBp);
compare(&BaseMatrix::binaryLabelCrossEntropy);
compare(&BaseMatrix::binaryLabelCrossEntropyBp);
compare(&BaseMatrix::sub);
compare(&BaseMatrix::add2);
compare(&BaseMatrix::dotMul);
compare(&BaseMatrix::dotDiv);
compare(&BaseMatrix::logisticRegressionLoss);
compare(&BaseMatrix::logisticRegressionLossBp);
compare(&BaseMatrix::biggerThan);
compare(&BaseMatrix::max);
compare(&BaseMatrix::dotMulSquare);
compare(&BaseMatrix::dotSquareSquare);
}
}
}
void TestElementWise(size_t height, size_t width) {
AutoCompare rowScale(height, width);
rowScale.cmpWithoutArg<0, 1, 2>(&BaseMatrix::rowScale, height, width);
AutoCompare rowDotMul(height, width);
rowDotMul.cmpWithoutArg<0, 1, 2>(&BaseMatrix::rowDotMul, height, width);
AutoCompare binaryClassificationError(height, width);
binaryClassificationError.cmpWithoutArg<0, 1, 2, 3>(
&BaseMatrix::binaryClassificationError, height, width);
AutoCompare sumOfSquaresBp(height, width);
sumOfSquaresBp.cmpWithoutArg<0, 1>(&Matrix::sumOfSquaresBp, height, width);
}
void TestAggregateToRow(size_t height, size_t width) {
AutoCompare maxCols(1, width);
maxCols.cmpWithoutArg<0>(&BaseMatrix::maxCols, height, width);
AutoCompare minCols(1, width);
minCols.cmpWithoutArg<0>(&BaseMatrix::minCols, height, width);
AutoCompare addDotMulVMM(1, width);
addDotMulVMM.cmpWithoutArg<0, 1>(&BaseMatrix::addDotMulVMM, height, width);
AutoCompare sumCols(1, width);
sumCols.cmpWithoutArg<0, 1, 2>(&BaseMatrix::sumCols, height, width);
AutoCompare collectBias(1, width);
collectBias.cmpWithoutArg<0, 1>(
static_cast<void (Matrix::*)(Matrix&, real)>(&Matrix::collectBias),
height,
width);
}
void TestAggregateToCol(size_t height, size_t width) {
AutoCompare maxRows(height, 1);
maxRows.cmpWithoutArg<0>(&BaseMatrix::maxRows, height, width);
AutoCompare minRows(height, 1);
minRows.cmpWithoutArg<0>(&BaseMatrix::minRows, height, width);
AutoCompare sumRows(height, 1);
sumRows.cmpWithoutArg<0, 1, 2>(&BaseMatrix::sumRows, height, width);
AutoCompare sumOfSquares(height, 1);
sumOfSquares.cmpWithoutArg<0, 1>(&Matrix::sumOfSquares, height, width);
}
TEST(BaseMatrix, Other) {
for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
TestElementWise(height, width);
TestAggregateToRow(height, width);
TestAggregateToCol(height, width);
}
}
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
paddle::initMain(argc, argv);
return RUN_ALL_TESTS();
}
#endif
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef PADDLE_ONLY_CPU
/**
* This test file uses autotest::AutoCompare and cmpWithArg to compare the
* CPU and GPU implementations of the member functions in Matrix.cpp.
*/
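// Unlike test_BaseMatrix.cpp, the tests here construct the CPU arguments
// explicitly and pass them through cmpWithArg, which mirrors them onto the
// GPU via autoArgs.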
#include <gtest/gtest.h>
#include "TestUtils.h"
using paddle::BaseMatrix;
using paddle::Matrix;
using paddle::CpuMatrix;
using paddle::CpuIVector;
using paddle::CpuSparseMatrix;
using autotest::AutoCompare;
void testBilinearFwdBwd(int numSamples,
int imgSizeH,
int imgSizeW,
int channels) {
int inWidth = imgSizeH * imgSizeW * channels;
int outWidth = 2 * imgSizeH * 2 * imgSizeW * channels;
real ratioH = 0.5;
real ratioW = 0.5;
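  // ratio = input size / output size; 0.5 matches the 2x upsampling below.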
AutoCompare forward(numSamples, outWidth);
CpuMatrix arg1(numSamples, inWidth);
arg1.randomizeUniform();
forward.cmpWithArg(&Matrix::bilinearForward,
arg1,
imgSizeH,
imgSizeW,
2 * imgSizeH,
2 * imgSizeW,
channels,
ratioH,
ratioW);
AutoCompare backward(numSamples, inWidth);
CpuMatrix arg2(numSamples, outWidth);
arg2.randomizeUniform();
backward.cmpWithArg(&Matrix::bilinearBackward,
arg2,
2 * imgSizeH,
2 * imgSizeW,
imgSizeH,
imgSizeW,
channels,
ratioH,
ratioW);
}
TEST(Matrix, BilinearFwdBwd) {
for (auto numSamples : {5, 10}) {
for (auto channels : {8, 16}) {
for (auto imgSizeH : {14, 28}) {
for (auto imgSizeW : {16, 30}) {
VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
<< " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
testBilinearFwdBwd(numSamples, imgSizeH, imgSizeW, channels);
}
}
}
}
}
void testMatrixAddBias(int height, int width, real scale) {
AutoCompare test(height, width);
CpuMatrix arg1(1, width);
arg1.randomizeUniform();
test.cmpWithArg(
static_cast<void (Matrix::*)(Matrix&, real)>(&Matrix::addBias),
arg1,
scale);
}
void testMatrixAddDotMulMMV(int height, int width) {
AutoCompare test(height, width);
CpuMatrix arg1(height, width);
CpuMatrix arg2(1, width);
arg1.randomizeUniform();
arg2.randomizeUniform();
test.cmpWithArg(&BaseMatrix::addDotMulMMV, arg1, arg2);
}
TEST(Matrix, unary) {
for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
VLOG(3) << " height=" << height << " width=" << width;
testMatrixAddBias(height, width, 1.0);
testMatrixAddBias(height, width, 3.5);
testMatrixAddDotMulMMV(height, width);
}
}
}
void testMatrixAddAtOffset(int height, int width1, int width2, int offset) {
AutoCompare test(height, width2);
CpuMatrix arg1(height, width1);
arg1.randomizeUniform();
test.cmpWithArg(&Matrix::addAtOffset, arg1, offset);
}
void testMatrixAssignAtOffset(int height, int width1, int width2, int offset) {
AutoCompare test(height, width2);
CpuMatrix arg1(height, width1);
arg1.randomizeUniform();
test.cmpWithArg(&Matrix::assignAtOffset, arg1, offset);
}
TEST(Matrix, AtOffset) {
for (auto height : {1, 11, 73, 128, 200}) {
for (auto width1 : {1, 32, 100, 512, 1000}) {
for (auto width2 : {1, 32, 100, 512, 1000}) {
int columnOffset = 0;
int offset = std::abs(width1 - width2);
if (offset) {
columnOffset = std::rand() % offset;
}
VLOG(3) << " height=" << height << " width1=" << width1
<< " width2=" << width2 << " columnOffset = " << columnOffset;
testMatrixAddAtOffset(height, width1, width2, columnOffset);
testMatrixAssignAtOffset(height, width1, width2, columnOffset);
}
}
}
}
void testMatrixSelectRows(int numSamples, int tableSize, int inputDim) {
AutoCompare test(numSamples, inputDim);
CpuMatrix arg1(tableSize, inputDim);
CpuIVector arg2(numSamples);
arg1.randomizeUniform();
arg2.rand(tableSize);
test.cmpWithArg(&Matrix::selectRows, arg1, arg2);
}
TEST(Matrix, tableProjection) {
for (auto numSamples : {10, 100, 1000, 10000, 80000}) {
for (auto tableSize : {10, 100}) {
for (auto inputDim : {20, 50}) {
VLOG(3) << " numSamples=" << numSamples << " tableSize=" << tableSize
<< " inputDim=" << inputDim;
testMatrixSelectRows(numSamples, tableSize, inputDim);
}
}
}
}
void testMatrixCopyByRowIndex(int outHeight, int inHeight, int width) {
AutoCompare test(outHeight, width);
CpuMatrix arg1(inHeight, width);
CpuIVector arg2(outHeight);
arg1.randomizeUniform();
arg2.rand(inHeight);
test.cmpWithArg(&Matrix::copyByRowIndex, arg1, arg2);
}
TEST(Matrix, copyByRowIndex) {
for (auto outHeight : {31, 500, 1000}) {
for (auto inHeight : {17, 257, 500, 1200}) {
for (auto width : {512, 1024}) {
VLOG(3) << outHeight << " " << inHeight << " " << width;
testMatrixCopyByRowIndex(outHeight, inHeight, width);
}
}
}
}
void testCosSim(int heightX, int heightY, int width, real scale) {
AutoCompare test(heightX, 1);
CpuMatrix arg1(heightX, width);
CpuMatrix arg2(heightY, width);
arg1.randomizeUniform();
arg2.randomizeUniform();
arg2.add(-0.5);
test.cmpWithArg(&Matrix::cosSim, arg1, arg2, scale);
}
TEST(Matrix, cosSim) {
for (auto heightX : {10, 100, 1000}) {
for (auto heightY : {1, heightX}) {
for (auto width : {10, 100, 1000}) {
for (auto scale : {1.0, 2.0}) {
testCosSim(heightX, heightY, width, scale);
}
}
}
}
}
void testParamReluForward(int height, int width, int w_height, int w_width) {
AutoCompare test(height, width);
CpuMatrix arg1(height, width);
CpuMatrix arg2(w_height, w_width);
arg1.randomizeUniform();
arg2.randomizeUniform();
arg1.add(-0.5);
test.cmpWithArg(&Matrix::paramReluForward, arg1, arg2);
}
void testParamReluBackwardW(int height, int width, int w_height, int w_width) {
AutoCompare test(w_height, w_width);
CpuMatrix arg1(height, width);
CpuMatrix arg2(height, width);
arg1.randomizeUniform();
arg2.randomizeUniform();
arg2.add(-0.5);
test.cmpWithArg(&Matrix::paramReluBackwardW, arg1, arg2);
}
TEST(Matrix, paramRelu) {
for (auto height : {10, 100}) {
for (auto width : {10, 100}) {
for (auto w_height : {1, 2}) {
for (auto w_width : {1, 2}) {
testParamReluForward(height, width, w_height, w_width);
testParamReluBackwardW(height, width, w_height, w_width);
}
}
}
}
}
void testAddSharedBias(int numSamples, int dim, int channel) {
AutoCompare test(numSamples, dim);
CpuMatrix arg1(1, channel);
arg1.randomizeUniform();
test.cmpWithArg(&Matrix::addSharedBias, arg1, 1.0);
}
void testCollectSharedBias(int numSamples, int dim, int channel) {
AutoCompare test(1, channel);
CpuMatrix arg1(numSamples, dim);
arg1.randomizeUniform();
test.cmpWithArg(&Matrix::collectSharedBias, arg1, 1.0);
}
TEST(Matrix, sharedBias) {
for (auto numSamples : {1, 100, 520}) {
for (auto dim : {100 * 16, 100 * 32}) {
for (auto channel : {8, 16}) {
VLOG(3) << " numSamples=" << numSamples << " dim=" << dim
<< " channel=" << channel;
testAddSharedBias(numSamples, dim, channel);
testCollectSharedBias(numSamples, dim, channel);
}
}
}
}
void testMultiBinaryLabelCrossEntropy(int numSamples, int dim) {
AutoCompare forward(numSamples, 1);
CpuMatrix arg1(numSamples, dim);
CpuSparseMatrix arg2(
numSamples, dim, numSamples, paddle::NO_VALUE, paddle::SPARSE_CSR);
CpuMatrix output1(numSamples, dim);
output1.randomizeUniform();
output1.softmax(arg1);
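  // Give each sample exactly one random positive label (sparse CSR, NO_VALUE).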
for (int i = 0; i < numSamples; i++) {
const unsigned int id = std::rand() % dim;
arg2.setRow(i, 1, &id, nullptr);
}
forward.cmpWithArg(&Matrix::multiBinaryLabelCrossEntropy, arg1, arg2);
AutoCompare backward(numSamples, dim);
backward.cmpWithArg(&Matrix::multiBinaryLabelCrossEntropyBp, arg1, arg2);
}
TEST(Matrix, multiBinaryCrossEntropy) {
for (auto numSamples : {100, 1000, 10000}) {
for (auto dim : {100, 1000, 10000}) {
VLOG(3) << " numSamples=" << numSamples << " dim=" << dim;
testMultiBinaryLabelCrossEntropy(numSamples, dim);
}
}
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
paddle::initMain(argc, argv);
return RUN_ALL_TESTS();
}
#endif
......@@ -22,163 +22,12 @@ limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/gserver/tests/TestUtil.h"
#include "paddle/utils/Stat.h"
#include "TensorCheck.h"
using namespace paddle; // NOLINT
using namespace std; // NOLINT
template <class T>
void VectorCheckEqual(const VectorT<T>& vector1, const VectorT<T>& vector2) {
CHECK(vector1.getSize() == vector2.getSize());
const T* data1 = vector1.getData();
const T* data2 = vector2.getData();
size_t size = vector1.getSize();
int count = 0;
for (size_t i = 0; i < size; i++) {
if (data1[i] != data2[i]) {
count++;
}
}
EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
}
void MatrixCheckEqual(const Matrix& matrix1, const Matrix& matrix2) {
CHECK(matrix1.getHeight() == matrix2.getHeight());
CHECK(matrix1.getWidth() == matrix2.getWidth());
int height = matrix1.getHeight();
int width = matrix1.getWidth();
const real* data1 = matrix1.getData();
const real* data2 = matrix2.getData();
int count = 0;
for (int i = 0; i < height; i++) {
for (int j = 0; j < width; j++) {
if (data1[i * width + j] != data2[i * width + j]) {
count++;
}
}
}
EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
}
void MatrixCheckErr(const Matrix& matrix1, const Matrix& matrix2) {
CHECK(matrix1.getHeight() == matrix2.getHeight());
CHECK(matrix1.getWidth() == matrix2.getWidth());
#ifndef PADDLE_TYPE_DOUBLE
real err = 1e-3;
#else
real err = 1e-10;
#endif
int height = matrix1.getHeight();
int width = matrix1.getWidth();
const real* data1 = matrix1.getData();
const real* data2 = matrix2.getData();
int count = 0;
for (int i = 0; i < height; i++) {
for (int j = 0; j < width; j++) {
real a = data1[i * width + j];
real b = data2[i * width + j];
if (fabs(a - b) > err) {
if ((fabsf(a - b) / fabsf(a)) > (err / 10.0f)) {
count++;
}
}
}
}
EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
}
void testBilinearFwdBwd(int numSamples,
int imgSizeH,
int imgSizeW,
int channels) {
int inWidth = imgSizeH * imgSizeW * channels;
int outWidth = 2 * imgSizeH * 2 * imgSizeW * channels;
real ratioH = 0.5;
real ratioW = 0.5;
// forward
MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
input->randomizeUniform();
inputGpu->copyFrom(*input);
target->bilinearForward(*input,
imgSizeH,
imgSizeW,
2 * imgSizeH,
2 * imgSizeW,
channels,
ratioH,
ratioW);
targetGpu->bilinearForward(*inputGpu,
imgSizeH,
imgSizeW,
2 * imgSizeH,
2 * imgSizeW,
channels,
ratioH,
ratioW);
// check
targetCheck->copyFrom(*targetGpu);
MatrixCheckErr(*target, *targetCheck);
// backward
MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
MatrixPtr targetGpuGrad =
GpuMatrix::create(numSamples, outWidth, false, true);
MatrixPtr targetCheckGrad =
CpuMatrix::create(numSamples, inWidth, false, false);
inputGrad->randomizeUniform();
targetGrad->randomizeUniform();
inputGpuGrad->copyFrom(*inputGrad);
targetGpuGrad->copyFrom(*targetGrad);
inputGrad->bilinearBackward(*targetGrad,
2 * imgSizeH,
2 * imgSizeW,
imgSizeH,
imgSizeW,
channels,
ratioH,
ratioW);
inputGpuGrad->bilinearBackward(*targetGpuGrad,
2 * imgSizeH,
2 * imgSizeW,
imgSizeH,
imgSizeW,
channels,
ratioH,
ratioW);
// check
targetCheckGrad->copyFrom(*inputGpuGrad);
MatrixCheckErr(*inputGrad, *targetCheckGrad);
}
TEST(Matrix, BilinearFwdBwd) {
for (auto numSamples : {5, 10}) {
for (auto channels : {8, 16}) {
for (auto imgSizeH : {14, 28}) {
for (auto imgSizeW : {16, 30}) {
VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
<< " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
testBilinearFwdBwd(numSamples, imgSizeH, imgSizeW, channels);
}
}
}
}
}
using autotest::TensorCheckEqual;
using autotest::TensorCheckErr;
void testMatrixProjectionForward(int contextStart,
int contextLength,
......@@ -232,12 +81,7 @@ void testMatrixProjectionForward(int contextStart,
beginPad,
padding);
// check
MatrixPtr outputCheck =
std::make_shared<CpuMatrix>(batchSize, inputDim * contextLength);
outputCheck->copyFrom(*gpuOutput);
MatrixCheckEqual(*cpuOutput, *outputCheck);
TensorCheckEqual(*cpuOutput, *gpuOutput);
}
void testMatrixProjectionBackward(int contextStart,
......@@ -294,15 +138,9 @@ void testMatrixProjectionBackward(int contextStart,
beginPad);
}
// check
MatrixPtr inputGradCheck = std::make_shared<CpuMatrix>(batchSize, inputDim);
inputGradCheck->copyFrom(*gpuInputGrad);
MatrixCheckErr(*cpuInputGrad, *inputGradCheck);
TensorCheckErr(*cpuInputGrad, *gpuInputGrad);
if (padding) {
MatrixPtr weightGradChcek = std::make_shared<CpuMatrix>(pad, inputDim);
weightGradChcek->copyFrom(*gpuWeightGrad);
MatrixCheckErr(*cpuWeightGrad, *weightGradChcek);
TensorCheckErr(*cpuWeightGrad, *gpuWeightGrad);
}
}
......@@ -361,15 +199,8 @@ void testMatrixMaxSequence(int batchSize, int inputDim) {
cpuOutput->maxSequenceForward(*cpuInput, *cpuSequence, *cpuIndex);
gpuOutput->maxSequenceForward(*gpuInput, *gpuSequence, *gpuIndex);
// check
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(newBatchSize, inputDim);
outputCheck->copyFrom(*gpuOutput);
MatrixCheckEqual(*cpuOutput, *outputCheck);
IVectorPtr indexCheck = nullptr;
IVector::resizeOrCreate(indexCheck, newBatchSize * inputDim, false);
indexCheck->copyFrom(*gpuIndex);
VectorCheckEqual(*cpuIndex, *indexCheck);
TensorCheckEqual(*cpuOutput, *gpuOutput);
TensorCheckEqual(*cpuIndex, *gpuIndex);
// backward
MatrixPtr cpuOutputGrad = std::make_shared<CpuMatrix>(newBatchSize, inputDim);
......@@ -385,10 +216,7 @@ void testMatrixMaxSequence(int batchSize, int inputDim) {
cpuInputGrad->maxSequenceBackward(*cpuOutputGrad, *cpuSequence, *cpuIndex);
gpuInputGrad->maxSequenceBackward(*gpuOutputGrad, *gpuSequence, *gpuIndex);
// check
MatrixPtr inputGradCheck = std::make_shared<CpuMatrix>(batchSize, inputDim);
inputGradCheck->copyFrom(*gpuInputGrad);
MatrixCheckEqual(*cpuInputGrad, *inputGradCheck);
TensorCheckEqual(*cpuInputGrad, *gpuInputGrad);
}
TEST(Matrix, maxSequence) {
......@@ -431,6 +259,8 @@ void testMatrixZeroAtOffset(int height, int width) {
int columnOffset = rand() % width; // NOLINT we just use rand() for testing.
int numColumns = rand() % (width - columnOffset); // NOLINT
if (numColumns == 0) return;
cpuA->zeroAtOffset(columnOffset, numColumns);
gpuA->zeroAtOffset(columnOffset, numColumns);
......@@ -442,10 +272,8 @@ void testMatrixZeroAtOffset(int height, int width) {
}
}
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
outputCheck->copyFrom(*gpuA);
MatrixCheckEqual(*cpuA, *outputCheck);
MatrixCheckEqual(*cpuA, *cpuTest);
TensorCheckEqual(*cpuA, *gpuA);
TensorCheckEqual(*cpuA, *cpuTest);
}
void testMatrixDeepSwap(int height, int width) {
......@@ -462,303 +290,8 @@ void testMatrixDeepSwap(int height, int width) {
// swap matrix cpuA and cpuB
cpuA->deepSwap(*cpuB);
MatrixCheckEqual(*cpuA, *cpuCopyB);
MatrixCheckEqual(*cpuB, *cpuCopyA);
}
void testMatrixBinaryAdd(int height, int width) {
MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
cpuA->randomizeUniform();
cpuB->randomizeUniform();
gpuA->copyFrom(*cpuA);
gpuB->copyFrom(*cpuB);
cpuA->add(*cpuB);
gpuA->add(*gpuB);
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
outputCheck->copyFrom(*gpuA);
MatrixCheckEqual(*cpuA, *outputCheck);
}
void testMatrixAssign(int height, int width) {
MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
cpuA->randomizeUniform();
gpuA->copyFrom(*cpuA);
cpuA->assign(2.5);
gpuA->assign(2.5);
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
outputCheck->copyFrom(*gpuA);
MatrixCheckEqual(*cpuA, *outputCheck);
}
void testMatrixAdd(int height, int width) {
MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
cpuA->randomizeUniform();
gpuA->copyFrom(*cpuA);
cpuA->add(2.5);
gpuA->add(2.5);
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
outputCheck->copyFrom(*gpuA);
MatrixCheckEqual(*cpuA, *outputCheck);
}
void testMatrixSqrt(int height, int width) {
MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
cpuA->randomizeUniform();
gpuA->copyFrom(*cpuA);
cpuA->sqrt();
gpuA->sqrt();
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
outputCheck->copyFrom(*gpuA);
MatrixCheckErr(*cpuA, *outputCheck);
}
void testMatrixTanhDerivative(int height, int width) {
MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
cpuA->randomizeUniform();
cpuB->randomizeUniform();
gpuA->copyFrom(*cpuA);
gpuB->copyFrom(*cpuB);
cpuA->tanhDerivative(*cpuB);
gpuA->tanhDerivative(*gpuB);
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
outputCheck->copyFrom(*gpuA);
MatrixCheckErr(*cpuA, *outputCheck);
}
void testMatrixTanh(int height, int width) {
MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
cpuA->randomizeUniform();
cpuB->randomizeUniform();
gpuA->copyFrom(*cpuA);
gpuB->copyFrom(*cpuB);
cpuA->tanh(*cpuB);
gpuA->tanh(*gpuB);
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
outputCheck->copyFrom(*gpuA);
MatrixCheckErr(*cpuA, *outputCheck);
}
void testMatrixTernarySub(int height, int width) {
MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuC = std::make_shared<CpuMatrix>(height, width);
MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
MatrixPtr gpuC = std::make_shared<GpuMatrix>(height, width);
cpuA->randomizeUniform();
cpuB->randomizeUniform();
cpuC->randomizeUniform();
gpuA->copyFrom(*cpuA);
gpuB->copyFrom(*cpuB);
gpuC->copyFrom(*cpuC);
cpuA->sub(*cpuB, *cpuC);
gpuA->sub(*gpuB, *gpuC);
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
outputCheck->copyFrom(*gpuA);
MatrixCheckEqual(*cpuA, *outputCheck);
}
void testMatrixSumOfSquaresBp(int height, int width) {
MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuC = std::make_shared<CpuMatrix>(height, width);
MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
MatrixPtr gpuC = std::make_shared<GpuMatrix>(height, width);
cpuA->randomizeUniform();
cpuB->randomizeUniform();
cpuC->randomizeUniform();
gpuA->copyFrom(*cpuA);
gpuB->copyFrom(*cpuB);
gpuC->copyFrom(*cpuC);
cpuA->sumOfSquaresBp(*cpuB, *cpuC);
gpuA->sumOfSquaresBp(*gpuB, *gpuC);
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
outputCheck->copyFrom(*gpuA);
MatrixCheckErr(*cpuA, *outputCheck);
}
void testMatrixBinaryRowScale(int height, int width) {
MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, 1);
MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, 1);
MatrixPtr cpuA1 = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuB1 = std::make_shared<CpuMatrix>(height, 1);
MatrixPtr gpuA1 = std::make_shared<GpuMatrix>(height, width);
MatrixPtr gpuB1 = std::make_shared<GpuMatrix>(height, 1);
cpuA->randomizeUniform();
cpuB->randomizeUniform();
gpuA->copyFrom(*cpuA);
gpuB->copyFrom(*cpuB);
cpuA1->copyFrom(*cpuA);
cpuB1->copyFrom(*cpuB);
gpuA1->copyFrom(*cpuA);
gpuB1->copyFrom(*cpuB);
cpuA->addColVector(*cpuB);
gpuA->addColVector(*gpuB);
cpuA1->addColumnVector(*cpuB1);
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
outputCheck->copyFrom(*gpuA);
MatrixCheckEqual(*cpuA, *outputCheck);
MatrixCheckEqual(*cpuA, *cpuA1);
}
void testMatrixAddBias(int height, int width, real scale) {
MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuB = std::make_shared<CpuMatrix>(1, width);
MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
MatrixPtr gpuB = std::make_shared<GpuMatrix>(1, width);
cpuA->randomizeUniform();
cpuB->randomizeUniform();
gpuA->copyFrom(*cpuA);
gpuB->copyFrom(*cpuB);
cpuA->addBias(*cpuB, scale);
gpuA->addBias(*gpuB, scale);
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
outputCheck->copyFrom(*gpuA);
MatrixCheckErr(*cpuA, *outputCheck);
}
void testMatrixTernaryRowScale(int height, int width) {
MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuC = std::make_shared<CpuMatrix>(height, width);
MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
MatrixPtr gpuC = std::make_shared<GpuMatrix>(height, width);
MatrixPtr cpuA1 = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuB1 = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuC1 = std::make_shared<CpuMatrix>(height, width);
cpuA->randomizeUniform();
cpuB->randomizeUniform();
cpuC->randomizeUniform();
gpuA->copyFrom(*cpuA);
gpuB->copyFrom(*cpuB);
gpuC->copyFrom(*cpuC);
cpuA1->copyFrom(*cpuA);
cpuB1->copyFrom(*cpuB);
cpuC1->copyFrom(*cpuC);
int columnOffset = rand() % width; // NOLINT
cpuA->rowScale(columnOffset, *cpuB, *cpuC);
gpuA->rowScale(columnOffset, *gpuB, *gpuC);
cpuA1->rowScale2(columnOffset, *cpuB1, *cpuC1);
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
outputCheck->copyFrom(*gpuA);
MatrixCheckEqual(*cpuA, *outputCheck);
MatrixCheckEqual(*cpuA, *cpuA1);
}
void testMatrixTernaryRowDotMul(int height, int width) {
MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuC = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuA1 = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuB1 = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuC1 = std::make_shared<CpuMatrix>(height, width);
MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
MatrixPtr gpuC = std::make_shared<GpuMatrix>(height, width);
cpuA->randomizeUniform();
cpuB->randomizeUniform();
cpuC->randomizeUniform();
cpuA1->copyFrom(*cpuA);
cpuB1->copyFrom(*cpuB);
cpuC1->copyFrom(*cpuC);
gpuA->copyFrom(*cpuA);
gpuB->copyFrom(*cpuB);
gpuC->copyFrom(*cpuC);
int columnOffset = rand() % width; // NOLINT
cpuA->rowDotMul(columnOffset, *cpuB, *cpuC);
gpuA->rowDotMul(columnOffset, *gpuB, *gpuC);
cpuA1->rowDotMul2(columnOffset, *cpuB1, *cpuC1);
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
outputCheck->copyFrom(*gpuA);
MatrixCheckErr(*cpuA, *cpuA1);
MatrixCheckErr(*cpuA, *outputCheck);
}
void testMatrixAddDotMulMMV(int height, int width) {
MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuC = std::make_shared<CpuMatrix>(1, width);
MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
MatrixPtr gpuC = std::make_shared<GpuMatrix>(1, width);
MatrixPtr cpuA1 = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuB1 = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuC1 = std::make_shared<CpuMatrix>(1, width);
cpuA->randomizeUniform();
cpuB->randomizeUniform();
cpuC->randomizeUniform();
gpuA->copyFrom(*cpuA);
gpuB->copyFrom(*cpuB);
gpuC->copyFrom(*cpuC);
cpuA1->copyFrom(*cpuA);
cpuB1->copyFrom(*cpuB);
cpuC1->copyFrom(*cpuC);
cpuA->addDotMulMMV(*cpuB, *cpuC);
gpuA->addDotMulMMV(*gpuB, *gpuC);
cpuA1->addDotMulMMV2(*cpuB1, *cpuC1);
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
outputCheck->copyFrom(*gpuA);
MatrixCheckErr(*cpuA, *outputCheck);
MatrixCheckEqual(*cpuA, *cpuA1);
TensorCheckEqual(*cpuA, *cpuCopyB);
TensorCheckEqual(*cpuB, *cpuCopyA);
}
void testMatrixTranspose(int height, int width) {
......@@ -772,9 +305,7 @@ void testMatrixTranspose(int height, int width) {
cpu->transpose(cpuT, false);
gpu->transpose(gpuT, false);
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(width, height);
outputCheck->copyFrom(*gpuT);
MatrixCheckEqual(*cpuT, *outputCheck);
TensorCheckEqual(*cpuT, *gpuT);
}
void testMatrixInverse(int height) {
......@@ -795,530 +326,127 @@ void testMatrixInverse(int height) {
cpu->inverse(cpuI, false);
gpu->inverse(gpuI, false);
outputCheck->copyFrom(*gpuI);
MatrixCheckErr(*cpuI, *outputCheck);
TensorCheckErr(*cpuI, *gpuI);
outputCheck->mul(cpu, cpuI);
cpu->setDiag(1.0);
MatrixCheckErr(*cpu, *outputCheck);
}
TEST(Matrix, unary) {
for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
VLOG(3) << " height=" << height << " width=" << width;
// applyUnary
testMatrixAssign(height, width);
testMatrixAdd(height, width);
testMatrixSqrt(height, width);
// applyBinary
testMatrixBinaryAdd(height, width);
testMatrixTanh(height, width);
testMatrixTanhDerivative(height, width);
testMatrixDeepSwap(height, width);
// applyTernary
testMatrixTernarySub(height, width);
testMatrixSumOfSquaresBp(height, width);
// asRowVector
testMatrixAddBias(height, width, 1.0);
testMatrixAddBias(height, width, 3.5);
testMatrixAddDotMulMMV(height, width);
// asColVector
testMatrixTernaryRowScale(height, width);
testMatrixBinaryRowScale(height, width);
// sum
testMatrixGetSum(height, width);
// transpose
testMatrixTranspose(height, width);
}
// inverse
testMatrixInverse(height);
}
}
void testMatrixSoftmax(int height, int width) {
MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width);
MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width);
cpuInput->randomizeUniform();
gpuInput->copyFrom(*cpuInput);
cpuOutput->zero();
gpuOutput->zero();
cpuInput->softmax(*cpuOutput);
gpuInput->softmax(*gpuOutput);
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
outputCheck->copyFrom(*gpuOutput);
MatrixCheckErr(*cpuOutput, *outputCheck);
}
void testSequenceSoftmax(int batchSize) {
// forward
int inputDim = 1;
MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
cpuInput->randomizeUniform();
gpuInput->copyFrom(*cpuInput);
IVectorPtr cpuSequence;
generateSequenceStartPositions(batchSize, cpuSequence);
IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
gpuSequence->copyFrom(*cpuSequence);
cpuInput->sequenceSoftmax(*cpuInput, *cpuSequence);
gpuInput->sequenceSoftmax(*gpuInput, *gpuSequence);
// check
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(batchSize, inputDim);
outputCheck->copyFrom(*gpuInput);
MatrixCheckErr(*cpuInput, *outputCheck);
}
void testMatrixSoftmaxThreshold(int height, int width) {
MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width);
MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width);
cpuInput->randomizeUniform();
cpuInput->getData()[0] = 100.0;
gpuInput->copyFrom(*cpuInput);
cpuOutput->zero();
gpuOutput->zero();
cpuInput->softmax(*cpuOutput);
gpuInput->softmax(*gpuOutput);
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
outputCheck->copyFrom(*gpuOutput);
// verify that no softmax output value is exactly zero
int cpuCount = 0;
int gpuCount = 0;
auto zeroNum = [](MatrixPtr out, int& count) {
for (size_t i = 0; i < out->getHeight(); i++) {
for (size_t j = 0; j < out->getWidth(); j++) {
if (out->getElement(i, j) == 0) count++;
}
}
};
zeroNum(cpuOutput, cpuCount);
zeroNum(outputCheck, gpuCount);
EXPECT_EQ(cpuCount, 0) << "CPU softmax output contains zero values";
EXPECT_EQ(gpuCount, 0) << "GPU softmax output contains zero values";
}
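// Checks the softmax gradient: the GPU path uses the fused softmaxBackward,
// while the CPU path below appears to compute dx = y .* (dy - sum(y .* dy))
// explicitly via dotMul, colMerge, and softmaxDerivative.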
void testMatrixSoftmaxBp(int height, int width) {
MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width);
MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width);
cpuInput->randomizeUniform();
gpuInput->copyFrom(*cpuInput);
cpuOutput->randomizeUniform();
gpuOutput->copyFrom(*cpuOutput);
gpuOutput->softmaxBackward(*gpuInput);
MatrixPtr sftMaxSum = std::make_shared<CpuMatrix>(height, 1);
MatrixPtr sftMaxDot = std::make_shared<CpuMatrix>(height, width);
sftMaxDot->dotMul(*cpuOutput, *cpuInput);
sftMaxSum->colMerge(*sftMaxDot);
cpuOutput->softmaxDerivative(*cpuInput, *sftMaxSum);
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
outputCheck->copyFrom(*gpuOutput);
MatrixCheckErr(*cpuOutput, *outputCheck);
}
TEST(Matrix, softmax) {
for (auto height : {1, 11, 73, 128, 200}) {
for (auto width : {1, 32, 100, 512, 1000}) {
VLOG(3) << " height=" << height << " width=" << width;
testMatrixSoftmax(height, width);
testMatrixSoftmaxBp(height, width);
testMatrixSoftmaxThreshold(height, width);
}
testSequenceSoftmax(height);
}
}
void testMatrixAddDotMulVMM(int height, int width, int endCol = 0) {
MatrixPtr cpuA = std::make_shared<CpuMatrix>(1, width);
MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuC = std::make_shared<CpuMatrix>(height, width);
MatrixPtr gpuA = std::make_shared<GpuMatrix>(1, width);
MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
MatrixPtr gpuC = std::make_shared<GpuMatrix>(height, width);
MatrixPtr cpuA1 = std::make_shared<CpuMatrix>(1, width);
MatrixPtr cpuB1 = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuC1 = std::make_shared<CpuMatrix>(height, width);
cpuA->randomizeUniform();
cpuB->randomizeUniform();
cpuC->randomizeUniform();
gpuA->copyFrom(*cpuA);
gpuB->copyFrom(*cpuB);
gpuC->copyFrom(*cpuC);
cpuA1->copyFrom(*cpuA);
cpuB1->copyFrom(*cpuB);
cpuC1->copyFrom(*cpuC);
if (!endCol) {
cpuA->addDotMulVMM(*cpuB, *cpuC);
gpuA->addDotMulVMM(*gpuB, *gpuC);
cpuA1->addDotMulVMM2(*cpuB1, *cpuC1);
MatrixCheckErr(*cpuA, *cpuA1);
} else {
MatrixPtr subCpuA = cpuA->subColMatrix(0, endCol);
MatrixPtr subCpuB = cpuB->subColMatrix(0, endCol);
MatrixPtr subCpuC = cpuC->subColMatrix(0, endCol);
MatrixPtr subGpuA = gpuA->subColMatrix(0, endCol);
MatrixPtr subGpuB = gpuB->subColMatrix(0, endCol);
MatrixPtr subGpuC = gpuC->subColMatrix(0, endCol);
subCpuA->addDotMulVMM(*subCpuB, *subCpuC);
subGpuA->addDotMulVMM(*subGpuB, *subGpuC);
}
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(1, width);
outputCheck->copyFrom(*gpuA);
MatrixCheckErr(*cpuA, *outputCheck);
}
void testMatrixRowSum(int height, int width) {
MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, 1);
MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, 1);
MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
MatrixPtr cpuA1 = std::make_shared<CpuMatrix>(height, 1);
MatrixPtr cpuB1 = std::make_shared<CpuMatrix>(height, width);
MatrixPtr gpuA1 = std::make_shared<GpuMatrix>(height, 1);
MatrixPtr gpuB1 = std::make_shared<GpuMatrix>(height, width);
cpuA->randomizeUniform();
cpuB->randomizeUniform();
gpuA->copyFrom(*cpuA);
gpuB->copyFrom(*cpuB);
cpuA1->copyFrom(*cpuA);
cpuB1->copyFrom(*cpuB);
gpuA1->copyFrom(*cpuA);
gpuB1->copyFrom(*cpuB);
cpuA->colMerge(*cpuB);
gpuA->colMerge(*gpuB);
cpuB1->rowSum(*cpuA1);
gpuB1->rowSum(*gpuA1);
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, 1);
outputCheck->copyFrom(*gpuA);
MatrixCheckErr(*cpuA, *outputCheck);
outputCheck->copyFrom(*gpuA1);
MatrixCheckErr(*cpuA1, *outputCheck);
}
void testMatrixRowMax(int height, int width, int endCol = 0) {
MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, 1);
MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, 1);
MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
cpuA->randomizeUniform();
cpuB->randomizeUniform();
gpuA->copyFrom(*cpuA);
gpuB->copyFrom(*cpuB);
if (!endCol) {
cpuB->rowMax(*cpuA);
gpuB->rowMax(*gpuA);
} else {
MatrixPtr subCpuB = cpuB->subColMatrix(0, endCol);
MatrixPtr subGpuB = gpuB->subColMatrix(0, endCol);
subCpuB->rowMax(*cpuA);
subGpuB->rowMax(*gpuA);
}
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, 1);
outputCheck->copyFrom(*gpuA);
MatrixCheckErr(*cpuA, *outputCheck);
}
void testMatrixColSum(int height, int width, int endCol = 0) {
MatrixPtr cpuA = std::make_shared<CpuMatrix>(1, width);
MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
MatrixPtr gpuA = std::make_shared<GpuMatrix>(1, width);
MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
cpuA->randomizeUniform();
cpuB->randomizeUniform();
gpuA->copyFrom(*cpuA);
gpuB->copyFrom(*cpuB);
if (!endCol) {
cpuA->accumulateColSum(*cpuB);
gpuA->accumulateColSum(*gpuB);
} else {
MatrixPtr subCpuA = cpuA->subColMatrix(0, endCol);
MatrixPtr subGpuA = gpuA->subColMatrix(0, endCol);
MatrixPtr subCpuB = cpuB->subColMatrix(0, endCol);
MatrixPtr subGpuB = gpuB->subColMatrix(0, endCol);
subCpuA->accumulateColSum(*subCpuB);
subGpuA->accumulateColSum(*subGpuB);
}
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(1, width);
outputCheck->copyFrom(*gpuA);
MatrixCheckErr(*cpuA, *outputCheck);
}
void testMatrixColMax(int height, int width, int endCol = 0) {
MatrixPtr cpuA = std::make_shared<CpuMatrix>(1, width);
MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
MatrixPtr gpuA = std::make_shared<GpuMatrix>(1, width);
MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
cpuA->randomizeUniform();
cpuB->randomizeUniform();
gpuA->copyFrom(*cpuA);
gpuB->copyFrom(*cpuB);
if (!endCol) {
cpuB->colMax(*cpuA);
gpuB->colMax(*gpuA);
} else {
MatrixPtr subCpuA = cpuA->subColMatrix(0, endCol);
MatrixPtr subGpuA = gpuA->subColMatrix(0, endCol);
MatrixPtr subCpuB = cpuB->subColMatrix(0, endCol);
MatrixPtr subGpuB = gpuB->subColMatrix(0, endCol);
subCpuB->colMax(*subCpuA);
subGpuB->colMax(*subGpuA);
}
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(1, width);
outputCheck->copyFrom(*gpuA);
MatrixCheckErr(*cpuA, *outputCheck);
}
void testMatrixCollectBias(int height, int width) {
MatrixPtr cpuA = std::make_shared<CpuMatrix>(1, width);
MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
MatrixPtr gpuA = std::make_shared<GpuMatrix>(1, width);
MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
cpuA->randomizeUniform();
cpuB->randomizeUniform();
gpuA->copyFrom(*cpuA);
gpuB->copyFrom(*cpuB);
  real scale = 1.0f / (rand() % 10 + 1);  // NOLINT, keep the divisor non-zero
cpuA->collectBias(*cpuB, scale);
gpuA->collectBias(*gpuB, scale);
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(1, width);
outputCheck->copyFrom(*gpuA);
MatrixCheckErr(*cpuA, *outputCheck);
}
void testMatrixSumOfSquares(int height, int width, int endCol = 0) {
MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, 1);
MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuC = std::make_shared<CpuMatrix>(height, width);
MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, 1);
MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
MatrixPtr gpuC = std::make_shared<GpuMatrix>(height, width);
cpuA->randomizeUniform();
cpuB->randomizeUniform();
cpuC->randomizeUniform();
gpuA->copyFrom(*cpuA);
gpuB->copyFrom(*cpuB);
gpuC->copyFrom(*cpuC);
if (!endCol) {
cpuA->sumOfSquares(*cpuB, *cpuC);
gpuA->sumOfSquares(*gpuB, *gpuC);
} else {
MatrixPtr subCpuB = cpuB->subColMatrix(0, endCol);
MatrixPtr subCpuC = cpuC->subColMatrix(0, endCol);
MatrixPtr subGpuB = gpuB->subColMatrix(0, endCol);
MatrixPtr subGpuC = gpuC->subColMatrix(0, endCol);
cpuA->sumOfSquares(*subCpuB, *subCpuC);
gpuA->sumOfSquares(*subGpuB, *subGpuC);
}
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, 1);
outputCheck->copyFrom(*gpuA);
MatrixCheckErr(*cpuA, *outputCheck);
}
void testMatrixBinaryClassificationError(int height, int width) {
MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuC = std::make_shared<CpuMatrix>(height, width);
MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
MatrixPtr gpuB = std::make_shared<GpuMatrix>(height, width);
MatrixPtr gpuC = std::make_shared<GpuMatrix>(height, width);
MatrixPtr cpuA2 = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuB2 = std::make_shared<CpuMatrix>(height, width);
MatrixPtr cpuC2 = std::make_shared<CpuMatrix>(height, width);
cpuA->randomizeUniform();
cpuB->randomizeUniform();
cpuC->randomizeUniform();
gpuA->copyFrom(*cpuA);
gpuB->copyFrom(*cpuB);
gpuC->copyFrom(*cpuC);
cpuA2->copyFrom(*cpuA);
cpuB2->copyFrom(*cpuB);
cpuC2->copyFrom(*cpuC);
real scale = 0.5;
int columnOffset = rand() % width; // NOLINT
cpuA->binaryClassificationError(columnOffset, *cpuB, *cpuC, scale);
gpuA->binaryClassificationError(columnOffset, *gpuB, *gpuC, scale);
cpuA2->binaryClassificationError2(columnOffset, *cpuB2, *cpuC2, scale);
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
outputCheck->copyFrom(*gpuA);
MatrixCheckErr(*cpuA, *outputCheck);
MatrixCheckErr(*cpuA, *cpuA2);
}
TEST(Matrix, aggregate) {
for (auto height : {1, 11, 16, 32, 64, 73, 128, 200, 1024, 2345}) {
for (auto width : {1, 9, 16, 32, 64, 100, 512, 1000, 1024, 2453}) {
VLOG(3) << " height=" << height << " width=" << width;
testMatrixRowSum(height, width);
testMatrixRowMax(height, width);
testMatrixColSum(height, width);
testMatrixColMax(height, width);
testMatrixCollectBias(height, width);
testMatrixTernaryRowDotMul(height, width);
testMatrixAddDotMulVMM(height, width);
testMatrixSumOfSquares(height, width);
testMatrixBinaryClassificationError(height, width);
}
}
}
TEST(Matrix, aggregate2) {
  for (auto height : {16, 32, 128, 512, 1024}) {
    for (auto width :
         {16, 32, 64, 128, 256, 512, 768, 1024, 2048, 3072, 4096}) {
      VLOG(3) << " height=" << height << " width=" << width;
      int endCol = rand() % width;  // NOLINT
      testMatrixRowMax(height, width, endCol);
      testMatrixSumOfSquares(height, width, endCol);
      testMatrixColSum(height, width, endCol);
      testMatrixColMax(height, width, endCol);
      testMatrixAddDotMulVMM(height, width, endCol);
    }
  }
}
TEST(Matrix, unary) {
  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
      VLOG(3) << " height=" << height << " width=" << width;
      testMatrixDeepSwap(height, width);
      testMatrixZeroAtOffset(height, width);
      testMatrixGetSum(height, width);
      testMatrixTranspose(height, width);
    }
    // inverse
    testMatrixInverse(height);
  }
}
void testMatrixAddAtOffset(int height, int width1, int width2) {
  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width1);
  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width2);
  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width1);
  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width2);
  cpuInput->randomizeUniform();
  gpuInput->copyFrom(*cpuInput);
  cpuOutput->randomizeUniform();
  gpuOutput->copyFrom(*cpuOutput);
  int columnOffset = 0;
  int offset = std::abs(width1 - width2);
  if (offset) {
    columnOffset = rand() % offset;  // NOLINT
  }
  cpuOutput->addAtOffset(*cpuInput, columnOffset);
  gpuOutput->addAtOffset(*gpuInput, columnOffset);
  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width2);
  outputCheck->copyFrom(*gpuOutput);
  MatrixCheckEqual(*cpuOutput, *outputCheck);
}
void testMatrixSoftmax(int height, int width) {
  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width);
  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width);
  cpuInput->randomizeUniform();
  gpuInput->copyFrom(*cpuInput);
  cpuOutput->zero();
  gpuOutput->zero();
  cpuInput->softmax(*cpuOutput);
  gpuInput->softmax(*gpuOutput);
  TensorCheckErr(*cpuOutput, *gpuOutput);
}
void testMatrixAssignAtOffset(int height, int width1, int width2) {
  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width1);
  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width2);
  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width1);
  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width2);
  cpuInput->randomizeUniform();
  gpuInput->copyFrom(*cpuInput);
  cpuOutput->randomizeUniform();
  gpuOutput->copyFrom(*cpuOutput);
  int columnOffset = 0;
  int offset = std::abs(width1 - width2);
  if (offset) {
    columnOffset = rand() % offset;  // NOLINT
  }
  cpuOutput->assignAtOffset(*cpuInput, columnOffset);
  gpuOutput->assignAtOffset(*gpuInput, columnOffset);
  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width2);
  outputCheck->copyFrom(*gpuOutput);
  MatrixCheckEqual(*cpuOutput, *outputCheck);
}
void testSequenceSoftmax(int batchSize) {
  // forward
  int inputDim = 1;
  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
  cpuInput->randomizeUniform();
  gpuInput->copyFrom(*cpuInput);
  IVectorPtr cpuSequence;
  generateSequenceStartPositions(batchSize, cpuSequence);
  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
  gpuSequence->copyFrom(*cpuSequence);
  cpuInput->sequenceSoftmax(*cpuInput, *cpuSequence);
  gpuInput->sequenceSoftmax(*gpuInput, *gpuSequence);
  TensorCheckErr(*cpuInput, *gpuInput);
}
TEST(Matrix, AtOffset) {
  for (auto height : {1, 11, 73, 128, 200}) {
    for (auto width1 : {1, 32, 100, 512, 1000}) {
      for (auto width2 : {1, 32, 100, 512, 1000}) {
        VLOG(3) << " height=" << height << " width1=" << width1
                << " width2=" << width2;
        testMatrixAddAtOffset(height, width1, width2);
        testMatrixAssignAtOffset(height, width1, width2);
      }
    }
  }
}
void testMatrixSoftmaxThreshold(int height, int width) {
  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width);
  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width);
  cpuInput->randomizeUniform();
  cpuInput->getData()[0] = 100.0;
  gpuInput->copyFrom(*cpuInput);
  cpuOutput->zero();
  gpuOutput->zero();
  cpuInput->softmax(*cpuOutput);
  gpuInput->softmax(*gpuOutput);
  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
  outputCheck->copyFrom(*gpuOutput);
  // One input element is set to 100 above; even so, no softmax output
  // should underflow to exactly zero.
  int cpuCount = 0;
  int gpuCount = 0;
  auto zeroNum = [](MatrixPtr out, int& count) {
    for (size_t i = 0; i < out->getHeight(); i++) {
      for (size_t j = 0; j < out->getWidth(); j++) {
        if (out->getElement(i, j) == 0) count++;
      }
    }
  };
  zeroNum(cpuOutput, cpuCount);
  zeroNum(outputCheck, gpuCount);
  EXPECT_EQ(cpuCount, 0) << "Cpu softmax output value 0";
  EXPECT_EQ(gpuCount, 0) << "Gpu softmax output value 0";
}
void testMatrixSelectRows(int numSamples, int tableSize, int inputDim) {
  MatrixPtr cpuTable = std::make_shared<CpuMatrix>(tableSize, inputDim);
  MatrixPtr gpuTable = std::make_shared<GpuMatrix>(tableSize, inputDim);
  cpuTable->randomizeUniform();
  gpuTable->copyFrom(*cpuTable);
  IVectorPtr cpuIds;
  IVectorPtr gpuIds;
  cpuIds = VectorT<int>::create(numSamples, false);
  gpuIds = VectorT<int>::create(numSamples, true);
  cpuIds->rand(tableSize);
  gpuIds->copyFrom(*cpuIds);
  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(numSamples, inputDim);
  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(numSamples, inputDim);
  cpuOutput->randomizeUniform();
  gpuOutput->copyFrom(*cpuOutput);
  cpuOutput->selectRows(*cpuTable, *cpuIds);
  gpuOutput->selectRows(*gpuTable, *gpuIds);
  // check
  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(numSamples, inputDim);
  outputCheck->copyFrom(*gpuOutput);
  MatrixCheckEqual(*cpuOutput, *outputCheck);
}
void testMatrixAddToRows(int numSamples, int tableSize, int inputDim) {
......@@ -1342,10 +470,7 @@ void testMatrixAddToRows(int numSamples, int tableSize, int inputDim) {
cpuOutput->addToRows(*cpuTable, *cpuIds);
gpuOutput->addToRows(*gpuTable, *gpuIds);
// check
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(tableSize, inputDim);
outputCheck->copyFrom(*gpuTable);
MatrixCheckErr(*cpuTable, *outputCheck);
TensorCheckErr(*cpuTable, *gpuTable);
}
TEST(Matrix, tableProjection) {
......@@ -1354,7 +479,6 @@ TEST(Matrix, tableProjection) {
for (auto inputDim : {20, 50}) {
VLOG(3) << " numSamples=" << numSamples << " tableSize=" << tableSize
<< " inputDim=" << inputDim;
testMatrixSelectRows(numSamples, tableSize, inputDim);
testMatrixAddToRows(numSamples, tableSize, inputDim);
}
}
......@@ -1388,9 +512,7 @@ void testMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) {
cpuC->mul(cpuA, cpuB, alpha, beta);
gpuC->mul(gpuA, gpuB, alpha, beta);
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(heightC, widthC);
outputCheck->copyFrom(*gpuC);
MatrixCheckErr(*cpuC, *outputCheck);
TensorCheckErr(*cpuC, *gpuC);
}
void testSubMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) {
......@@ -1462,9 +584,7 @@ void testSubMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) {
subCpuC->mul(subCpuA, subCpuB, alpha, beta);
subGpuC->mul(subGpuA, subGpuB, alpha, beta);
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(heightC, widthC);
outputCheck->copyFrom(*gpuC);
MatrixCheckErr(*cpuC, *outputCheck);
TensorCheckErr(*cpuC, *gpuC);
}
TEST(Matrix, mul) {
......@@ -1518,9 +638,7 @@ void testVectorReset(int size) {
cpu->reset(value);
gpu->reset(value);
std::shared_ptr<CpuVectorT<T>> out = std::make_shared<CpuVectorT<T>>(size);
out->copyFrom(*gpu);
VectorCheckEqual(*cpu, *out);
TensorCheckEqual(*cpu, *gpu);
}
template <class T>
......@@ -1546,9 +664,7 @@ void testVecortSelectFrom(int size) {
cpuDst->selectFrom(*cpuSrc, *cpuIds);
gpuDst->selectFrom(*gpuSrc, *gpuIds);
std::shared_ptr<CpuVectorT<T>> out = std::make_shared<CpuVectorT<T>>(size);
out->copyFrom(*gpuDst);
VectorCheckEqual(*cpuDst, *out);
TensorCheckEqual(*cpuDst, *gpuDst);
}
template <class T>
......@@ -1559,9 +675,7 @@ void testVecotrZeroMem(int size) {
cpu->zeroMem();
gpu->zeroMem();
std::shared_ptr<CpuVectorT<T>> out = std::make_shared<CpuVectorT<T>>(size);
out->copyFrom(*gpu);
VectorCheckEqual(*cpu, *out);
TensorCheckEqual(*cpu, *gpu);
}
template <class T>
......@@ -1582,9 +696,7 @@ void testVectorIsEqual(int size) {
cpuA->isEqualTo(*cpuB, value);
gpuA->isEqualTo(*gpuB, value);
std::shared_ptr<CpuVectorT<T>> out = std::make_shared<CpuVectorT<T>>(size);
out->copyFrom(*gpuA);
VectorCheckEqual(*cpuA, *out);
TensorCheckEqual(*cpuA, *gpuA);
}
TEST(Vector, Equal) {
......@@ -1615,9 +727,7 @@ void testMatrixTopK(int samples, int dim, int beamSize) {
cpuSrc->rowMax(*cpuIds, *cpuVal);
gpuSrc->rowMax(*gpuIds, *gpuVal);
MatrixPtr outVal = std::make_shared<CpuMatrix>(samples, beamSize);
outVal->copyFrom(*gpuVal);
MatrixCheckEqual(*cpuVal, *outVal);
TensorCheckEqual(*cpuVal, *gpuVal);
}
TEST(Matrix, topK) {
......@@ -1653,9 +763,7 @@ void testSMatrixTopK(int samples, int dim, int beamSize, real ratio) {
cpuSrc->rowMax(*cpuIds, *cpuVal);
gpuSrc->rowMax(*gpuIds, *gpuVal);
MatrixPtr outCheckMaxVal = std::make_shared<CpuMatrix>(samples, beamSize);
outCheckMaxVal->copyFrom(*gpuVal);
MatrixCheckEqual(*cpuVal, *outCheckMaxVal);
TensorCheckEqual(*cpuVal, *gpuVal);
IVectorPtr outCheckIds = std::make_shared<CpuIVector>(samples * beamSize);
outCheckIds->copyFrom(*gpuIds);
......@@ -1685,42 +793,6 @@ TEST(SMatrix, topK) {
}
}
void testMatrixCopyByRowIndex(int outHeight, int inHeight, int width) {
MatrixPtr cpuInput = std::make_shared<CpuMatrix>(inHeight, width);
MatrixPtr gpuInput = std::make_shared<GpuMatrix>(inHeight, width);
cpuInput->randomizeUniform();
gpuInput->copyFrom(*cpuInput);
MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(outHeight, width);
MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(outHeight, width);
cpuOutput->zero();
gpuOutput->zero();
IVectorPtr cpuRowIndex = IVector::create(outHeight, false);
IVectorPtr gpuRowIndex = IVector::create(outHeight, true);
cpuRowIndex->rand(inHeight);
gpuRowIndex->copyFrom(*cpuRowIndex);
cpuOutput->copyByRowIndex(*cpuInput, *cpuRowIndex);
gpuOutput->copyByRowIndex(*gpuInput, *gpuRowIndex);
// check
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(outHeight, width);
outputCheck->copyFrom(*gpuOutput);
MatrixCheckEqual(*cpuOutput, *outputCheck);
}
TEST(Matrix, copyByRowIndex) {
for (auto outHeight : {31, 500, 1000}) {
for (auto inHeight : {17, 257, 500, 1200}) {
for (auto width : {512, 1024}) {
VLOG(3) << outHeight << " " << inHeight << " " << width;
testMatrixCopyByRowIndex(outHeight, inHeight, width);
}
}
}
}
void testMatrixSequenceAvgForward(int batchSize, int inputDim, int mode) {
MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
......@@ -1741,10 +813,7 @@ void testMatrixSequenceAvgForward(int batchSize, int inputDim, int mode) {
cpuOutput->sequenceAvgForward(*cpuInput, *cpuSequence, mode);
gpuOutput->sequenceAvgForward(*gpuInput, *gpuSequence, mode);
// check
MatrixPtr outputCheck = std::make_shared<CpuMatrix>(newBatchSize, inputDim);
outputCheck->copyFrom(*gpuOutput);
MatrixCheckErr(*cpuOutput, *outputCheck);
TensorCheckErr(*cpuOutput, *gpuOutput);
}
TEST(Matrix, sequenceAvgForward) {
......@@ -1759,45 +828,6 @@ TEST(Matrix, sequenceAvgForward) {
}
}
void testCosSim(int heightX, int heightY, int width, real scale) {
MatrixPtr prevOutX = CpuMatrix::create(heightX, width, false, false);
MatrixPtr prevOutY = CpuMatrix::create(heightY, width, false, false);
MatrixPtr output = CpuMatrix::create(heightX, 1, false, false);
prevOutX->randomizeUniform();
prevOutY->randomizeUniform();
prevOutX->add(-0.5);
prevOutY->add(-0.5);
output->randomizeUniform();
MatrixPtr prevOutXGpu = GpuMatrix::create(heightX, width, false, true);
MatrixPtr prevOutYGpu = GpuMatrix::create(heightY, width, false, true);
MatrixPtr outputGpu = GpuMatrix::create(heightX, 1, false, true);
prevOutXGpu->copyFrom(*prevOutX);
prevOutYGpu->copyFrom(*prevOutY);
outputGpu->copyFrom(*output);
output->cosSim(*prevOutX, *prevOutY, scale);
outputGpu->cosSim(*prevOutXGpu, *prevOutYGpu, scale);
MatrixPtr outputCheck = CpuMatrix::create(heightX, 1, false, false);
outputCheck->copyFrom(*outputGpu);
MatrixCheckErr(*output, *outputCheck);
}
TEST(Matrix, cosSim) {
for (auto heightX : {10, 100, 1000}) {
for (auto heightY : {1, heightX}) {
for (auto width : {10, 100, 1000}) {
for (auto scale : {1.0, 2.0}) {
testCosSim(heightX, heightY, width, scale);
}
}
}
}
}
void testCosSimDerivate(int heightX, int heightY, int width, real scale) {
MatrixPtr prevOutX = CpuMatrix::create(heightX, width, false, false);
MatrixPtr prevOutY = CpuMatrix::create(heightY, width, false, false);
......@@ -1837,12 +867,8 @@ void testCosSimDerivate(int heightX, int heightY, int width, real scale) {
*prevGradYGpu,
scale);
MatrixPtr prevGradXCheck = CpuMatrix::create(heightX, width, false, false);
MatrixPtr prevGradYCheck = CpuMatrix::create(heightY, width, false, false);
prevGradXCheck->copyFrom(*prevGradXGpu);
prevGradYCheck->copyFrom(*prevGradYGpu);
MatrixCheckErr(*prevGradX, *prevGradXCheck);
MatrixCheckErr(*prevGradY, *prevGradYCheck);
TensorCheckErr(*prevGradX, *prevGradXGpu);
TensorCheckErr(*prevGradY, *prevGradYGpu);
}
TEST(Matrix, cosSimDerivate) {
......@@ -1857,80 +883,6 @@ TEST(Matrix, cosSimDerivate) {
}
}
void testParamReluForward(int height, int width, int w_height, int w_width) {
MatrixPtr output = CpuMatrix::create(height, width, false, false);
MatrixPtr input = CpuMatrix::create(height, width, false, false);
MatrixPtr w = CpuMatrix::create(w_height, w_width, false, false);
output->randomizeUniform();
input->randomizeUniform();
w->randomizeUniform();
input->add(-0.5);
MatrixPtr outputGpu = GpuMatrix::create(height, width, false, true);
MatrixPtr inputGpu = GpuMatrix::create(height, width, false, true);
MatrixPtr wGpu = GpuMatrix::create(w_height, w_width, false, true);
inputGpu->copyFrom(*input);
wGpu->copyFrom(*w);
output->paramReluForward(*input, *w);
outputGpu->paramReluForward(*inputGpu, *wGpu);
MatrixPtr outputCheck = CpuMatrix::create(height, width, false, false);
outputCheck->copyFrom(*outputGpu);
MatrixCheckEqual(*output, *outputCheck);
}
TEST(Matrix, paramReluForward) {
for (auto height : {10, 100}) {
for (auto width : {10, 100}) {
for (auto w_height : {1, 2}) {
for (auto w_width : {1, 2}) {
testParamReluForward(height, width, w_height, w_width);
}
}
}
}
}
void testParamReluBackwardW(int height, int width, int w_height, int w_width) {
MatrixPtr oGrad = CpuMatrix::create(height, width, false, false);
MatrixPtr input = CpuMatrix::create(height, width, false, false);
MatrixPtr w = CpuMatrix::create(w_height, w_width, false, false);
oGrad->randomizeUniform();
input->randomizeUniform();
w->randomizeUniform();
input->add(-0.5);
MatrixPtr oGradGpu = GpuMatrix::create(height, width, false, true);
MatrixPtr inputGpu = GpuMatrix::create(height, width, false, true);
MatrixPtr wGpu = GpuMatrix::create(w_height, w_width, false, true);
oGradGpu->copyFrom(*oGrad);
inputGpu->copyFrom(*input);
wGpu->copyFrom(*w);
w->paramReluBackwardW(*oGrad, *input);
wGpu->paramReluBackwardW(*oGradGpu, *inputGpu);
MatrixPtr wCheck = CpuMatrix::create(w_height, w_width, false, false);
wCheck->copyFrom(*wGpu);
MatrixCheckErr(*w, *wCheck);
}
TEST(Matrix, paramReluBackwardW) {
for (auto height : {10, 100}) {
for (auto width : {10, 100}) {
for (auto w_height : {1, 2}) {
for (auto w_width : {1, 2}) {
testParamReluBackwardW(height, width, w_height, w_width);
}
}
}
}
}
void testParamReluBackwardDiff(int height,
int width,
int w_height,
......@@ -1959,9 +911,7 @@ void testParamReluBackwardDiff(int height,
diff->paramReluBackwardDiff(*oGrad, *input, *w);
diffGpu->paramReluBackwardDiff(*oGradGpu, *inputGpu, *wGpu);
MatrixPtr diffCheck = CpuMatrix::create(height, width, false, false);
diffCheck->copyFrom(*diffGpu);
MatrixCheckErr(*diff, *diffCheck);
TensorCheckErr(*diff, *diffGpu);
}
TEST(Matrix, paramReluBackwardDiff) {
......@@ -1992,9 +942,7 @@ void testClassificationError(int numSamples, int dim) {
cpuError->classificationError(cpuOutput, cpuLabel);
gpuError->classificationError(gpuOutput, gpuLabel);
MatrixPtr check = std::make_shared<CpuMatrix>(numSamples, 1);
check->copyFrom(*gpuError);
MatrixCheckEqual(*cpuError, *check);
TensorCheckEqual(*cpuError, *gpuError);
}
TEST(Matrix, classificationError) {
......@@ -2159,9 +1107,8 @@ void testAvgPoolFwdBwd(int numSamples,
outW,
padH,
padW);
MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
targetCheck->copyFrom(*targetGpu);
MatrixCheckErr(*target, *targetCheck);
TensorCheckErr(*target, *targetGpu);
MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
......@@ -2200,10 +1147,8 @@ void testAvgPoolFwdBwd(int numSamples,
1.0,
padH,
padW);
MatrixPtr targetBwdCheck =
CpuMatrix::create(numSamples, inWidth, false, false);
targetBwdCheck->copyFrom(*inputGpuGrad);
MatrixCheckErr(*inputGrad, *targetBwdCheck);
TensorCheckErr(*inputGrad, *inputGpuGrad);
}
TEST(Matrix, PoolFwdBwd) {
......@@ -2268,11 +1213,9 @@ void testMaxOutFwdBwd(
MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
IVectorPtr id = CpuIVector::create(numSamples * outWidth, false);
IVectorPtr idGpu = GpuIVector::create(numSamples * outWidth, true);
IVectorPtr idCheck = CpuIVector::create(numSamples * outWidth, false);
input->randomizeUniform();
inputGpu->copyFrom(*input);
......@@ -2280,11 +1223,8 @@ void testMaxOutFwdBwd(
target->maxoutForward(*input, *id, outChannels, groups);
targetGpu->maxoutForward(*inputGpu, *idGpu, outChannels, groups);
// check
targetCheck->copyFrom(*targetGpu);
MatrixCheckErr(*target, *targetCheck);
idCheck->copyFrom(*idGpu);
VectorCheckEqual(*id, *idCheck);
TensorCheckErr(*target, *targetGpu);
TensorCheckEqual(*id, *idGpu);
// backward
MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
......@@ -2293,8 +1233,6 @@ void testMaxOutFwdBwd(
MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
MatrixPtr targetGpuGrad =
GpuMatrix::create(numSamples, outWidth, false, true);
MatrixPtr targetCheckGrad =
CpuMatrix::create(numSamples, inWidth, false, false);
inputGrad->randomizeUniform();
targetGrad->randomizeUniform();
......@@ -2304,9 +1242,7 @@ void testMaxOutFwdBwd(
inputGrad->maxoutBackward(*targetGrad, *id, outChannels, groups);
inputGpuGrad->maxoutBackward(*targetGpuGrad, *idGpu, outChannels, groups);
// check
targetCheckGrad->copyFrom(*inputGpuGrad);
MatrixCheckErr(*inputGrad, *targetCheckGrad);
TensorCheckErr(*inputGrad, *inputGpuGrad);
}
TEST(Matrix, MaxOutFwdBwd) {
......@@ -2326,113 +1262,6 @@ TEST(Matrix, MaxOutFwdBwd) {
}
}
void testAddSharedBias(int numSamples, int dim, int channel) {
MatrixPtr cpuData = std::make_shared<CpuMatrix>(numSamples, dim);
MatrixPtr gpuData = std::make_shared<GpuMatrix>(numSamples, dim);
MatrixPtr cpuBias = std::make_shared<CpuMatrix>(1, channel);
MatrixPtr gpuBias = std::make_shared<GpuMatrix>(1, channel);
cpuData->randomizeUniform();
gpuData->copyFrom(*cpuData);
cpuBias->randomizeUniform();
gpuBias->copyFrom(*cpuBias);
cpuData->addSharedBias(*cpuBias, 1.0);
gpuData->addSharedBias(*gpuBias, 1.0);
MatrixPtr check = std::make_shared<CpuMatrix>(numSamples, dim);
check->copyFrom(*gpuData);
MatrixCheckErr(*cpuData, *check);
}
void testCollectSharedBias(int numSamples, int dim, int channel) {
MatrixPtr cpuData = std::make_shared<CpuMatrix>(numSamples, dim);
MatrixPtr gpuData = std::make_shared<GpuMatrix>(numSamples, dim);
MatrixPtr cpuBias = std::make_shared<CpuMatrix>(1, channel);
MatrixPtr gpuBias = std::make_shared<GpuMatrix>(1, channel);
cpuData->randomizeUniform();
gpuData->copyFrom(*cpuData);
cpuBias->randomizeUniform();
gpuBias->copyFrom(*cpuBias);
cpuBias->collectSharedBias(*cpuData, 1.0);
gpuBias->collectSharedBias(*gpuData, 1.0);
MatrixPtr check = std::make_shared<CpuMatrix>(1, channel);
check->copyFrom(*gpuBias);
MatrixCheckErr(*cpuBias, *check);
}
TEST(Matrix, sharedBias) {
for (auto numSamples : {1, 100, 520}) {
for (auto dim : {100 * 16, 100 * 32}) {
for (auto channel : {8, 16}) {
VLOG(3) << " numSamples=" << numSamples << " dim=" << dim
<< " channel=" << channel;
testAddSharedBias(numSamples, dim, channel);
testCollectSharedBias(numSamples, dim, channel);
}
}
}
}
void testMultiBinaryLabelCrossEntropy(int numSamples, int dim) {
MatrixPtr output = std::make_shared<CpuMatrix>(numSamples, dim);
MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(numSamples, dim);
MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(numSamples, dim);
MatrixPtr cpuEntropy = std::make_shared<CpuMatrix>(numSamples, 1);
MatrixPtr gpuEntropy = std::make_shared<GpuMatrix>(numSamples, 1);
MatrixPtr cpuGrad = std::make_shared<CpuMatrix>(numSamples, dim);
MatrixPtr gpuGrad = std::make_shared<GpuMatrix>(numSamples, dim);
MatrixPtr cpuLabel = std::make_shared<CpuSparseMatrix>(
numSamples, dim, numSamples, NO_VALUE, SPARSE_CSR, false);
MatrixPtr gpuLabel = std::make_shared<GpuSparseMatrix>(
numSamples, dim, numSamples, NO_VALUE, SPARSE_CSR, false);
for (int i = 0; i < numSamples; i++) {
const unsigned int id = rand() % dim; // NOLINT
cpuLabel->setRow(i, 1, &id, nullptr);
gpuLabel->setRow(i, 1, &id, nullptr);
}
output->randomizeUniform();
cpuOutput->zeroMem();
output->softmax(*cpuOutput);
gpuOutput->copyFrom(*cpuOutput);
cpuEntropy->zeroMem();
gpuEntropy->zeroMem();
cpuEntropy->multiBinaryLabelCrossEntropy(*cpuOutput, *cpuLabel);
gpuEntropy->multiBinaryLabelCrossEntropy(*gpuOutput, *gpuLabel);
MatrixPtr check1 = std::make_shared<CpuMatrix>(numSamples, 1);
check1->copyFrom(*gpuEntropy);
MatrixCheckErr(*cpuEntropy, *check1);
cpuGrad->zeroMem();
gpuGrad->zeroMem();
cpuGrad->multiBinaryLabelCrossEntropyBp(*cpuOutput, *cpuLabel);
gpuGrad->multiBinaryLabelCrossEntropyBp(*gpuOutput, *gpuLabel);
MatrixPtr check2 = std::make_shared<CpuMatrix>(numSamples, dim);
check2->copyFrom(*gpuGrad);
MatrixCheckErr(*cpuGrad, *check2);
}
TEST(Matrix, multiBinaryCrossEntropy) {
for (auto numSamples : {100, 1000, 10000}) {
for (auto dim : {100, 1000, 10000}) {
VLOG(3) << " numSamples=" << numSamples << " dim=" << dim;
testMultiBinaryLabelCrossEntropy(numSamples, dim);
}
}
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
......
......@@ -225,6 +225,8 @@ void Argument::resizeAndCopyFrom(const Argument& src,
}
resizeAndCopy(udp, src.udp, useGpu, stream);
resizeAndCopy(strs, src.strs, useGpu, stream);
frameWidth = src.frameWidth;
frameHeight = src.frameHeight;
}
int32_t Argument::resizeAndCopyFrom(const Argument& src,
......
......@@ -559,10 +559,10 @@ def __monkey_patch_trainer__():
def monkeypatches():
patches = [
__monkeypatch_init_paddle__, __monkeypatch_gradient_machine__,
__monkey_patch_protobuf_objects__, __monkey_patch_parameter__,
__monkey_patch_trainer__
]
patches = [__monkeypatch_init_paddle__,
__monkeypatch_gradient_machine__,
__monkey_patch_protobuf_objects__,
__monkey_patch_parameter__,
__monkey_patch_trainer__]
for patch in patches:
patch()
FROM ubuntu:14.04
MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
RUN apt-get update && \
apt-get install -y cmake libprotobuf-dev protobuf-compiler git \
libgoogle-glog-dev libgflags-dev libatlas-dev libatlas3-base g++ m4 python-pip \
python-protobuf python-numpy python-dev swig openssh-server \
wget unzip python-matplotlib tar xz-utils bzip2 gzip coreutils \
sed grep graphviz libjpeg-dev zlib1g-dev doxygen && \
apt-get clean -y
RUN pip install BeautifulSoup docopt PyYAML pillow \
'sphinx>=1.4.0' sphinx_rtd_theme breathe recommonmark
ARG WITH_AVX
ENV WITH_AVX=${WITH_AVX:-ON}
ENV WITH_GPU=OFF
RUN mkdir /paddle
COPY . /paddle/
RUN /paddle/paddle/scripts/docker/build.sh
RUN echo 'export LD_LIBRARY_PATH=/usr/lib64:${LD_LIBRARY_PATH}' >> /etc/profile
RUN pip install /usr/local/opt/paddle/share/wheels/*.whl
RUN paddle version # print version after build
# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
RUN mkdir /var/run/sshd
RUN echo 'root:root' | chpasswd
RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
EXPOSE 22
CMD ["/usr/sbin/sshd", "-D"]
FROM ubuntu:14.04
MAINTAINER PaddlePaddle Dev Team <paddle-dev@baidu.com>
COPY build.sh /root/
ENV GIT_CHECKOUT=v0.9.0a0
ENV WITH_GPU=OFF
ENV IS_DEVEL=OFF
ENV WITH_DEMO=OFF
ENV PIP_INSTALL_ARGS ""
ENV PIP_GENERAL_ARGS ""
ENV USE_UBUNTU_MIRROR OFF
ENV WITH_AVX=ON
RUN cd /root/ && bash build.sh
FROM ubuntu:14.04
MAINTAINER PaddlePaddle Dev Team <paddle-dev@baidu.com>
COPY build.sh /root/
ENV GIT_CHECKOUT=v0.9.0a0
ENV WITH_GPU=OFF
ENV IS_DEVEL=ON
ENV WITH_DEMO=ON
ENV PIP_INSTALL_ARGS ""
ENV PIP_GENERAL_ARGS ""
ENV USE_UBUNTU_MIRROR OFF
ENV WITH_AVX=ON
RUN cd /root/ && bash build.sh
FROM ubuntu:14.04
MAINTAINER PaddlePaddle Dev Team <paddle-dev@baidu.com>
COPY build.sh /root/
ENV GIT_CHECKOUT=v0.9.0a0
ENV WITH_GPU=OFF
ENV IS_DEVEL=ON
ENV WITH_DEMO=OFF
ENV PIP_INSTALL_ARGS ""
ENV PIP_GENERAL_ARGS ""
ENV USE_UBUNTU_MIRROR OFF
ENV WITH_AVX=ON
RUN cd /root/ && bash build.sh
FROM ubuntu:14.04
MAINTAINER PaddlePaddle Dev Team <paddle-dev@baidu.com>
COPY build.sh /root/
ENV GIT_CHECKOUT=v0.9.0a0
ENV WITH_GPU=OFF
ENV IS_DEVEL=OFF
ENV WITH_DEMO=OFF
ENV PIP_INSTALL_ARGS ""
ENV PIP_GENERAL_ARGS ""
ENV USE_UBUNTU_MIRROR OFF
ENV WITH_AVX=OFF
RUN cd /root/ && bash build.sh
FROM ubuntu:14.04
MAINTAINER PaddlePaddle Dev Team <paddle-dev@baidu.com>
COPY build.sh /root/
ENV GIT_CHECKOUT=v0.9.0a0
ENV WITH_GPU=OFF
ENV IS_DEVEL=ON
ENV WITH_DEMO=ON
ENV PIP_INSTALL_ARGS ""
ENV PIP_GENERAL_ARGS ""
ENV USE_UBUNTU_MIRROR OFF
ENV WITH_AVX=OFF
RUN cd /root/ && bash build.sh
FROM ubuntu:14.04
MAINTAINER PaddlePaddle Dev Team <paddle-dev@baidu.com>
COPY build.sh /root/
ENV GIT_CHECKOUT=v0.9.0a0
ENV WITH_GPU=OFF
ENV IS_DEVEL=ON
ENV WITH_DEMO=OFF
ENV PIP_INSTALL_ARGS ""
ENV PIP_GENERAL_ARGS ""
ENV USE_UBUNTU_MIRROR OFF
ENV WITH_AVX=OFF
RUN cd /root/ && bash build.sh
FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04
MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
RUN apt-get update && \
    apt-get install -y cmake libprotobuf-dev protobuf-compiler git \
    libgoogle-glog-dev libgflags-dev libatlas-dev libatlas3-base g++ m4 python-pip \
    python-protobuf python-numpy python-dev swig openssh-server \
    wget unzip python-matplotlib tar xz-utils bzip2 gzip coreutils \
    sed grep graphviz libjpeg-dev zlib1g-dev doxygen && \
    apt-get clean -y
RUN pip install BeautifulSoup docopt PyYAML pillow \
    'sphinx>=1.4.0' sphinx_rtd_theme breathe recommonmark
ARG WITH_AVX
ENV WITH_AVX=${WITH_AVX:-ON}
ENV WITH_GPU=ON
RUN mkdir /paddle
COPY . /paddle/
RUN /paddle/paddle/scripts/docker/build.sh
RUN echo 'export LD_LIBRARY_PATH=/usr/lib64:${LD_LIBRARY_PATH}' >> /etc/profile
RUN pip install /usr/local/opt/paddle/share/wheels/*.whl
RUN paddle version # print version after build
# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
RUN mkdir /var/run/sshd
RUN echo 'root:root' | chpasswd
RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
EXPOSE 22
CMD ["/usr/sbin/sshd", "-D"]
FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04
MAINTAINER PaddlePaddle Dev Team <paddle-dev@baidu.com>
COPY build.sh /root/
ENV GIT_CHECKOUT=v0.9.0a0
ENV WITH_GPU=ON
ENV IS_DEVEL=OFF
ENV WITH_DEMO=OFF
ENV PIP_INSTALL_ARGS ""
ENV PIP_GENERAL_ARGS ""
ENV USE_UBUNTU_MIRROR OFF
ENV WITH_AVX=ON
RUN cd /root/ && bash build.sh
FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04
MAINTAINER PaddlePaddle Dev Team <paddle-dev@baidu.com>
COPY build.sh /root/
ENV GIT_CHECKOUT=v0.9.0a0
ENV WITH_GPU=ON
ENV IS_DEVEL=ON
ENV WITH_DEMO=ON
ENV PIP_INSTALL_ARGS ""
ENV PIP_GENERAL_ARGS ""
ENV USE_UBUNTU_MIRROR OFF
ENV WITH_AVX=ON
RUN cd /root/ && bash build.sh
FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04
MAINTAINER PaddlePaddle Dev Team <paddle-dev@baidu.com>
COPY build.sh /root/
ENV GIT_CHECKOUT=v0.9.0a0
ENV WITH_GPU=ON
ENV IS_DEVEL=ON
ENV WITH_DEMO=OFF
ENV PIP_INSTALL_ARGS ""
ENV PIP_GENERAL_ARGS ""
ENV USE_UBUNTU_MIRROR OFF
ENV WITH_AVX=ON
RUN cd /root/ && bash build.sh
FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04
MAINTAINER PaddlePaddle Dev Team <paddle-dev@baidu.com>
COPY build.sh /root/
ENV GIT_CHECKOUT=v0.9.0a0
ENV WITH_GPU=ON
ENV IS_DEVEL=OFF
ENV WITH_DEMO=OFF
ENV PIP_INSTALL_ARGS ""
ENV PIP_GENERAL_ARGS ""
ENV USE_UBUNTU_MIRROR OFF
ENV WITH_AVX=OFF
RUN cd /root/ && bash build.sh
FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04
MAINTAINER PaddlePaddle Dev Team <paddle-dev@baidu.com>
COPY build.sh /root/
ENV GIT_CHECKOUT=v0.9.0a0
ENV WITH_GPU=ON
ENV IS_DEVEL=ON
ENV WITH_DEMO=ON
ENV PIP_INSTALL_ARGS ""
ENV PIP_GENERAL_ARGS ""
ENV USE_UBUNTU_MIRROR OFF
ENV WITH_AVX=OFF
RUN cd /root/ && bash build.sh
FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04
MAINTAINER PaddlePaddle Dev Team <paddle-dev@baidu.com>
COPY build.sh /root/
ENV GIT_CHECKOUT=v0.9.0a0
ENV WITH_GPU=ON
ENV IS_DEVEL=ON
ENV WITH_DEMO=OFF
ENV PIP_INSTALL_ARGS ""
ENV PIP_GENERAL_ARGS ""
ENV USE_UBUNTU_MIRROR OFF
ENV WITH_AVX=OFF
RUN cd /root/ && bash build.sh
FROM PADDLE_BASE_IMAGE
MAINTAINER PaddlePaddle Dev Team <paddle-dev@baidu.com>
COPY build.sh /root/
ENV GIT_CHECKOUT=v0.9.0
ENV WITH_GPU=PADDLE_WITH_GPU
ENV IS_DEVEL=PADDLE_IS_DEVEL
ENV WITH_DEMO=PADDLE_WITH_DEMO
ENV PIP_INSTALL_ARGS ""
ENV PIP_GENERAL_ARGS ""
ENV USE_UBUNTU_MIRROR OFF
ENV WITH_AVX=PADDLE_WITH_AVX
RUN cd /root/ && bash build.sh
......@@ -7,43 +7,21 @@ function abort(){
trap 'abort' 0
set -e
if [ ${USE_UBUNTU_MIRROR} == "ON" ]; then
sed -i 's#http://archive\.ubuntu\.com/ubuntu/#mirror://mirrors\.ubuntu\.com/mirrors\.txt#g'\
/etc/apt/sources.list
fi
apt-get update
apt-get install -y cmake libprotobuf-dev protobuf-compiler git \
libgoogle-glog-dev libgflags-dev libatlas-dev libatlas3-base g++ m4 python-pip\
python-protobuf python-numpy python-dev swig
if [ ${WITH_GPU} == 'ON' ]; then
ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so /usr/lib/libcudnn.so
fi
cd ~
git clone https://github.com/baidu/Paddle.git paddle
cd paddle
git checkout ${GIT_CHECKOUT}
mkdir build
cd build
cmake .. -DWITH_DOC=OFF -DWITH_GPU=${WITH_GPU} -DWITH_SWIG_PY=ON\
-DCUDNN_ROOT=/usr/ -DWITH_STYLE_CHECK=OFF -DWITH_AVX=${WITH_AVX}
mkdir -p /paddle/build # -p means no error if exists
cd /paddle/build
cmake .. \
-DWITH_DOC=ON \
-DWITH_GPU=${WITH_GPU} \
-DWITH_AVX=${WITH_AVX} \
-DWITH_SWIG_PY=ON \
-DCUDNN_ROOT=/usr/ \
-DWITH_STYLE_CHECK=OFF
make -j `nproc`
# During `make install` there are several warnings, so use `set +e` so they do not abort the build.
make install
echo 'export LD_LIBRARY_PATH=/usr/lib64:${LD_LIBRARY_PATH}' >> /etc/profile
pip ${PIP_GENERAL_ARGS} install ${PIP_INSTALL_ARGS} /usr/local/opt/paddle/share/wheels/*.whl
paddle version # print version after build
if [ ${WITH_DEMO} == "ON" ]; then
apt-get install -y wget unzip perl python-matplotlib tar xz-utils bzip2 gzip coreutils\
sed grep graphviz libjpeg-dev zlib1g-dev
pip ${PIP_GENERAL_ARGS} install ${PIP_INSTALL_ARGS} BeautifulSoup docopt \
PyYAML pillow
fi
if [ ${IS_DEVEL} == "OFF" ]; then # clean build packages.
cd ~
rm -rf paddle
fi
apt-get clean -y
trap : 0
#!/bin/bash
set -e
cd `dirname $0`
m4 -DPADDLE_WITH_GPU=OFF -DPADDLE_IS_DEVEL=OFF -DPADDLE_WITH_DEMO=OFF \
-DPADDLE_BASE_IMAGE=ubuntu:14.04 -DPADDLE_WITH_AVX=ON\
Dockerfile.m4 > Dockerfile.cpu
m4 -DPADDLE_WITH_GPU=OFF -DPADDLE_IS_DEVEL=OFF -DPADDLE_WITH_DEMO=OFF \
-DPADDLE_BASE_IMAGE=ubuntu:14.04 -DPADDLE_WITH_AVX=OFF\
Dockerfile.m4 > Dockerfile.cpu-noavx
m4 -DPADDLE_WITH_GPU=OFF -DPADDLE_IS_DEVEL=ON -DPADDLE_WITH_DEMO=OFF \
-DPADDLE_BASE_IMAGE=ubuntu:14.04 -DPADDLE_WITH_AVX=OFF\
Dockerfile.m4 > Dockerfile.cpu-noavx-devel
m4 -DPADDLE_WITH_GPU=OFF -DPADDLE_IS_DEVEL=ON -DPADDLE_WITH_DEMO=OFF \
-DPADDLE_BASE_IMAGE=ubuntu:14.04 -DPADDLE_WITH_AVX=ON\
Dockerfile.m4 > Dockerfile.cpu-devel
m4 -DPADDLE_WITH_GPU=OFF -DPADDLE_IS_DEVEL=ON -DPADDLE_WITH_DEMO=ON \
-DPADDLE_BASE_IMAGE=ubuntu:14.04 -DPADDLE_WITH_AVX=ON\
Dockerfile.m4 > Dockerfile.cpu-demo
m4 -DPADDLE_WITH_GPU=OFF -DPADDLE_IS_DEVEL=ON -DPADDLE_WITH_DEMO=ON \
-DPADDLE_BASE_IMAGE=ubuntu:14.04 -DPADDLE_WITH_AVX=OFF\
Dockerfile.m4 > Dockerfile.cpu-noavx-demo
m4 -DPADDLE_WITH_GPU=ON -DPADDLE_IS_DEVEL=OFF -DPADDLE_WITH_DEMO=OFF \
-DPADDLE_BASE_IMAGE=nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 \
-DPADDLE_WITH_AVX=ON \
Dockerfile.m4 > Dockerfile.gpu
m4 -DPADDLE_WITH_GPU=ON -DPADDLE_IS_DEVEL=OFF -DPADDLE_WITH_DEMO=OFF \
-DPADDLE_BASE_IMAGE=nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 \
-DPADDLE_WITH_AVX=OFF \
Dockerfile.m4 > Dockerfile.gpu-noavx
m4 -DPADDLE_WITH_GPU=ON -DPADDLE_IS_DEVEL=ON -DPADDLE_WITH_DEMO=OFF \
-DPADDLE_BASE_IMAGE=nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 \
-DPADDLE_WITH_AVX=ON \
Dockerfile.m4 > Dockerfile.gpu-devel
m4 -DPADDLE_WITH_GPU=ON -DPADDLE_IS_DEVEL=ON -DPADDLE_WITH_DEMO=OFF \
-DPADDLE_BASE_IMAGE=nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 \
-DPADDLE_WITH_AVX=OFF \
Dockerfile.m4 > Dockerfile.gpu-noavx-devel
m4 -DPADDLE_WITH_GPU=ON -DPADDLE_IS_DEVEL=ON -DPADDLE_WITH_DEMO=ON \
-DPADDLE_BASE_IMAGE=nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 \
-DPADDLE_WITH_AVX=ON \
Dockerfile.m4 > Dockerfile.gpu-demo
m4 -DPADDLE_WITH_GPU=ON -DPADDLE_IS_DEVEL=ON -DPADDLE_WITH_DEMO=ON \
-DPADDLE_BASE_IMAGE=nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 \
-DPADDLE_WITH_AVX=OFF \
Dockerfile.m4 > Dockerfile.gpu-noavx-demo
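# Illustrative follow-up (not part of this script): build one of the
# generated images, e.g.
#   docker build -t paddledev/paddle:cpu-latest -f Dockerfile.cpu .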
FROM paddledev/paddle:cpu-devel-latest
COPY build.sh /
RUN pip install sphinx &&\
pip install sphinx_rtd_theme &&\
apt install -y doxygen graphviz &&\
pip install breathe recommonmark numpy protobuf==2.6.1
CMD /build.sh
......@@ -87,10 +87,8 @@ void Tester::testOneDataBatch(const DataBatch& dataBatch,
void Tester::testOnePeriod() {
DataBatch dataBatch;
int64_t batchSize = config_->getOptConfig().batch_size();
  int batches = std::numeric_limits<int>::max();
std::vector<Argument> outArgs;
......@@ -102,11 +100,7 @@ void Tester::testOnePeriod() {
if (intconfig_->prevBatchState) {
gradientMachine_->resetState();
}
    break;
}
testOneDataBatch(dataBatch, &outArgs);
}
......
......@@ -39,11 +39,6 @@ struct TesterConfig {
*/
int testPeriod;
/**
* indicate whether testing data in one period
*/
bool testAllDataInOnePeriod;
/**
* indicate whether to save previous batch state
*/
......
......@@ -39,20 +39,16 @@ limitations under the License. */
#include "TrainerConfigHelper.h"
P_DEFINE_string(config, "", "Trainer config file");
P_DEFINE_int32(test_period, 0,
               "if equal to 0, test on all test data at the end of "
               "each pass; if non-zero, test on all test "
               "data every test_period batches");
P_DEFINE_bool(test_all_data_in_one_period, false,
              "This option is deprecated, since testing now always "
              "runs on the full test set");
P_DEFINE_bool(local, true, "Train in local mode or not");
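// Illustrative usage (flag semantics only; the binary name is assumed):
//   paddle_trainer --config=trainer_config.py --test_period=100
// runs evaluation on the full test set every 100 training batches, while
// --test_period=0 evaluates once at the end of each pass.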
P_DEFINE_int32(average_test_period,
0,
......@@ -205,7 +201,7 @@ void Trainer::init(const std::shared_ptr<TrainerConfigHelper>& config,
(!IGradientMachineMode::dataMustInCpu(mode_, FLAGS_trainer_count));
dataProvider_ = dataProvider;
if (!dataProvider_ && config_->hasDataConfig()) {
if (!dataProvider_ && config_->hasDataConfig() && !testing_) {
dataProvider_.reset(DataProvider::create(*config_, *config_, gpuData));
}
if (!testDataProvider_) {
......@@ -633,8 +629,19 @@ void Trainer::test() { tester_->test(); }
std::unique_ptr<TesterConfig> Trainer::createTesterConfig() {
TesterConfig* conf = new TesterConfig;
if (FLAGS_test_period) {
LOG(WARNING)
<< "The meaning of --test_period is changed: "
<< "if equal 0, do test on all test data at the end of "
<< "each pass. While if equal non-zero, do test on all test "
<< "data every test_period batches ";
}
if (FLAGS_test_all_data_in_one_period) {
LOG(WARNING)
<< "--test_all_data_in_one_period was deprecated, since "
<< "we will always do test on all test set ";
}
conf->testPeriod = FLAGS_test_period;
conf->testAllDataInOnePeriod = FLAGS_test_all_data_in_one_period;
conf->prevBatchState = FLAGS_prev_batch_state;
conf->logPeriod = FLAGS_log_period;
conf->loadsaveParametersInPserver = FLAGS_loadsave_parameters_in_pserver;
......
......@@ -59,7 +59,6 @@ pool = img_pool_layer(input=fc2,
padding_y=2,
stride=2,
stride_y=3,
img_width=3,
pool_type=CudnnAvgPooling())
concat = concat_layer(input=[fc3, fc4])
......
execute_process(COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --version
OUTPUT_VARIABLE PROTOBUF_VERSION)
string(REPLACE "libprotoc " "" PROTOBUF_VERSION ${PROTOBUF_VERSION})
set(PROTOBUF_3 OFF)
if (${PROTOBUF_VERSION} VERSION_GREATER "3.0.0" OR ${PROTOBUF_VERSION} VERSION_EQUAL "3.0.0")
set(PROTOBUF_3 ON)
endif()
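# For example, `protoc --version` prints "libprotoc 3.1.0"; the REPLACE above
# leaves "3.1.0", which the VERSION_GREATER/VERSION_EQUAL checks compare
# against "3.0.0", so PROTOBUF_3 becomes ON (an illustrative walk-through).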
set(proto_filenames
DataConfig.proto
DataFormat.proto
......
......@@ -77,6 +77,12 @@ message ConvConfig {
required uint32 filter_size_y = 10;
required uint32 padding_y = 11;
required uint32 stride_y = 12;
// if not set, use output_x
optional uint32 output_y = 13;
// if not set, use img_size
optional uint32 img_size_y = 14;
}
message PoolConfig {
......@@ -122,11 +128,9 @@ message PoolConfig {
}
message SppConfig {
required string pool_type = 1;
required uint32 pyramid_height = 2;
required uint32 channels = 3;
required uint32 img_size = 4;
optional uint32 img_size_y = 5;
required ImageConfig image_conf = 1;
required string pool_type = 2;
required uint32 pyramid_height = 3;
}
message NormConfig {
......@@ -156,6 +160,12 @@ message NormConfig {
// fixed window: shared a fixed window for each value
// sliding window: have a different window for each value
optional bool blocked = 8;
// if not set, use output_x
optional uint32 output_y = 9;
// if not set, use img_size
optional uint32 img_size_y = 10;
}
message BlockExpandConfig {
......@@ -180,12 +190,8 @@ message BlockExpandConfig {
}
message MaxOutConfig {
required uint32 channels = 1;
required ImageConfig image_conf = 1;
required uint32 groups = 2;
// The size of input feature map.
required uint32 img_size_x = 3;
required uint32 img_size_y = 4;
}
message ProjectionConfig {
......@@ -226,12 +232,10 @@ message OperatorConfig {
message BilinearInterpConfig {
// The size of input feature map.
optional uint32 img_size_x = 1;
optional uint32 img_size_y = 2;
required ImageConfig image_conf = 1;
// The size of output feature map.
required uint32 out_size_x = 3;
required uint32 out_size_y = 4;
required uint32 num_channels = 5;
required uint32 out_size_x = 2;
required uint32 out_size_y = 3;
}
message ImageConfig {
......@@ -241,6 +245,7 @@ message ImageConfig {
// The size of input feature map.
required uint32 img_size = 8;
required uint32 img_size_y = 9;
}
message LayerInputConfig {
......@@ -413,7 +418,10 @@ sinclude(`ModelConfigLayer.proto.m4')
// string type is used for flexibility: different types can be converted
// to string and reinterpreted in the user's own layer implementation.
optional string user_arg = 49;
// To indicate rectangular image data.
optional uint64 height = 50;
optional uint64 width = 51;
}
message EvaluatorConfig {
......
......@@ -138,7 +138,14 @@ def init_config_environment(
g_root_submodel=None,
g_submodel_map={},
g_submodel_stack=[],
g_add_submodel_suffix=False, ):
g_add_submodel_suffix=False,
        # Whether the current layer needs to pass the image height and width.
        # The default is True, but it becomes False inside a
        # recurrent_layer_group, because the image is converted into a
        # sequence there: the image height becomes the sequence length and
        # the image width becomes the feature length of each timestep.
        g_pass_height_width=True, ):
for k, v in locals().iteritems():
globals()[k] = copy.deepcopy(v)
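# Illustration (not in the original source): inside a recurrent_layer_group
# an 8 x 8 single-channel image behaves like a sequence with seq_len = 8
# (the image height) and a per-timestep feature vector of 8 elements (the
# image width), so height/width no longer describe an image and must not be
# propagated.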
......@@ -686,9 +693,9 @@ class ConvProjection(Projection):
parse_conv(conv_conf, input_layer_name, self.proj_conf.conv_conf,
num_filters)
# TODO: support rectangle input
self.proj_conf.output_size = (self.proj_conf.conv_conf.output_x
**2) * num_filters
self.proj_conf.output_size = self.proj_conf.conv_conf.output_x * \
self.proj_conf.conv_conf.output_y * \
num_filters
def calc_output_size(self, input_layer_config):
return self.proj_conf.output_size
......@@ -698,7 +705,8 @@ class ConvProjection(Projection):
ci = self.proj_conf.conv_conf.channels
fh = self.proj_conf.conv_conf.filter_size
fw = self.proj_conf.conv_conf.filter_size_y
return co * ci * fh * fw
gr = self.proj_conf.conv_conf.groups
return co * ci * fh * fw / gr
def calc_bias_size(self):
return self.proj_conf.num_filters
......@@ -763,8 +771,9 @@ class ConvOperator(Operator):
parse_conv(conv_conf,
MakeLayerNameInSubmodel(input_layer_names[0]),
self.operator_conf.conv_conf, num_filters)
self.operator_conf.output_size = (self.operator_conf.conv_conf.output_x
**2) * num_filters
self.operator_conf.output_size = self.operator_conf.conv_conf.output_x * \
self.operator_conf.conv_conf.output_y * \
num_filters
config_assert(len(input_layer_names) == 2, "Conv is binary operator")
......@@ -799,14 +808,12 @@ class Conv(Cfg):
config_assert(output_x <= 0)
# please refer to the comments in proto/ModelConfig.proto
@config_class
class BilinearInterp(Cfg):
def __init__(self, out_size_x=None, out_size_y=None, num_channels=None):
def __init__(self, out_size_x=None, out_size_y=None, channels=None):
self.add_keys(locals())
# please refer to the comments in proto/ModelConfig.proto
@config_class
class Pool(Cfg):
def __init__(
......@@ -824,14 +831,12 @@ class Pool(Cfg):
self.add_keys(locals())
# please refer to the comments in proto/ModelConfig.proto
@config_class
class SpatialPyramidPool(Cfg):
def __init__(self, pool_type, pyramid_height, channels, img_width=None):
def __init__(self, pool_type, pyramid_height, channels):
self.add_keys(locals())
# please refer to the comments in proto/ModelConfig.proto
@config_class
class Norm(Cfg):
def __init__(self,
......@@ -846,7 +851,6 @@ class Norm(Cfg):
self.add_keys(locals())
# please refer to the comments in proto/ModelConfig.proto
@config_class
class Image(Cfg):
def __init__(self, channels, img_size=None):
......@@ -1053,18 +1057,8 @@ def TestData(data_config, async_load_data=None):
g_config.test_data_config.async_load_data = async_load_data
def parse_bilinear(bilinear, input_layer_name, bilinear_conf):
bilinear_conf.out_size_x = bilinear.out_size_x
bilinear_conf.out_size_y = bilinear.out_size_y
bilinear_conf.num_channels = bilinear.num_channels
#caffe_mode: compute the output size using floor instead of ceil,
# which is consistent with Caffe's and cuDNN's convention.
def cnn_output_size(img_size, filter_size, padding, stride, caffe_mode):
output = (2 * padding + img_size - filter_size) / float(stride)
if caffe_mode:
......@@ -1073,20 +1067,34 @@ def cnn_output_size(img_size, filter_size, padding, stride, caffe_mode):
        return 1 + int(math.floor(output))
    else:
return 1 + int(math.ceil(output))
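# A minimal check of the two conventions (illustrative, not in the original
# file): with img_size=10, filter_size=3, padding=0, stride=2 the raw output
# is (0 + 10 - 3) / 2 = 3.5, so caffe_mode (floor) gives 1 + 3 = 4 while the
# non-caffe convention (ceil) gives 1 + 4 = 5:
#   assert cnn_output_size(10, 3, 0, 2, True) == 4
#   assert cnn_output_size(10, 3, 0, 2, False) == 5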
#calculate image_size based on output_size for de-convolution (ConvTransLayer).
#It is the reverse function of cnn_output_size.
def cnn_image_size(output_size, filter_size, padding, stride, caffe_mode):
    img_size = (output_size - 1) * stride + filter_size - 2 * padding
    if not caffe_mode:
        img_size = img_size + 1
    return img_size
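# Round-trip sketch (assuming cnn_output_size as defined above):
# output_size=4, filter_size=3, padding=0, stride=2, caffe_mode=True gives
# img_size = 3 * 2 + 3 - 0 = 9, and cnn_output_size(9, 3, 0, 2, True)
# recovers 4, since (0 + 9 - 3) / 2 = 3.0 and 1 + floor(3.0) == 4.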
def get_img_size(input_layer_name, channels):
input = g_layer_map[input_layer_name]
img_pixels = input.size / channels
img_size = input.width if input.width > 0 else int(img_pixels**0.5)
img_size_y = input.height if input.height > 0 else int(img_pixels /
img_size)
config_assert(
img_size * img_size_y == img_pixels,
"Input layer %s: Incorrect input image size %d * %d for input image pixels %d"
% (input_layer_name, img_size, img_size_y, img_pixels))
return img_size, img_size_y
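# A hypothetical standalone sketch of the same inference (the helper name
# below is illustrative, not part of this file):
#
# def _infer_img_size(size, channels, height=0, width=0):
#     pixels = size / channels
#     w = width if width > 0 else int(pixels**0.5)
#     h = height if height > 0 else int(pixels / w)
#     assert w * h == pixels
#     return w, h
#
# _infer_img_size(3 * 32 * 32, 3)                       # -> (32, 32), square
# _infer_img_size(3 * 16 * 64, 3, height=16, width=64)  # -> (64, 16)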
def parse_bilinear(bilinear, input_layer_name, bilinear_conf):
parse_image(bilinear, input_layer_name, bilinear_conf.image_conf)
bilinear_conf.out_size_x = bilinear.out_size_x
bilinear_conf.out_size_y = bilinear.out_size_y
def parse_pool(pool, input_layer_name, pool_conf):
pool_conf.pool_type = pool.pool_type
config_assert(pool.pool_type in [
......@@ -1102,14 +1110,8 @@ def parse_pool(pool, input_layer_name, pool_conf):
pool_conf.size_y = default(pool.size_y, pool_conf.size_x)
pool_conf.stride_y = default(pool.stride_y, pool_conf.stride)
img_pixels = g_layer_map[input_layer_name].size / pool.channels
# the img_width may be removed,
# and it can be calculated automatically later.
pool_conf.img_size = default(pool.img_width, int(img_pixels**0.5))
pool_conf.img_size_y = img_pixels / pool_conf.img_size
config_assert(pool_conf.img_size * pool_conf.img_size_y == img_pixels,
"Incorrect input image size %d for input image pixels %d" %
(pool_conf.img_size, img_pixels))
pool_conf.img_size, pool_conf.img_size_y = \
get_img_size(input_layer_name, pool.channels)
config_assert(not pool.start, "start is deprecated in pooling.")
......@@ -1125,29 +1127,18 @@ def parse_pool(pool, input_layer_name, pool_conf):
def parse_spp(spp, input_layer_name, spp_conf):
parse_image(spp, input_layer_name, spp_conf.image_conf)
spp_conf.pool_type = spp.pool_type
config_assert(spp.pool_type in ['max-projection', 'avg-projection'],
"pool-type %s is not in "
"['max-projection', 'avg-projection']" % spp.pool_type)
spp_conf.pyramid_height = spp.pyramid_height
spp_conf.channels = spp.channels
img_pixels = g_layer_map[input_layer_name].size / spp_conf.channels
spp_conf.img_size = default(spp.img_width, int(img_pixels**0.5))
spp_conf.img_size_y = img_pixels / spp_conf.img_size
config_assert(spp_conf.img_size * spp_conf.img_size_y == img_pixels,
"Incorrect input image size %d for input image pixels %d" %
(spp_conf.img_size, img_pixels))
def parse_image(image, input_layer_name, image_conf):
image_conf.channels = image.channels
image_pixels = g_layer_map[input_layer_name].size / image_conf.channels
image_conf.img_size = int(image_pixels**0.5)
config_assert((image_conf.img_size**2) == image_pixels,
"Incorrect input image size %d for input image pixels %d" %
(image_conf.img_size, image_pixels))
image_conf.img_size, image_conf.img_size_y = \
get_img_size(input_layer_name, image_conf.channels)
def parse_norm(norm, input_layer_name, norm_conf):
......@@ -1161,24 +1152,18 @@ def parse_norm(norm, input_layer_name, norm_conf):
norm_conf.pow = norm.pow
norm_conf.blocked = norm.blocked
img_pixels = g_layer_map[input_layer_name].size / norm.channels
norm_conf.img_size = int(img_pixels**0.5)
config_assert((norm_conf.img_size**2) == img_pixels,
"Incorrect input image size %d for input image pixels %d" %
(norm_conf.img_size, img_pixels))
norm_conf.img_size, norm_conf.img_size_y = \
get_img_size(input_layer_name, norm.channels)
norm_conf.output_x = norm_conf.img_size
norm_conf.output_y = norm_conf.img_size_y
if norm.norm_type in ['cmrnorm-projection']:
norm_conf.scale /= norm.size
else:
norm_conf.scale /= norm.size**2
#caffe_mode: compute the output size using floor instead of ceil,
# which is consistent with Caffe's and cuDNN's convention.
def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False):
conv_conf.filter_size = conv.filter_size
conv_conf.filter_size_y = conv.filter_size_y
......@@ -1192,33 +1177,24 @@ def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False):
if not trans:
conv_conf.filter_channels = conv.channels / conv.groups
img_pixels = g_layer_map[input_layer_name].size / conv.channels
print('channels=%d size=%d' % (conv.channels,
g_layer_map[input_layer_name].size))
conv_conf.img_size = int(img_pixels**0.5)
config_assert((conv_conf.img_size**2) == img_pixels, (
"Input layer %s: Incorrect input image size %d for input " +
"image pixels %d") %
(input_layer_name, conv_conf.img_size, img_pixels))
conv_conf.img_size, conv_conf.img_size_y = \
get_img_size(input_layer_name, conv.channels)
conv_conf.output_x = cnn_output_size(
conv_conf.img_size, conv_conf.filter_size, conv_conf.padding,
conv_conf.stride, conv_conf.caffe_mode)
conv_conf.output_y = cnn_output_size(
conv_conf.img_size_y, conv_conf.filter_size_y, conv_conf.padding_y,
conv_conf.stride_y, conv_conf.caffe_mode)
else:
conv_conf.filter_channels = num_filters / conv.groups
outputSize = g_layer_map[input_layer_name].size / conv.channels
print('channels=%d size=%d' % (conv.channels,
g_layer_map[input_layer_name].size))
conv_conf.output_x = int(outputSize**0.5)
config_assert((conv_conf.output_x**2) == outputSize, (
"Input layer %s: Incorrect input image size %d for input " +
"image pixels %d") %
(input_layer_name, conv_conf.output_x, outputSize))
conv_conf.output_x, conv_conf.output_y = \
get_img_size(input_layer_name, conv.channels)
conv_conf.img_size = cnn_image_size(
conv_conf.output_x, conv_conf.filter_size, conv_conf.padding,
conv_conf.stride, conv_conf.caffe_mode)
conv_conf.img_size_y = cnn_image_size(
conv_conf.output_y, conv_conf.filter_size_y, conv_conf.padding_y,
conv_conf.stride_y, conv_conf.caffe_mode)
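# Worked example for the non-trans path (illustrative): img_size=32,
# filter_size=3, padding=1, stride=1 with caffe_mode gives
# output_x = 1 + floor((2 + 32 - 3) / 1) = 32, i.e. the "same" spatial size.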
def parse_block_expand(block_expand, input_layer_name, block_expand_conf):
......@@ -1247,10 +1223,8 @@ def parse_block_expand(block_expand, input_layer_name, block_expand_conf):
def parse_maxout(maxout, input_layer_name, maxout_conf):
maxout_conf.channels = maxout.channels
parse_image(maxout, input_layer_name, maxout_conf.image_conf)
maxout_conf.groups = maxout.groups
maxout_conf.img_size_x = maxout.img_size_x
maxout_conf.img_size_y = maxout.img_size_y
# Define an evaluator
......@@ -1377,6 +1351,12 @@ class LayerBase(object):
g_current_submodel.layer_names.append(self.config.name)
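# Propagate image height/width from the first input layer, so later layers
# need not restate the geometry.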
if self.config.type != 'data' and g_pass_height_width:
height = self.get_input_layer(0).height
width = self.get_input_layer(0).width
if height and width:
self.set_layer_height_width(height, width)
def get_input_layer(self, input_index):
return g_layer_map[self.config.inputs[input_index].input_layer_name]
......@@ -1494,6 +1474,23 @@ class LayerBase(object):
'Different inputs result in' +
'different layer size at layer %s' % self.config.name)
def set_layer_height_width(self, height, width):
self.config.height = height
self.config.width = width
def set_cnn_layer(self,
input_layer_name,
height,
width,
channels,
is_print=True):
size = height * width * channels
self.set_layer_size(size)
self.set_layer_height_width(height, width)
if is_print:
print("output for %s: c = %d, h = %d, w = %d, size = %d" %
(input_layer_name, channels, height, width, size))
@config_layer('multi_class_cross_entropy_with_selfnorm')
class MultiClassCrossEntropySelfNormCostLayer(LayerBase):
......@@ -1583,9 +1580,11 @@ class PrintLayer(LayerBase):
@config_layer('data')
class DataLayer(LayerBase):
def __init__(self, name, size, device=None):
def __init__(self, name, size, height=None, width=None, device=None):
super(DataLayer, self).__init__(
name, 'data', size, inputs=[], device=device)
if height and width:
self.set_layer_height_width(height, width)
'''
......@@ -1684,14 +1683,13 @@ class ConvLayerBase(LayerBase):
for input_index in xrange(len(self.inputs)):
input_layer = self.get_input_layer(input_index)
parse_conv(self.inputs[input_index].conv, input_layer.name,
self.config.inputs[input_index].conv_conf, num_filters)
conv_conf = self.config.inputs[input_index].conv_conf
parse_conv(self.inputs[input_index].conv, input_layer.name,
conv_conf, num_filters)
psize = self.calc_parameter_size(conv_conf)
print("output size for %s is %d " % (name, conv_conf.output_x))
self.create_input_parameter(input_index, psize)
self.set_layer_size(
(conv_conf.output_x**2) * self.config.num_filters)
self.set_cnn_layer(name, conv_conf.output_y, conv_conf.output_x,
self.config.num_filters)
psize = self.config.size
if shared_biases:
......@@ -1778,10 +1776,11 @@ class NormLayer(LayerBase):
name, 'norm', 0, inputs=inputs, device=device)
for input_index in xrange(len(self.inputs)):
input_layer = self.get_input_layer(input_index)
parse_norm(self.inputs[input_index].norm, input_layer.name,
self.config.inputs[input_index].norm_conf)
norm_conf = self.config.inputs[input_index].norm_conf
self.set_layer_size((norm_conf.output_x**2) * norm_conf.channels)
parse_norm(self.inputs[input_index].norm, input_layer.name,
norm_conf)
self.set_cnn_layer(name, norm_conf.output_y, norm_conf.output_x,
norm_conf.channels, False)
@config_layer('pool')
......@@ -1791,13 +1790,11 @@ class PoolLayer(LayerBase):
name, 'pool', 0, inputs=inputs, device=device)
for input_index in xrange(len(self.inputs)):
input_layer = self.get_input_layer(input_index)
parse_pool(self.inputs[input_index].pool, input_layer.name,
self.config.inputs[input_index].pool_conf)
pool_conf = self.config.inputs[input_index].pool_conf
print("output size for %s is %d*%d " % (name, pool_conf.output_y,
pool_conf.output_x))
self.set_layer_size(
(pool_conf.output_x * pool_conf.output_y) * pool_conf.channels)
parse_pool(self.inputs[input_index].pool, input_layer.name,
pool_conf)
self.set_cnn_layer(name, pool_conf.output_y, pool_conf.output_x,
pool_conf.channels)
@config_layer('spp')
......@@ -1807,12 +1804,10 @@ class SpatialPyramidPoolLayer(LayerBase):
name, 'spp', 0, inputs=inputs, device=device)
for input_index in xrange(len(self.inputs)):
input_layer = self.get_input_layer(input_index)
parse_spp(self.inputs[input_index].spp, input_layer.name,
self.config.inputs[input_index].spp_conf)
spp_conf = self.config.inputs[input_index].spp_conf
output_size = (pow(4, spp_conf.pyramid_height) - 1) / (4 - 1)
print("output size for %s is %d " % (name, output_size))
self.set_layer_size(output_size * spp_conf.channels)
parse_spp(self.inputs[input_index].spp, input_layer.name, spp_conf)
output_x = (pow(4, spp_conf.pyramid_height) - 1) / (4 - 1)
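# Geometric series: pyramid level i contributes 2^i * 2^i = 4^i bins, so a
# pyramid of height h yields (4^h - 1) / 3 bins; h = 2 gives 1 + 4 = 5.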
self.set_cnn_layer(name, 1, output_x, spp_conf.image_conf.channels)
@config_layer('batch_norm')
......@@ -1874,10 +1869,10 @@ class BatchNormLayer(LayerBase):
self.config.moving_average_fraction = moving_average_fraction
input_layer = self.get_input_layer(0)
parse_image(self.inputs[0].image, input_layer.name,
self.config.inputs[0].image_conf)
image_conf = self.config.inputs[0].image_conf
self.set_layer_size((image_conf.img_size**2) * image_conf.channels)
parse_image(self.inputs[0].image, input_layer.name, image_conf)
self.set_cnn_layer(name, image_conf.img_size_y, image_conf.img_size,
image_conf.channels)
psize = self.calc_parameter_size(image_conf)
dims = [1, psize]
......@@ -1935,11 +1930,11 @@ class MaxOutLayer(LayerBase):
super(MaxOutLayer, self).__init__(
name, 'maxout', 0, inputs=inputs, **xargs)
input_layer = self.get_input_layer(0)
parse_maxout(self.inputs[0].maxout, input_layer.name,
self.config.inputs[0].maxout_conf)
maxout_conf = self.config.inputs[0].maxout_conf
self.set_layer_size(g_layer_map[input_layer.name].size /
maxout_conf.groups)
parse_maxout(self.inputs[0].maxout, input_layer.name, maxout_conf)
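# Maxout takes the max across `groups` slices of the channels, so the channel
# count shrinks by the group factor while the spatial size is unchanged.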
out_channels = maxout_conf.image_conf.channels / maxout_conf.groups
self.set_cnn_layer(name, g_layer_map[input_layer.name].height,
g_layer_map[input_layer.name].width, out_channels)
# key: cost type
......@@ -2519,11 +2514,10 @@ class BilinearInterpLayer(LayerBase):
super(BilinearInterpLayer, self).__init__(
name, 'bilinear_interp', 0, inputs=inputs, **xargs)
input_layer = self.get_input_layer(0)
parse_bilinear(self.inputs[0].bilinear_interp, input_layer.name,
self.config.inputs[0].bilinear_interp_conf)
conf = self.inputs[0].bilinear_interp
self.set_layer_size(conf.out_size_x * conf.out_size_y *
conf.num_channels)
conf = self.config.inputs[0].bilinear_interp_conf
parse_bilinear(self.inputs[0].bilinear_interp, input_layer.name, conf)
self.set_cnn_layer(name, conf.out_size_y, conf.out_size_x,
conf.image_conf.channels)
@config_layer('sum_to_one_norm')
......@@ -2996,6 +2990,8 @@ class CTCLayer(LayerBase):
@config_layer('recurrent_layer_group')
class RecurrentLayerGroup(LayerBase):
def __init__(self, name, device=None):
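# Disable height/width propagation for layers created inside the recurrent
# layer group.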
global g_pass_height_width
g_pass_height_width = False
super(RecurrentLayerGroup, self).__init__(
name, 'recurrent_layer_group', 0, inputs=[], device=device)
......@@ -3381,7 +3377,20 @@ def parse_config(config_file, config_arg_str):
g_root_submodel.is_recurrent_layer_group = False
g_current_submodel = g_root_submodel
execfile(config_file, make_config_environment(config_file, config_args))
# For Paddle on Spark, non-file configs need to be supported, so parse_config
# also accepts a callable:
#
# from paddle.trainer.config_parser import parse_config
# def configs():
#     # your PaddlePaddle config code, the same as in a config file.
#
# config = parse_config(configs, "is_predict=1")
# # You then get the config proto object.
if hasattr(config_file, '__call__'):
config_file.func_globals.update(make_config_environment("", config_args))
config_file()
else:
execfile(config_file, make_config_environment(config_file, config_args))
for k, v in settings.iteritems():
if v is None:
continue
......
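A minimal sketch of the callable form described above, assuming a trivial
predict-time network (the layer choices here are illustrative only):

from paddle.trainer.config_parser import parse_config
from paddle.trainer_config_helpers import *

def network():
    settings(batch_size=100, learning_rate=1e-5)
    data = data_layer(name='data', size=2304, height=48, width=48)
    outputs(fc_layer(input=data, size=10))

config = parse_config(network, "is_predict=1")  # the config proto object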
......@@ -129,6 +129,9 @@ class LayerType(object):
HSIGMOID = 'hsigmoid'
CONV_LAYER = "conv"
CONVTRANS_LAYER = "convt"
EXCONV_LAYER = "exconv"
EXCONVTRANS_LAYER = "exconvt"
CUDNNCONV_LAYER = "cudnn_conv"
POOL_LAYER = "pool"
BATCH_NORM_LAYER = 'batch_norm'
NORM_LAYER = 'norm'
......@@ -763,7 +766,7 @@ def mixed_layer(size=0,
@layer_support()
def data_layer(name, size, layer_attr=None):
def data_layer(name, size, height=None, width=None, layer_attr=None):
"""
Define a DataLayer for the neural network.
......@@ -778,6 +781,10 @@ def data_layer(name, size, layer_attr=None):
:type name: basestring
:param size: Size of this data layer.
:type size: int
:param height: Height of this data layer, used for image inputs.
:type height: int|None
:param width: Width of this data layer, used for image inputs.
:type width: int|None
:param layer_attr: Extra Layer Attribute.
:type layer_attr: ExtraLayerAttribute.
:return: LayerOutput object.
......@@ -787,6 +794,8 @@ def data_layer(name, size, layer_attr=None):
type=LayerType.DATA,
name=name,
size=size,
height=height,
width=width,
**ExtraLayerAttribute.to_kwargs(layer_attr))
return LayerOutput(name, LayerType.DATA, size=size)
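For example, declaring a 48x48 single-channel image input (as the updated test
configs later in this commit do):

data = data_layer(name='data', size=2304, height=48, width=48)  # 48 * 48 = 2304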
......@@ -1480,7 +1489,7 @@ def bilinear_interp_layer(input,
bilinear_interp=BilinearInterp(
out_size_x=out_size_x,
out_size_y=out_size_y,
num_channels=num_channels)),
channels=num_channels)),
type=LayerType.BILINEAR_INTERP_LAYER,
**ExtraLayerAttribute.to_kwargs(layer_attr))
return LayerOutput(
......@@ -1762,7 +1771,8 @@ def img_conv_layer(input,
filter_size_y=None,
stride_y=None,
padding_y=None,
trans=False):
trans=False,
layer_type=None):
"""
Convolution layer for images. Paddle currently only supports square input,
so the input image's width equals its height.
......@@ -1829,6 +1839,10 @@ def img_conv_layer(input,
:type layer_attr: ExtraLayerAttribute
:param trans: true if it is a convTransLayer, false if it is a convLayer
:type trans: bool
:param layer_type: specify the layer type. Default is None. If trans=True,
                   layer_type has to be "exconvt"; otherwise it has to be
                   either "exconv" or "cudnn_conv".
:type layer_type: basestring|None
:return: LayerOutput object.
:rtype: LayerOutput
"""
......@@ -1865,7 +1879,14 @@ def img_conv_layer(input,
param_attr.attr["initial_strategy"] = 0
param_attr.attr["initial_smart"] = False
lt = LayerType.CONVTRANS_LAYER if trans else LayerType.CONV_LAYER
if layer_type:
if trans:
assert layer_type in ["exconvt"]
else:
assert layer_type in ["exconv", "cudnn_conv"]
lt = layer_type
else:
lt = LayerType.CONVTRANS_LAYER if trans else LayerType.CONV_LAYER
l = Layer(
name=name,
......@@ -1908,8 +1929,7 @@ def img_pool_layer(input,
layer_attr=None,
pool_size_y=None,
stride_y=None,
padding_y=None,
img_width=None):
padding_y=None):
"""
Image pooling Layer.
......@@ -1940,9 +1960,6 @@ def img_pool_layer(input,
:type stride_y: int|None
:param layer_attr: Extra Layer attribute.
:type layer_attr: ExtraLayerAttribute
:param img_width: the width of input feature map. If it is None, the input feature
map should be square.
:type img_width: int|None
:return: LayerOutput object.
:rtype: LayerOutput
"""
......@@ -1978,8 +1995,7 @@ def img_pool_layer(input,
padding=padding,
size_y=pool_size_y,
stride_y=stride_y,
padding_y=padding_y,
img_width=img_width))
padding_y=padding_y))
],
**ExtraLayerAttribute.to_kwargs(layer_attr))
return LayerOutput(
......@@ -1997,7 +2013,6 @@ def spp_layer(input,
num_channels=None,
pool_type=None,
pyramid_height=None,
img_width=None,
layer_attr=None):
"""
Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition.
......@@ -2014,9 +2029,6 @@ def spp_layer(input,
:type scale: BasePoolingType
:param pyramid_height: pyramid height.
:type pyramid_height: int
:param img_width: the width of input feature map. If it is None, the input feature
map should be square.
:type img_width: int|None
:param layer_attr: Extra Layer Attribute.
:type layer_attr: ExtraLayerAttribute
:return: LayerOutput object.
......@@ -2043,8 +2055,7 @@ def spp_layer(input,
spp=SpatialPyramidPool(
pool_type=type_name,
channels=num_channels,
pyramid_height=pyramid_height,
img_width=img_width)),
pyramid_height=pyramid_height)),
**ExtraLayerAttribute.to_kwargs(layer_attr))
return LayerOutput(
name,
......
......@@ -4,7 +4,17 @@ add_test(NAME layers_test
python ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/layers_test.py
WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
add_test(NAME test_layerHelpers
COMMAND
${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
)
if (PROTOBUF_3)
add_paddle_exe(protobuf_equal
ProtobufEqualMain.cpp)
add_test(NAME test_layerHelpers
COMMAND
${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
${CMAKE_CURRENT_BINARY_DIR}/protobuf_equal
)
else()
add_test(NAME test_layerHelpers
COMMAND
${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
)
endif()
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
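// Compares two protobuf text-format config files for approximate equality.
// Called from run_tests.sh as: protobuf_equal file1 file2 [--whole]
// With two file arguments the files are parsed as paddle::ModelConfig; with a
// third argument they are parsed as paddle::TrainerConfig.
// Exit codes: 0 equal, 1 bad argument count, 2/3 parse failure, 4 not equal.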
#include <google/protobuf/text_format.h>
#include <google/protobuf/util/message_differencer.h>
#include <fstream>
#include <iostream>
#include "TrainerConfig.pb.h"
bool loadPb(google::protobuf::Message* conf, const std::string& filename) {
std::ifstream fin;
fin.open(filename.c_str());
if (fin.is_open()) {
std::string str((std::istreambuf_iterator<char>(fin)),
std::istreambuf_iterator<char>());
bool ok = google::protobuf::TextFormat::ParseFromString(str, conf);
fin.close();
return ok;
} else {
return false;
}
}
int main(int argc, char** argv) {
std::unique_ptr<google::protobuf::Message> config1;
std::unique_ptr<google::protobuf::Message> config2;
if (argc == 3) {
config1.reset(new paddle::ModelConfig());
config2.reset(new paddle::ModelConfig());
} else if (argc == 4) {
config1.reset(new paddle::TrainerConfig());
config2.reset(new paddle::TrainerConfig());
}
if (!config1 || !config2) {
return 1;
} else if (!loadPb(config1.get(), argv[1])) {
return 2;
} else if (!loadPb(config2.get(), argv[2])) {
return 3;
} else {
if (google::protobuf::util::MessageDifferencer::ApproximatelyEquals(
*config1, *config2)) {
return 0;
} else {
return 4;
}
}
}
#!/bin/bash
export configs=(test_fc layer_activations projections test_print_layer
test_sequence_pooling test_lstmemory_layer test_grumemory_layer
last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers
test_rnn_group shared_fc shared_lstm test_cost_layers_with_weight
test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops)
export whole_configs=(test_split_datasource)
......@@ -5,24 +5,16 @@ cd `dirname $0`
export PYTHONPATH=$PWD/../../../../
protostr=$PWD/protostr
configs=(test_fc layer_activations projections test_print_layer
test_sequence_pooling test_lstmemory_layer test_grumemory_layer
last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers
test_rnn_group shared_fc shared_lstm test_cost_layers_with_weight
test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops)
whole_configs=(test_split_datasource)
. file_list.sh
for conf in ${configs[*]}
do
echo "Generating " $conf
python -m paddle.utils.dump_config $conf.py > $protostr/$conf.protostr.unitest
python -m paddle.utils.dump_config $conf.py > $protostr/$conf.protostr.unittest
done
for conf in ${whole_configs[*]}
do
echo "Generating " $conf
python -m paddle.utils.dump_config $conf.py "" --whole > $protostr/$conf.protostr.unitest
python -m paddle.utils.dump_config $conf.py "" --whole > $protostr/$conf.protostr.unittest
done
......@@ -26,11 +26,15 @@ layers {
filter_size_y: 32
padding_y: 1
stride_y: 1
output_y: 227
img_size_y: 256
}
}
bias_parameter_name: "___conv_0__.wbias"
num_filters: 64
shared_biases: true
height: 227
width: 227
}
layers {
name: "__batch_norm_0__"
......@@ -43,6 +47,7 @@ layers {
image_conf {
channels: 64
img_size: 227
img_size_y: 227
}
}
inputs {
......@@ -55,6 +60,8 @@ layers {
}
bias_parameter_name: "___batch_norm_0__.wbias"
moving_average_fraction: 0.9
height: 227
width: 227
}
layers {
name: "__crmnorm_0__"
......@@ -72,8 +79,12 @@ layers {
output_x: 227
img_size: 227
blocked: false
output_y: 227
img_size_y: 227
}
}
height: 227
width: 227
}
layers {
name: "__pool_0__"
......@@ -97,6 +108,8 @@ layers {
padding_y: 0
}
}
height: 196
width: 196
}
parameters {
name: "___conv_0__.w0"
......
......@@ -26,6 +26,8 @@ layers {
filter_size_y: 32
padding_y: 1
stride_y: 1
output_y: 227
img_size_y: 256
}
}
bias_parameter_name: "___conv_0__.wbias"
......@@ -43,6 +45,7 @@ layers {
image_conf {
channels: 64
img_size: 256
img_size_y: 256
}
}
inputs {
......@@ -55,6 +58,8 @@ layers {
}
bias_parameter_name: "___batch_norm_0__.wbias"
moving_average_fraction: 0.9
height: 256
width: 256
}
layers {
name: "__crmnorm_0__"
......@@ -72,8 +77,12 @@ layers {
output_x: 256
img_size: 256
blocked: false
output_y: 256
img_size_y: 256
}
}
height: 256
width: 256
}
layers {
name: "__pool_0__"
......@@ -97,6 +106,8 @@ layers {
padding_y: 0
}
}
height: 225
width: 225
}
parameters {
name: "___conv_0__.w0"
......
......@@ -177,6 +177,8 @@ layers {
filter_size_y: 3
padding_y: 0
stride_y: 1
output_y: 30
img_size_y: 32
}
num_filters: 64
}
......
......@@ -26,11 +26,15 @@ layers {
filter_size_y: 3
padding_y: 1
stride_y: 1
output_y: 48
img_size_y: 48
}
}
bias_parameter_name: "___conv_0__.wbias"
num_filters: 16
shared_biases: true
height: 48
width: 48
}
layers {
name: "__bilinear_interp_layer_0__"
......@@ -40,11 +44,17 @@ layers {
inputs {
input_layer_name: "__conv_0__"
bilinear_interp_conf {
image_conf {
channels: 16
img_size: 48
img_size_y: 48
}
out_size_x: 64
out_size_y: 64
num_channels: 16
}
}
height: 64
width: 64
}
layers {
name: "__pool_0__"
......@@ -55,19 +65,21 @@ layers {
input_layer_name: "__bilinear_interp_layer_0__"
pool_conf {
pool_type: "max-projection"
channels: 4
channels: 16
size_x: 2
stride: 2
output_x: 64
img_size: 128
output_x: 32
img_size: 64
padding: 0
size_y: 2
stride_y: 2
output_y: 64
img_size_y: 128
output_y: 32
img_size_y: 64
padding_y: 0
}
}
height: 32
width: 32
}
layers {
name: "__fc_layer_0__"
......@@ -78,6 +90,8 @@ layers {
input_layer_name: "__pool_0__"
input_parameter_name: "___fc_layer_0__.w0"
}
height: 32
width: 32
}
parameters {
name: "___conv_0__.w0"
......
......@@ -4,6 +4,8 @@ layers {
type: "data"
size: 2304
active_type: ""
height: 48
width: 48
}
layers {
name: "__conv_0__"
......@@ -26,11 +28,15 @@ layers {
filter_size_y: 3
padding_y: 1
stride_y: 1
output_y: 48
img_size_y: 48
}
}
bias_parameter_name: "___conv_0__.wbias"
num_filters: 16
shared_biases: true
height: 48
width: 48
}
layers {
name: "__maxout_layer_0__"
......@@ -40,12 +46,16 @@ layers {
inputs {
input_layer_name: "__conv_0__"
maxout_conf {
channels: 16
image_conf {
channels: 16
img_size: 48
img_size_y: 48
}
groups: 2
img_size_x: 0
img_size_y: 0
}
}
height: 48
width: 48
}
layers {
name: "__pool_0__"
......@@ -69,48 +79,58 @@ layers {
padding_y: 0
}
}
height: 24
width: 24
}
layers {
name: "__conv_1__"
type: "exconv"
size: 18432
size: 73728
active_type: ""
inputs {
input_layer_name: "__pool_0__"
input_parameter_name: "___conv_1__.w0"
conv_conf {
filter_size: 3
channels: 32
channels: 8
stride: 1
padding: 1
groups: 1
filter_channels: 32
output_x: 12
img_size: 12
filter_channels: 8
output_x: 24
img_size: 24
caffe_mode: true
filter_size_y: 3
padding_y: 1
stride_y: 1
output_y: 24
img_size_y: 24
}
}
bias_parameter_name: "___conv_1__.wbias"
num_filters: 128
shared_biases: true
height: 24
width: 24
}
layers {
name: "__maxout_layer_1__"
type: "maxout"
size: 9216
size: 18432
active_type: ""
inputs {
input_layer_name: "__conv_0__"
input_layer_name: "__conv_1__"
maxout_conf {
channels: 128
image_conf {
channels: 128
img_size: 24
img_size_y: 24
}
groups: 4
img_size_x: 0
img_size_y: 0
}
}
height: 24
width: 24
}
layers {
name: "__block_expand_layer_0__"
......@@ -118,7 +138,7 @@ layers {
size: 192
active_type: ""
inputs {
input_layer_name: "__maxout_layer_0__"
input_layer_name: "__maxout_layer_1__"
block_expand_conf {
channels: 32
stride_x: 1
......@@ -133,6 +153,8 @@ layers {
img_size_y: 0
}
}
height: 24
width: 24
}
layers {
name: "__fc_layer_0__"
......@@ -143,6 +165,8 @@ layers {
input_layer_name: "__block_expand_layer_0__"
input_parameter_name: "___fc_layer_0__.w0"
}
height: 24
width: 24
}
parameters {
name: "___conv_0__.w0"
......@@ -164,9 +188,9 @@ parameters {
}
parameters {
name: "___conv_1__.w0"
size: 36864
size: 9216
initial_mean: 0.0
initial_std: 0.0833333333333
initial_std: 0.166666666667
initial_strategy: 0
initial_smart: false
}
......
......@@ -4,6 +4,8 @@ layers {
type: "data"
size: 3200
active_type: ""
height: 20
width: 10
}
layers {
name: "__spp_0__"
......@@ -13,13 +15,17 @@ layers {
inputs {
input_layer_name: "data"
spp_conf {
image_conf {
channels: 16
img_size: 10
img_size_y: 20
}
pool_type: "max-projection"
pyramid_height: 2
channels: 16
img_size: 10
img_size_y: 20
}
}
height: 1
width: 5
}
input_layer_names: "data"
output_layer_names: "__spp_0__"
......
......@@ -5,13 +5,31 @@ set -e
protostr=`dirname $0`/protostr
files=`ls $protostr | grep -v "unitest"`
files=`ls $protostr | grep -v "unittest"`
./generate_protostr.sh
for file in $files
do
base_protostr=$protostr/$file
new_protostr=$protostr/$file.unitest
diff $base_protostr $new_protostr -u
done
. ./file_list.sh
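# If a protobuf_equal binary is passed as $1 (built only when protobuf 3 is
# available, see the CMake change above), use it for an approximate message
# comparison; otherwise fall back to a plain textual diff.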
if [ -z $1 ]; then
for file in $files
do
base_protostr=$protostr/$file
new_protostr=$protostr/$file.unittest
diff $base_protostr $new_protostr -u
done
else
for file in ${configs[*]}
do
if ! $1 $protostr/$file.protostr $protostr/$file.protostr.unittest; then
diff $protostr/$file.protostr $protostr/$file.protostr.unittest -u
fi
done
for file in ${whole_configs[*]}
do
if ! $1 $protostr/$file.protostr $protostr/$file.protostr.unittest --whole; then
diff $protostr/$file.protostr $protostr/$file.protostr.unittest -u
fi
done
fi
......@@ -17,7 +17,7 @@ bilinear = bilinear_interp_layer(input=conv, out_size_x=64, out_size_y=64)
pool = img_pool_layer(
input=bilinear,
num_channels=4,
num_channels=16,
pool_size=2,
stride=2,
pool_type=MaxPooling())
......
......@@ -2,7 +2,7 @@ from paddle.trainer_config_helpers import *
settings(batch_size=1000, learning_rate=1e-5)
data = data_layer(name='data', size=2304)
data = data_layer(name='data', size=2304, height=48, width=48)
conv = img_conv_layer(
input=data,
......@@ -21,16 +21,21 @@ pool = img_pool_layer(
conv2 = img_conv_layer(
input=pool,
filter_size=3,
num_channels=32,
num_channels=8,
num_filters=128,
padding=1,
act=LinearActivation(),
bias_attr=True)
maxout2 = maxout_layer(input=conv, num_channels=128, groups=4)
maxout2 = maxout_layer(input=conv2, num_channels=128, groups=4)
block = block_expand_layer(
input=maxout, num_channels=32, stride_x=1, stride_y=1, block_x=1, block_y=6)
input=maxout2,
num_channels=32,
stride_x=1,
stride_y=1,
block_x=1,
block_y=6)
fc = fc_layer(input=block, size=384, bias_attr=False)
......
......@@ -2,13 +2,9 @@ from paddle.trainer_config_helpers import *
settings(batch_size=100, learning_rate=1e-5)
data = data_layer(name='data', size=3200)
data = data_layer(name='data', size=3200, height=20, width=10)
spp = spp_layer(
input=data,
pyramid_height=2,
num_channels=16,
pool_type=MaxPooling(),
img_width=10)
input=data, pyramid_height=2, num_channels=16, pool_type=MaxPooling())
outputs(spp)