Commit 6f6ecbec authored by guru4elephant, committed by Tao Luo

remove benchmark folder, since there is a benchmark repo already, distributed benchmark will be maintained in fleet repo (#18537)

test=develop
Parent 1f1cc222
paddle/image/logs
paddle/image/*.pyc
paddle/image/train.list
paddle/rnn/logs
paddle/rnn/*.pyc
paddle/rnn/imdb.pkl
caffe/image/logs
tensorflow/image/logs
tensorflow/rnn/logs
fluid/models/*.pyc
fluid/logs
fluid/nohup.out
name: "alexnet"
input: "data"
input_dim: 64
input_dim: 3
input_dim: 227
input_dim: 227
input: "label"
input_dim: 64
input_dim: 1
input_dim: 1
input_dim: 1
force_backward: true
layer {
name: "conv1"
type: "Convolution"
bottom: "data"
top: "conv1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 96
kernel_size: 11
stride: 4
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layer {
name: "relu1"
type: "ReLU"
bottom: "conv1"
top: "conv1"
}
layer {
name: "norm1"
type: "LRN"
bottom: "conv1"
top: "norm1"
lrn_param {
local_size: 5
alpha: 0.0001
beta: 0.75
}
}
layer {
name: "pool1"
type: "Pooling"
bottom: "norm1"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "conv2"
type: "Convolution"
bottom: "pool1"
top: "conv2"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 256
pad: 2
kernel_size: 5
group: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0.1
}
}
}
layer {
name: "relu2"
type: "ReLU"
bottom: "conv2"
top: "conv2"
}
layer {
name: "norm2"
type: "LRN"
bottom: "conv2"
top: "norm2"
lrn_param {
local_size: 5
alpha: 0.0001
beta: 0.75
}
}
layer {
name: "pool2"
type: "Pooling"
bottom: "norm2"
top: "pool2"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "conv3"
type: "Convolution"
bottom: "pool2"
top: "conv3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 384
pad: 1
kernel_size: 3
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layer {
name: "relu3"
type: "ReLU"
bottom: "conv3"
top: "conv3"
}
layer {
name: "conv4"
type: "Convolution"
bottom: "conv3"
top: "conv4"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 384
pad: 1
kernel_size: 3
group: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0.1
}
}
}
layer {
name: "relu4"
type: "ReLU"
bottom: "conv4"
top: "conv4"
}
layer {
name: "conv5"
type: "Convolution"
bottom: "conv4"
top: "conv5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 256
pad: 1
kernel_size: 3
group: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0.1
}
}
}
layer {
name: "relu5"
type: "ReLU"
bottom: "conv5"
top: "conv5"
}
layer {
name: "pool5"
type: "Pooling"
bottom: "conv5"
top: "pool5"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "fc6"
type: "InnerProduct"
bottom: "pool5"
top: "fc6"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
inner_product_param {
num_output: 4096
weight_filler {
type: "gaussian"
std: 0.005
}
bias_filler {
type: "constant"
value: 0.1
}
}
}
layer {
name: "relu6"
type: "ReLU"
bottom: "fc6"
top: "fc6"
}
layer {
name: "drop6"
type: "Dropout"
bottom: "fc6"
top: "fc6"
dropout_param {
dropout_ratio: 0.5
}
}
layer {
name: "fc7"
type: "InnerProduct"
bottom: "fc6"
top: "fc7"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
inner_product_param {
num_output: 4096
weight_filler {
type: "gaussian"
std: 0.005
}
bias_filler {
type: "constant"
value: 0.1
}
}
}
layer {
name: "relu7"
type: "ReLU"
bottom: "fc7"
top: "fc7"
}
layer {
name: "drop7"
type: "Dropout"
bottom: "fc7"
top: "fc7"
dropout_param {
dropout_ratio: 0.5
}
}
layer {
name: "fc8"
type: "InnerProduct"
bottom: "fc7"
top: "fc8"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
inner_product_param {
num_output: 1000
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layer {
name: "loss"
type: "SoftmaxWithLoss"
bottom: "fc8"
bottom: "label"
top: "loss"
}
name: "googlenet"
input: "data"
input_dim: 128
input_dim: 3
input_dim: 224
input_dim: 224
input: "label"
input_dim: 128
input_dim: 1
input_dim: 1
input_dim: 1
layer {
name: "conv1/7x7_s2"
type: "Convolution"
bottom: "data"
top: "conv1/7x7_s2"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
pad: 3
kernel_size: 7
stride: 2
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "conv1/relu_7x7"
type: "ReLU"
bottom: "conv1/7x7_s2"
top: "conv1/7x7_s2"
}
layer {
name: "pool1/3x3_s2"
type: "Pooling"
bottom: "conv1/7x7_s2"
top: "pool1/3x3_s2"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
#layer {
# name: "pool1/norm1"
# type: "LRN"
# bottom: "pool1/3x3_s2"
# top: "pool1/norm1"
# lrn_param {
# local_size: 5
# alpha: 0.0001
# beta: 0.75
# }
#}
layer {
name: "conv2/3x3_reduce"
type: "Convolution"
# bottom: "pool1/norm1"
bottom: "pool1/3x3_s2"
top: "conv2/3x3_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "conv2/relu_3x3_reduce"
type: "ReLU"
bottom: "conv2/3x3_reduce"
top: "conv2/3x3_reduce"
}
layer {
name: "conv2/3x3"
type: "Convolution"
bottom: "conv2/3x3_reduce"
top: "conv2/3x3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 192
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "conv2/relu_3x3"
type: "ReLU"
bottom: "conv2/3x3"
top: "conv2/3x3"
}
#layer {
# name: "conv2/norm2"
# type: "LRN"
# bottom: "conv2/3x3"
# top: "conv2/norm2"
# lrn_param {
# local_size: 5
# alpha: 0.0001
# beta: 0.75
# }
#}
layer {
name: "pool2/3x3_s2"
type: "Pooling"
# bottom: "conv2/norm2"
bottom: "conv2/3x3"
top: "pool2/3x3_s2"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "inception_3a/1x1"
type: "Convolution"
bottom: "pool2/3x3_s2"
top: "inception_3a/1x1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3a/relu_1x1"
type: "ReLU"
bottom: "inception_3a/1x1"
top: "inception_3a/1x1"
}
layer {
name: "inception_3a/3x3_reduce"
type: "Convolution"
bottom: "pool2/3x3_s2"
top: "inception_3a/3x3_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 96
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3a/relu_3x3_reduce"
type: "ReLU"
bottom: "inception_3a/3x3_reduce"
top: "inception_3a/3x3_reduce"
}
layer {
name: "inception_3a/3x3"
type: "Convolution"
bottom: "inception_3a/3x3_reduce"
top: "inception_3a/3x3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3a/relu_3x3"
type: "ReLU"
bottom: "inception_3a/3x3"
top: "inception_3a/3x3"
}
layer {
name: "inception_3a/5x5_reduce"
type: "Convolution"
bottom: "pool2/3x3_s2"
top: "inception_3a/5x5_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 16
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3a/relu_5x5_reduce"
type: "ReLU"
bottom: "inception_3a/5x5_reduce"
top: "inception_3a/5x5_reduce"
}
layer {
name: "inception_3a/5x5"
type: "Convolution"
bottom: "inception_3a/5x5_reduce"
top: "inception_3a/5x5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 32
pad: 2
kernel_size: 5
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3a/relu_5x5"
type: "ReLU"
bottom: "inception_3a/5x5"
top: "inception_3a/5x5"
}
layer {
name: "inception_3a/pool"
type: "Pooling"
bottom: "pool2/3x3_s2"
top: "inception_3a/pool"
pooling_param {
pool: MAX
kernel_size: 3
stride: 1
pad: 1
}
}
layer {
name: "inception_3a/pool_proj"
type: "Convolution"
bottom: "inception_3a/pool"
top: "inception_3a/pool_proj"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 32
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3a/relu_pool_proj"
type: "ReLU"
bottom: "inception_3a/pool_proj"
top: "inception_3a/pool_proj"
}
layer {
name: "inception_3a/output"
type: "Concat"
bottom: "inception_3a/1x1"
bottom: "inception_3a/3x3"
bottom: "inception_3a/5x5"
bottom: "inception_3a/pool_proj"
top: "inception_3a/output"
}
layer {
name: "inception_3b/1x1"
type: "Convolution"
bottom: "inception_3a/output"
top: "inception_3b/1x1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3b/relu_1x1"
type: "ReLU"
bottom: "inception_3b/1x1"
top: "inception_3b/1x1"
}
layer {
name: "inception_3b/3x3_reduce"
type: "Convolution"
bottom: "inception_3a/output"
top: "inception_3b/3x3_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3b/relu_3x3_reduce"
type: "ReLU"
bottom: "inception_3b/3x3_reduce"
top: "inception_3b/3x3_reduce"
}
layer {
name: "inception_3b/3x3"
type: "Convolution"
bottom: "inception_3b/3x3_reduce"
top: "inception_3b/3x3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 192
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3b/relu_3x3"
type: "ReLU"
bottom: "inception_3b/3x3"
top: "inception_3b/3x3"
}
layer {
name: "inception_3b/5x5_reduce"
type: "Convolution"
bottom: "inception_3a/output"
top: "inception_3b/5x5_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 32
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3b/relu_5x5_reduce"
type: "ReLU"
bottom: "inception_3b/5x5_reduce"
top: "inception_3b/5x5_reduce"
}
layer {
name: "inception_3b/5x5"
type: "Convolution"
bottom: "inception_3b/5x5_reduce"
top: "inception_3b/5x5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 96
pad: 2
kernel_size: 5
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3b/relu_5x5"
type: "ReLU"
bottom: "inception_3b/5x5"
top: "inception_3b/5x5"
}
layer {
name: "inception_3b/pool"
type: "Pooling"
bottom: "inception_3a/output"
top: "inception_3b/pool"
pooling_param {
pool: MAX
kernel_size: 3
stride: 1
pad: 1
}
}
layer {
name: "inception_3b/pool_proj"
type: "Convolution"
bottom: "inception_3b/pool"
top: "inception_3b/pool_proj"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3b/relu_pool_proj"
type: "ReLU"
bottom: "inception_3b/pool_proj"
top: "inception_3b/pool_proj"
}
layer {
name: "inception_3b/output"
type: "Concat"
bottom: "inception_3b/1x1"
bottom: "inception_3b/3x3"
bottom: "inception_3b/5x5"
bottom: "inception_3b/pool_proj"
top: "inception_3b/output"
}
layer {
name: "pool3/3x3_s2"
type: "Pooling"
bottom: "inception_3b/output"
top: "pool3/3x3_s2"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "inception_4a/1x1"
type: "Convolution"
bottom: "pool3/3x3_s2"
top: "inception_4a/1x1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 192
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4a/relu_1x1"
type: "ReLU"
bottom: "inception_4a/1x1"
top: "inception_4a/1x1"
}
layer {
name: "inception_4a/3x3_reduce"
type: "Convolution"
bottom: "pool3/3x3_s2"
top: "inception_4a/3x3_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 96
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4a/relu_3x3_reduce"
type: "ReLU"
bottom: "inception_4a/3x3_reduce"
top: "inception_4a/3x3_reduce"
}
layer {
name: "inception_4a/3x3"
type: "Convolution"
bottom: "inception_4a/3x3_reduce"
top: "inception_4a/3x3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 208
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4a/relu_3x3"
type: "ReLU"
bottom: "inception_4a/3x3"
top: "inception_4a/3x3"
}
layer {
name: "inception_4a/5x5_reduce"
type: "Convolution"
bottom: "pool3/3x3_s2"
top: "inception_4a/5x5_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 16
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4a/relu_5x5_reduce"
type: "ReLU"
bottom: "inception_4a/5x5_reduce"
top: "inception_4a/5x5_reduce"
}
layer {
name: "inception_4a/5x5"
type: "Convolution"
bottom: "inception_4a/5x5_reduce"
top: "inception_4a/5x5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 48
pad: 2
kernel_size: 5
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4a/relu_5x5"
type: "ReLU"
bottom: "inception_4a/5x5"
top: "inception_4a/5x5"
}
layer {
name: "inception_4a/pool"
type: "Pooling"
bottom: "pool3/3x3_s2"
top: "inception_4a/pool"
pooling_param {
pool: MAX
kernel_size: 3
stride: 1
pad: 1
}
}
layer {
name: "inception_4a/pool_proj"
type: "Convolution"
bottom: "inception_4a/pool"
top: "inception_4a/pool_proj"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4a/relu_pool_proj"
type: "ReLU"
bottom: "inception_4a/pool_proj"
top: "inception_4a/pool_proj"
}
layer {
name: "inception_4a/output"
type: "Concat"
bottom: "inception_4a/1x1"
bottom: "inception_4a/3x3"
bottom: "inception_4a/5x5"
bottom: "inception_4a/pool_proj"
top: "inception_4a/output"
}
#layer {
# name: "loss1/ave_pool"
# type: "Pooling"
# bottom: "inception_4a/output"
# top: "loss1/ave_pool"
# pooling_param {
# pool: AVE
# kernel_size: 5
# stride: 3
# }
#}
#layer {
# name: "loss1/conv"
# type: "Convolution"
# bottom: "loss1/ave_pool"
# top: "loss1/conv"
# param {
# lr_mult: 1
# decay_mult: 1
# }
# param {
# lr_mult: 2
# decay_mult: 0
# }
# convolution_param {
# num_output: 128
# kernel_size: 1
# weight_filler {
# type: "xavier"
# }
# bias_filler {
# type: "constant"
# value: 0.2
# }
# }
#}
#layer {
# name: "loss1/relu_conv"
# type: "ReLU"
# bottom: "loss1/conv"
# top: "loss1/conv"
#}
#layer {
# name: "loss1/fc"
# type: "InnerProduct"
# bottom: "loss1/conv"
# top: "loss1/fc"
# param {
# lr_mult: 1
# decay_mult: 1
# }
# param {
# lr_mult: 2
# decay_mult: 0
# }
# inner_product_param {
# num_output: 1024
# weight_filler {
# type: "xavier"
# }
# bias_filler {
# type: "constant"
# value: 0.2
# }
# }
#}
#layer {
# name: "loss1/relu_fc"
# type: "ReLU"
# bottom: "loss1/fc"
# top: "loss1/fc"
#}
#layer {
# name: "loss1/drop_fc"
# type: "Dropout"
# bottom: "loss1/fc"
# top: "loss1/fc"
# dropout_param {
# dropout_ratio: 0.7
# }
#}
#layer {
# name: "loss1/classifier"
# type: "InnerProduct"
# bottom: "loss1/fc"
# top: "loss1/classifier"
# param {
# lr_mult: 1
# decay_mult: 1
# }
# param {
# lr_mult: 2
# decay_mult: 0
# }
# inner_product_param {
# num_output: 1000
# weight_filler {
# type: "xavier"
# }
# bias_filler {
# type: "constant"
# value: 0
# }
# }
#}
#layer {
# name: "loss1/loss"
# type: "SoftmaxWithLoss"
# bottom: "loss1/classifier"
# bottom: "label"
# top: "loss1/loss1"
# loss_weight: 0.3
#}
layer {
name: "inception_4b/1x1"
type: "Convolution"
bottom: "inception_4a/output"
top: "inception_4b/1x1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 160
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4b/relu_1x1"
type: "ReLU"
bottom: "inception_4b/1x1"
top: "inception_4b/1x1"
}
layer {
name: "inception_4b/3x3_reduce"
type: "Convolution"
bottom: "inception_4a/output"
top: "inception_4b/3x3_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 112
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4b/relu_3x3_reduce"
type: "ReLU"
bottom: "inception_4b/3x3_reduce"
top: "inception_4b/3x3_reduce"
}
layer {
name: "inception_4b/3x3"
type: "Convolution"
bottom: "inception_4b/3x3_reduce"
top: "inception_4b/3x3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 224
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4b/relu_3x3"
type: "ReLU"
bottom: "inception_4b/3x3"
top: "inception_4b/3x3"
}
layer {
name: "inception_4b/5x5_reduce"
type: "Convolution"
bottom: "inception_4a/output"
top: "inception_4b/5x5_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 24
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4b/relu_5x5_reduce"
type: "ReLU"
bottom: "inception_4b/5x5_reduce"
top: "inception_4b/5x5_reduce"
}
layer {
name: "inception_4b/5x5"
type: "Convolution"
bottom: "inception_4b/5x5_reduce"
top: "inception_4b/5x5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
pad: 2
kernel_size: 5
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4b/relu_5x5"
type: "ReLU"
bottom: "inception_4b/5x5"
top: "inception_4b/5x5"
}
layer {
name: "inception_4b/pool"
type: "Pooling"
bottom: "inception_4a/output"
top: "inception_4b/pool"
pooling_param {
pool: MAX
kernel_size: 3
stride: 1
pad: 1
}
}
layer {
name: "inception_4b/pool_proj"
type: "Convolution"
bottom: "inception_4b/pool"
top: "inception_4b/pool_proj"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4b/relu_pool_proj"
type: "ReLU"
bottom: "inception_4b/pool_proj"
top: "inception_4b/pool_proj"
}
layer {
name: "inception_4b/output"
type: "Concat"
bottom: "inception_4b/1x1"
bottom: "inception_4b/3x3"
bottom: "inception_4b/5x5"
bottom: "inception_4b/pool_proj"
top: "inception_4b/output"
}
layer {
name: "inception_4c/1x1"
type: "Convolution"
bottom: "inception_4b/output"
top: "inception_4c/1x1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4c/relu_1x1"
type: "ReLU"
bottom: "inception_4c/1x1"
top: "inception_4c/1x1"
}
layer {
name: "inception_4c/3x3_reduce"
type: "Convolution"
bottom: "inception_4b/output"
top: "inception_4c/3x3_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4c/relu_3x3_reduce"
type: "ReLU"
bottom: "inception_4c/3x3_reduce"
top: "inception_4c/3x3_reduce"
}
layer {
name: "inception_4c/3x3"
type: "Convolution"
bottom: "inception_4c/3x3_reduce"
top: "inception_4c/3x3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 256
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4c/relu_3x3"
type: "ReLU"
bottom: "inception_4c/3x3"
top: "inception_4c/3x3"
}
layer {
name: "inception_4c/5x5_reduce"
type: "Convolution"
bottom: "inception_4b/output"
top: "inception_4c/5x5_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 24
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4c/relu_5x5_reduce"
type: "ReLU"
bottom: "inception_4c/5x5_reduce"
top: "inception_4c/5x5_reduce"
}
layer {
name: "inception_4c/5x5"
type: "Convolution"
bottom: "inception_4c/5x5_reduce"
top: "inception_4c/5x5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
pad: 2
kernel_size: 5
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4c/relu_5x5"
type: "ReLU"
bottom: "inception_4c/5x5"
top: "inception_4c/5x5"
}
layer {
name: "inception_4c/pool"
type: "Pooling"
bottom: "inception_4b/output"
top: "inception_4c/pool"
pooling_param {
pool: MAX
kernel_size: 3
stride: 1
pad: 1
}
}
layer {
name: "inception_4c/pool_proj"
type: "Convolution"
bottom: "inception_4c/pool"
top: "inception_4c/pool_proj"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4c/relu_pool_proj"
type: "ReLU"
bottom: "inception_4c/pool_proj"
top: "inception_4c/pool_proj"
}
layer {
name: "inception_4c/output"
type: "Concat"
bottom: "inception_4c/1x1"
bottom: "inception_4c/3x3"
bottom: "inception_4c/5x5"
bottom: "inception_4c/pool_proj"
top: "inception_4c/output"
}
layer {
name: "inception_4d/1x1"
type: "Convolution"
bottom: "inception_4c/output"
top: "inception_4d/1x1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 112
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4d/relu_1x1"
type: "ReLU"
bottom: "inception_4d/1x1"
top: "inception_4d/1x1"
}
layer {
name: "inception_4d/3x3_reduce"
type: "Convolution"
bottom: "inception_4c/output"
top: "inception_4d/3x3_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 144
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4d/relu_3x3_reduce"
type: "ReLU"
bottom: "inception_4d/3x3_reduce"
top: "inception_4d/3x3_reduce"
}
layer {
name: "inception_4d/3x3"
type: "Convolution"
bottom: "inception_4d/3x3_reduce"
top: "inception_4d/3x3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 288
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4d/relu_3x3"
type: "ReLU"
bottom: "inception_4d/3x3"
top: "inception_4d/3x3"
}
layer {
name: "inception_4d/5x5_reduce"
type: "Convolution"
bottom: "inception_4c/output"
top: "inception_4d/5x5_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 32
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4d/relu_5x5_reduce"
type: "ReLU"
bottom: "inception_4d/5x5_reduce"
top: "inception_4d/5x5_reduce"
}
layer {
name: "inception_4d/5x5"
type: "Convolution"
bottom: "inception_4d/5x5_reduce"
top: "inception_4d/5x5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
pad: 2
kernel_size: 5
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4d/relu_5x5"
type: "ReLU"
bottom: "inception_4d/5x5"
top: "inception_4d/5x5"
}
layer {
name: "inception_4d/pool"
type: "Pooling"
bottom: "inception_4c/output"
top: "inception_4d/pool"
pooling_param {
pool: MAX
kernel_size: 3
stride: 1
pad: 1
}
}
layer {
name: "inception_4d/pool_proj"
type: "Convolution"
bottom: "inception_4d/pool"
top: "inception_4d/pool_proj"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4d/relu_pool_proj"
type: "ReLU"
bottom: "inception_4d/pool_proj"
top: "inception_4d/pool_proj"
}
layer {
name: "inception_4d/output"
type: "Concat"
bottom: "inception_4d/1x1"
bottom: "inception_4d/3x3"
bottom: "inception_4d/5x5"
bottom: "inception_4d/pool_proj"
top: "inception_4d/output"
}
#layer {
# name: "loss2/ave_pool"
# type: "Pooling"
# bottom: "inception_4d/output"
# top: "loss2/ave_pool"
# pooling_param {
# pool: AVE
# kernel_size: 5
# stride: 3
# }
#}
#layer {
# name: "loss2/conv"
# type: "Convolution"
# bottom: "loss2/ave_pool"
# top: "loss2/conv"
# param {
# lr_mult: 1
# decay_mult: 1
# }
# param {
# lr_mult: 2
# decay_mult: 0
# }
# convolution_param {
# num_output: 128
# kernel_size: 1
# weight_filler {
# type: "xavier"
# }
# bias_filler {
# type: "constant"
# value: 0.2
# }
# }
#}
#layer {
# name: "loss2/relu_conv"
# type: "ReLU"
# bottom: "loss2/conv"
# top: "loss2/conv"
#}
#layer {
# name: "loss2/fc"
# type: "InnerProduct"
# bottom: "loss2/conv"
# top: "loss2/fc"
# param {
# lr_mult: 1
# decay_mult: 1
# }
# param {
# lr_mult: 2
# decay_mult: 0
# }
# inner_product_param {
# num_output: 1024
# weight_filler {
# type: "xavier"
# }
# bias_filler {
# type: "constant"
# value: 0.2
# }
# }
#}
#layer {
# name: "loss2/relu_fc"
# type: "ReLU"
# bottom: "loss2/fc"
# top: "loss2/fc"
#}
#layer {
# name: "loss2/drop_fc"
# type: "Dropout"
# bottom: "loss2/fc"
# top: "loss2/fc"
# dropout_param {
# dropout_ratio: 0.7
# }
#}
#layer {
# name: "loss2/classifier"
# type: "InnerProduct"
# bottom: "loss2/fc"
# top: "loss2/classifier"
# param {
# lr_mult: 1
# decay_mult: 1
# }
# param {
# lr_mult: 2
# decay_mult: 0
# }
# inner_product_param {
# num_output: 1000
# weight_filler {
# type: "xavier"
# }
# bias_filler {
# type: "constant"
# value: 0
# }
# }
#}
#layer {
# name: "loss2/loss"
# type: "SoftmaxWithLoss"
# bottom: "loss2/classifier"
# bottom: "label"
# top: "loss2/loss1"
# loss_weight: 0.3
#}
layer {
name: "inception_4e/1x1"
type: "Convolution"
bottom: "inception_4d/output"
top: "inception_4e/1x1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 256
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4e/relu_1x1"
type: "ReLU"
bottom: "inception_4e/1x1"
top: "inception_4e/1x1"
}
layer {
name: "inception_4e/3x3_reduce"
type: "Convolution"
bottom: "inception_4d/output"
top: "inception_4e/3x3_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 160
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4e/relu_3x3_reduce"
type: "ReLU"
bottom: "inception_4e/3x3_reduce"
top: "inception_4e/3x3_reduce"
}
layer {
name: "inception_4e/3x3"
type: "Convolution"
bottom: "inception_4e/3x3_reduce"
top: "inception_4e/3x3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 320
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4e/relu_3x3"
type: "ReLU"
bottom: "inception_4e/3x3"
top: "inception_4e/3x3"
}
layer {
name: "inception_4e/5x5_reduce"
type: "Convolution"
bottom: "inception_4d/output"
top: "inception_4e/5x5_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 32
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4e/relu_5x5_reduce"
type: "ReLU"
bottom: "inception_4e/5x5_reduce"
top: "inception_4e/5x5_reduce"
}
layer {
name: "inception_4e/5x5"
type: "Convolution"
bottom: "inception_4e/5x5_reduce"
top: "inception_4e/5x5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
pad: 2
kernel_size: 5
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4e/relu_5x5"
type: "ReLU"
bottom: "inception_4e/5x5"
top: "inception_4e/5x5"
}
layer {
name: "inception_4e/pool"
type: "Pooling"
bottom: "inception_4d/output"
top: "inception_4e/pool"
pooling_param {
pool: MAX
kernel_size: 3
stride: 1
pad: 1
}
}
layer {
name: "inception_4e/pool_proj"
type: "Convolution"
bottom: "inception_4e/pool"
top: "inception_4e/pool_proj"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4e/relu_pool_proj"
type: "ReLU"
bottom: "inception_4e/pool_proj"
top: "inception_4e/pool_proj"
}
layer {
name: "inception_4e/output"
type: "Concat"
bottom: "inception_4e/1x1"
bottom: "inception_4e/3x3"
bottom: "inception_4e/5x5"
bottom: "inception_4e/pool_proj"
top: "inception_4e/output"
}
layer {
name: "pool4/3x3_s2"
type: "Pooling"
bottom: "inception_4e/output"
top: "pool4/3x3_s2"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "inception_5a/1x1"
type: "Convolution"
bottom: "pool4/3x3_s2"
top: "inception_5a/1x1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 256
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5a/relu_1x1"
type: "ReLU"
bottom: "inception_5a/1x1"
top: "inception_5a/1x1"
}
layer {
name: "inception_5a/3x3_reduce"
type: "Convolution"
bottom: "pool4/3x3_s2"
top: "inception_5a/3x3_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 160
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5a/relu_3x3_reduce"
type: "ReLU"
bottom: "inception_5a/3x3_reduce"
top: "inception_5a/3x3_reduce"
}
layer {
name: "inception_5a/3x3"
type: "Convolution"
bottom: "inception_5a/3x3_reduce"
top: "inception_5a/3x3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 320
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5a/relu_3x3"
type: "ReLU"
bottom: "inception_5a/3x3"
top: "inception_5a/3x3"
}
layer {
name: "inception_5a/5x5_reduce"
type: "Convolution"
bottom: "pool4/3x3_s2"
top: "inception_5a/5x5_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 32
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5a/relu_5x5_reduce"
type: "ReLU"
bottom: "inception_5a/5x5_reduce"
top: "inception_5a/5x5_reduce"
}
layer {
name: "inception_5a/5x5"
type: "Convolution"
bottom: "inception_5a/5x5_reduce"
top: "inception_5a/5x5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
pad: 2
kernel_size: 5
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5a/relu_5x5"
type: "ReLU"
bottom: "inception_5a/5x5"
top: "inception_5a/5x5"
}
layer {
name: "inception_5a/pool"
type: "Pooling"
bottom: "pool4/3x3_s2"
top: "inception_5a/pool"
pooling_param {
pool: MAX
kernel_size: 3
stride: 1
pad: 1
}
}
layer {
name: "inception_5a/pool_proj"
type: "Convolution"
bottom: "inception_5a/pool"
top: "inception_5a/pool_proj"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5a/relu_pool_proj"
type: "ReLU"
bottom: "inception_5a/pool_proj"
top: "inception_5a/pool_proj"
}
layer {
name: "inception_5a/output"
type: "Concat"
bottom: "inception_5a/1x1"
bottom: "inception_5a/3x3"
bottom: "inception_5a/5x5"
bottom: "inception_5a/pool_proj"
top: "inception_5a/output"
}
layer {
name: "inception_5b/1x1"
type: "Convolution"
bottom: "inception_5a/output"
top: "inception_5b/1x1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 384
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5b/relu_1x1"
type: "ReLU"
bottom: "inception_5b/1x1"
top: "inception_5b/1x1"
}
layer {
name: "inception_5b/3x3_reduce"
type: "Convolution"
bottom: "inception_5a/output"
top: "inception_5b/3x3_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 192
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5b/relu_3x3_reduce"
type: "ReLU"
bottom: "inception_5b/3x3_reduce"
top: "inception_5b/3x3_reduce"
}
layer {
name: "inception_5b/3x3"
type: "Convolution"
bottom: "inception_5b/3x3_reduce"
top: "inception_5b/3x3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 384
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5b/relu_3x3"
type: "ReLU"
bottom: "inception_5b/3x3"
top: "inception_5b/3x3"
}
layer {
name: "inception_5b/5x5_reduce"
type: "Convolution"
bottom: "inception_5a/output"
top: "inception_5b/5x5_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 48
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5b/relu_5x5_reduce"
type: "ReLU"
bottom: "inception_5b/5x5_reduce"
top: "inception_5b/5x5_reduce"
}
layer {
name: "inception_5b/5x5"
type: "Convolution"
bottom: "inception_5b/5x5_reduce"
top: "inception_5b/5x5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
pad: 2
kernel_size: 5
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5b/relu_5x5"
type: "ReLU"
bottom: "inception_5b/5x5"
top: "inception_5b/5x5"
}
layer {
name: "inception_5b/pool"
type: "Pooling"
bottom: "inception_5a/output"
top: "inception_5b/pool"
pooling_param {
pool: MAX
kernel_size: 3
stride: 1
pad: 1
}
}
layer {
name: "inception_5b/pool_proj"
type: "Convolution"
bottom: "inception_5b/pool"
top: "inception_5b/pool_proj"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5b/relu_pool_proj"
type: "ReLU"
bottom: "inception_5b/pool_proj"
top: "inception_5b/pool_proj"
}
layer {
name: "inception_5b/output"
type: "Concat"
bottom: "inception_5b/1x1"
bottom: "inception_5b/3x3"
bottom: "inception_5b/5x5"
bottom: "inception_5b/pool_proj"
top: "inception_5b/output"
}
layer {
name: "pool5/7x7_s1"
type: "Pooling"
bottom: "inception_5b/output"
top: "pool5/7x7_s1"
pooling_param {
pool: AVE
kernel_size: 7
stride: 1
}
}
layer {
name: "pool5/drop_7x7_s1"
type: "Dropout"
bottom: "pool5/7x7_s1"
top: "pool5/7x7_s1"
dropout_param {
dropout_ratio: 0.4
}
}
layer {
name: "loss3/classifier"
type: "InnerProduct"
bottom: "pool5/7x7_s1"
top: "loss3/classifier"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
inner_product_param {
num_output: 1000
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0
}
}
}
layer {
name: "loss3/loss3"
type: "SoftmaxWithLoss"
bottom: "loss3/classifier"
bottom: "label"
top: "loss3/loss3"
loss_weight: 1
}
#!/bin/bash
set -e
function test() {
cfg=$1
batch=$2
prefix=$3
sed -i "/input: \"data\"/{n;s/^input_dim.*/input_dim: $batch/g}" $cfg
sed -i "/input: \"label\"/{n;s/^input_dim.*/input_dim: $batch/g}" $cfg
caffe time --model=$cfg --iterations=50 --gpu 0 > logs/$prefix-1gpu-batch${batch}.log 2>&1
}
if [ ! -d "logs" ]; then
mkdir logs
fi
# alexnet
test alexnet.prototxt 64 alexnet
test alexnet.prototxt 128 alexnet
test alexnet.prototxt 256 alexnet
test alexnet.prototxt 512 alexnet
# googlenet
test googlenet.prototxt 64 googlenet
test googlenet.prototxt 128 googlenet
# small net
test smallnet_mnist_cifar.prototxt 64 smallnet
test smallnet_mnist_cifar.prototxt 128 smallnet
test smallnet_mnist_cifar.prototxt 256 smallnet
test smallnet_mnist_cifar.prototxt 512 smallnet
#!/bin/bash
set -e
function test() {
cfg=$1
batch=$2
prefix=$3
batch_per_gpu=`expr ${batch} / 4`
sed -i "/input: \"data\"/{n;s/^input_dim.*/input_dim: ${batch_per_gpu}/g}" $cfg
sed -i "/input: \"label\"/{n;s/^input_dim.*/input_dim: ${batch_per_gpu}/g}" $cfg
sed -i "1c\net : \"${cfg}\"" solver.prototxt
caffe train --solver=solver.prototxt -gpu 0,1,2,3 > logs/${prefix}-4gpu-batch${batch}.log 2>&1
}
if [ ! -d "logs" ]; then
mkdir logs
fi
# alexnet
test alexnet.prototxt 512 alexnet
test alexnet.prototxt 1024 alexnet
# googlenet
test googlenet.prototxt 512 googlenet
name: "mnist/cifar"
input: "data"
input_dim: 128
input_dim: 3
input_dim: 32
input_dim: 32
input: "label"
input_dim: 128
input_dim: 1
input_dim: 1
input_dim: 1
layer {
name: "conv1"
type: "Convolution"
bottom: "data"
top: "conv1"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 32
pad: 2
kernel_size: 5
stride: 1
weight_filler {
type: "gaussian"
std: 0.0001
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "pool1"
type: "Pooling"
bottom: "conv1"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "relu1"
type: "ReLU"
bottom: "pool1"
top: "pool1"
}
layer {
name: "conv2"
type: "Convolution"
bottom: "pool1"
top: "conv2"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 32
pad: 2
kernel_size: 5
stride: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "relu2"
type: "ReLU"
bottom: "conv2"
top: "conv2"
}
layer {
name: "pool2"
type: "Pooling"
bottom: "conv2"
top: "pool2"
pooling_param {
pool: AVE
kernel_size: 3
stride: 2
}
}
layer {
name: "conv3"
type: "Convolution"
bottom: "pool2"
top: "conv3"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 64
pad: 2
kernel_size: 5
stride: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "relu3"
type: "ReLU"
bottom: "conv3"
top: "conv3"
}
layer {
name: "pool3"
type: "Pooling"
bottom: "conv3"
top: "pool3"
pooling_param {
pool: AVE
kernel_size: 3
stride: 2
}
}
layer {
name: "ip1"
type: "InnerProduct"
bottom: "pool3"
top: "ip1"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
inner_product_param {
num_output: 64
weight_filler {
type: "gaussian"
std: 0.1
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "ip2"
type: "InnerProduct"
bottom: "ip1"
top: "ip2"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
inner_product_param {
num_output: 10
weight_filler {
type: "gaussian"
std: 0.1
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "accuracy"
type: "Accuracy"
bottom: "ip2"
bottom: "label"
top: "accuracy"
include {
phase: TEST
}
}
layer {
name: "loss"
type: "SoftmaxWithLoss"
bottom: "ip2"
bottom: "label"
top: "loss"
}
net: "alexnet.prototxt"
base_lr: 0.01
lr_policy: "fixed"
display: 20
max_iter: 200
momentum: 0.9
weight_decay: 0.0005
snapshot: 10000
snapshot_prefix: "models/caffe_alexnet_train"
solver_mode: GPU
FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
# Use UBUNTU_MIRROR can speed up apt-get speed.
# ARG UBUNTU_MIRROR
# RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i "s#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g" /etc/apt/sources.list; fi'
RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv
RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so
# IMPORTANT:
# Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
# example: unset http_proxy && unset https_proxy && python fluid_benchmark.py ...
RUN pip install -U pip
RUN pip install -U kubernetes paddlepaddle
RUN pip uninstall -y paddlepaddle && mkdir /workspace
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
RUN chmod +x /usr/bin/paddle_k8s
ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl
ENV LD_LIBRARY_PATH=/usr/local/lib
ADD fluid_benchmark.py recordio_converter.py args.py run.sh run_fluid_benchmark.sh imagenet_reader.py /workspace/
ADD models/ /workspace/models/
# Fluid Benchmark
This directory contains several model configurations and tools used to run
Fluid benchmarks for local and distributed training.
## Run the Benchmark
To start, run the following command to get the full help message:
```bash
python fluid_benchmark.py --help
```
Currently supported `--model` arguments include:
* mnist
* resnet
* you can choose a different dataset using `--data_set cifar10` or
`--data_set flowers`.
* vgg
* stacked_dynamic_lstm
* machine_translation
* Run the following command to start a benchmark job locally:
```bash
python fluid_benchmark.py --model mnist --device GPU
```
You can choose between GPU and CPU training. With GPU training, you can specify
`--gpus <gpu_num>` to run multi-GPU training. You can also run the parameter
server in async mode: specify `--async_mode` to train the model asynchronously.
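For example, a multi-GPU local run might look like this (a sketch; `--gpus` and
`--batch_size` are flags defined in `args.py`):
```bash
python fluid_benchmark.py --model resnet --device GPU --gpus 4 --batch_size 128
```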
* Run distributed training with parameter servers:
* see [run_fluid_benchmark.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/fluid/run_fluid_benchmark.sh) as an example.
* start parameter servers:
```bash
PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method pserver
sleep 15
```
* start trainers:
```bash
PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method pserver
```
* Run distributed training using NCCL2
```bash
PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2
```
## Prepare the RecordIO file to Achieve Better Performance
Running the following command will generate RecordIO files like "mnist.recordio" under the path
and batch_size you choose. You can use batch_size=1 so that a later reader can change the
batch_size at any time using `fluid.batch`.
```bash
python -c 'from recordio_converter import *; prepare_mnist("data", 1)'
```
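Once generated, the RecordIO files can be consumed through the reader op. A sketch, assuming
the files were generated under `./data` (the `--use_reader_op` and `--data_path` flags are
defined in `args.py`):
```bash
python fluid_benchmark.py --model mnist --device GPU --use_reader_op --data_path ./data
```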
## Run Distributed Benchmark on Kubernetes Cluster
You may need to build a Docker image before submitting a cluster job onto Kubernetes; otherwise you will
have to start all those processes manually on each node, which is not recommended.
To build the Docker image, you need to choose a paddle "whl" package to run with. You may either
download it from
http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_en.html or
build it on your own. Once you have the "whl" package, put it in the current directory and run:
```bash
docker build -t [your docker image name]:[your docker image tag] .
```
Then push the image to a Docker registry that your Kubernetes cluster can reach.
We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submit
distributed benchmark jobs to your cluster. To generate a job yaml, just run:
```bash
python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --gpus 8 --device GPU --update_method pserver " --disttype pserver
```
The yaml files are then generated under the `myjob` directory, and you can run:
```bash
kubectl create -f myjob/
```
The job should then start.
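To verify that the pods actually came up, the standard kubectl commands work; for example
(the pod name below is hypothetical, adjust it to whatever `kube_gen_job.py` generated):
```bash
kubectl get pods                  # pserver and trainer pods should reach Running
kubectl logs -f myjob-trainer-0   # hypothetical pod name
```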
## Notes for Running Fluid Distributed Training with NCCL2 and RDMA
Before running NCCL2 distributed jobs, please check whether your node has multiple network
interfaces; if it does, set the environment variable `export NCCL_SOCKET_IFNAME=eth0` to select
your actual network device.
To run high-performance distributed training, your hardware environment must be able to run
RDMA-enabled network communication; please check out [this](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/cluster/nccl2_rdma_training.md)
note for details.
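Putting the two notes together, a minimal environment sketch for an NCCL2 job on a multi-NIC,
RDMA-capable node might look like this (`NCCL_IB_DISABLE` is a standard NCCL variable, not
anything Paddle-specific, and is included here as an assumption; adjust `eth0` to your actual
device):
```bash
# Bind NCCL to the NIC that actually carries training traffic.
export NCCL_SOCKET_IFNAME=eth0
# Assumption: keep the InfiniBand/RDMA transport enabled (0 = use IB verbs when available).
export NCCL_IB_DISABLE=0
PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=192.168.0.2 \
PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2
```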
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
__all__ = ['parse_args', ]
BENCHMARK_MODELS = [
"machine_translation", "resnet", "se_resnext", "vgg", "mnist",
"stacked_dynamic_lstm", "resnet_with_preprocess"
]
def parse_args():
parser = argparse.ArgumentParser('Fluid model benchmarks.')
parser.add_argument(
'--model',
type=str,
choices=BENCHMARK_MODELS,
default='resnet',
help='The model to run benchmark with.')
parser.add_argument(
'--batch_size', type=int, default=32, help='The minibatch size.')
# args related to learning rate
parser.add_argument(
'--learning_rate', type=float, default=0.001, help='The learning rate.')
# TODO(wuyi): add "--use_fake_data" option back.
parser.add_argument(
'--skip_batch_num',
type=int,
default=5,
        help='The number of initial minibatches to skip, for a better performance test'
)
parser.add_argument(
'--iterations', type=int, default=80, help='The number of minibatches.')
parser.add_argument(
'--pass_num', type=int, default=100, help='The number of passes.')
parser.add_argument(
'--data_format',
type=str,
default='NCHW',
choices=['NCHW', 'NHWC'],
        help='The data format; currently only NCHW is supported.')
parser.add_argument(
'--device',
type=str,
default='GPU',
choices=['CPU', 'GPU'],
help='The device type.')
parser.add_argument(
'--gpus',
type=int,
default=1,
help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
# this option is available only for vgg and resnet.
parser.add_argument(
'--cpus',
type=int,
default=1,
help='If cpus > 1, will set ParallelExecutor to use multiple threads.')
parser.add_argument(
'--data_set',
type=str,
default='flowers',
choices=['cifar10', 'flowers', 'imagenet'],
help='Optional dataset for benchmark.')
parser.add_argument(
'--infer_only', action='store_true', help='If set, run forward only.')
parser.add_argument(
'--use_cprof', action='store_true', help='If set, use cProfile.')
parser.add_argument(
'--use_nvprof',
action='store_true',
help='If set, use nvprof for CUDA.')
parser.add_argument(
'--no_test',
action='store_true',
        help='If set, do not evaluate on the test set during training.')
parser.add_argument(
'--memory_optimize',
action='store_true',
        help='If set, optimize runtime memory before starting.')
parser.add_argument(
'--use_fake_data',
action='store_true',
        help='If set, omit the actual data-reading operators.')
parser.add_argument(
'--profile', action='store_true', help='If set, profile a few steps.')
parser.add_argument(
'--update_method',
type=str,
default='local',
choices=['local', 'pserver', 'nccl2'],
help='Choose parameter update method, can be local, pserver, nccl2.')
parser.add_argument(
'--no_split_var',
action='store_true',
default=False,
        help='Whether to split variables into blocks when update_method is pserver')
parser.add_argument(
'--async_mode',
action='store_true',
default=False,
        help='Whether to start the pserver in async mode to support ASGD')
parser.add_argument(
'--use_reader_op',
action='store_true',
        help='Whether to use the reader op; the data path must be specified if this is set.'
)
parser.add_argument(
'--data_path',
type=str,
default="",
help='Directory that contains all the training recordio files.')
parser.add_argument(
'--test_data_path',
type=str,
default="",
help='Directory that contains all the test data (NOT recordio).')
parser.add_argument(
'--use_inference_transpiler',
action='store_true',
help='If set, use inference transpiler to optimize the program.')
parser.add_argument(
'--no_random',
action='store_true',
help='If set, keep the random seed and do not shuffle the data.')
parser.add_argument(
'--reduce_strategy',
type=str,
choices=['reduce', 'all_reduce'],
default='all_reduce',
help='Specify the reduce strategy, can be reduce, all_reduce')
parser.add_argument(
'--fuse_broadcast_op',
action='store_true',
        help='If set, fuse multiple broadcast operators into one fused_broadcast operator.'
)
args = parser.parse_args()
return args
#!/bin/bash
if [ "`uname -s`" != "Linux" ]; then
echo "Current scenario only support in Linux yet!"
exit 0
fi
echo "========================= Hardware Information ========================="
sockets=`grep 'physical id' /proc/cpuinfo | sort -u | wc -l`
cores_per_socket=`grep 'core id' /proc/cpuinfo | sort -u | wc -l`
ht=`lscpu |grep "per core" |awk -F':' '{print $2}'|xargs`
physical_cores=$((sockets * cores_per_socket))
virtual_cores=`grep 'processor' /proc/cpuinfo | sort -u | wc -l`
numa_nodes=`lscpu |grep "NUMA node(s)"|awk -F':' '{print $2}'|xargs`
echo "CPU Name : `cat /proc/cpuinfo |grep -i "model name" |uniq |awk -F ':' '{print $2}'|xargs`"
echo "CPU Family : `lscpu |grep \"CPU family\" |awk -F':' '{print $2}'|xargs`"
echo "Socket Number : $sockets"
echo "Cores Per Socket : $cores_per_socket"
echo "Total Physical Cores : $physical_cores"
echo "Total Virtual Cores : $virtual_cores"
if [ $ht -eq 1 ]; then
echo "Hyper Threading : OFF"
if [ $physical_cores -ne $virtual_cores ]; then
echo "Error: HT logical error"
fi
else
echo "Hyper Threading : ON"
if [ $physical_cores -ge $virtual_cores ]; then
echo "Error: HT logical error"
fi
fi
echo "NUMA Nodes : $numa_nodes"
if [ $numa_nodes -lt $sockets ]; then
echo "Warning: NUMA node is not enough for the best performance,\
at least $sockets"
fi
echo "-------------------------- Memory Information --------------------------"
# dmidecode support start from 2.11
dmi_ver=`dmidecode --version|awk -F '.' '{print $1}'|xargs`
if [ $dmi_ver -lt 2 ]; then
echo "Error: dmidecode unknown or version is too old"
exit 0
fi
if [ `dmidecode | grep -ic "Permission denied"` -ne 0 ]; then
echo "Error: need root to run dmidecode"
exit 0
fi
max_dimms=0
num_dimms_installed=0
for dimm_id in `dmidecode |grep Locator|sort -u | awk -F ':' '{print $2}'`; do
num_refered=`dmidecode |grep -wc "$dimm_id"`
  # the actual dimm id should be referred to only once
if [ $num_refered -eq 1 ]; then
num_unknown=`dmidecode | awk '/'$dimm_id'/ {s=1; f=0};
/Unknown/ {f=1};
/Manufacturer/ {if (s==1) {print f; exit 0;}};'`
if [ $num_unknown -eq 0 ]; then
dimms_installed="$dimms_installed \n $dimm_id"
((num_dimms_installed++))
else
dimms_uninstalled="$dimms_uninstalled \n $dimm_id"
fi
((max_dimms++))
fi
done
echo "Installed DIMM number : $num_dimms_installed"
num_dimms_mapped=`dmidecode | grep "Memory Device Mapped" | wc -l`
if [ $num_dimms_installed -ne $num_dimms_mapped ]; then
echo "Error: The installed DIMMs number does ont match the mapped memory device: $num_dimms_mapped"
fi
num_clock_configed=`dmidecode | grep -i "Configured Clock Speed" |grep -ic "Hz"`
if [ $num_dimms_installed -ne $num_clock_configed ]; then
echo "Error: The installed DIMMs number does ont match configured clocks: $num_clock_configed"
fi
echo -e "Installed DIMMs Locator: $dimms_installed"
echo -e "Not installed DIMMs : $dimms_uninstalled"
max_dimm_slots=`dmidecode | grep -c "Bank Locator"`
echo "DIMMs max slots : $max_dimm_slots"
if [ $max_dimms -ne $max_dimm_slots ]; then
echo "Error: The max dimm slots do not match the max dimms: $max_dimms"
fi
free_ver_main=`free -V|awk -F ' ' '{print $NF}'|awk -F '.' '{print $1}'`
free_ver_sub=`free -V|awk -F ' ' '{print $NF}'|awk -F '.' '{print $2}'`
if [ $free_ver_main -lt 3 ] || [ $free_ver_sub -lt 3 ]; then
mem_sz=`free |grep -i mem |awk -F' ' '{print $2}'|xargs`
swap_sz=`free |grep -i swap |awk -F' ' '{print $2}'|xargs`
total_sz=`free -t |grep -i total |tail -n 1| awk -F' ' '{print $2}'|xargs`
mem_sz="`awk 'BEGIN{printf "%.1f\n",('$mem_sz'/1024/1024)}'` GB"
swap_sz="`awk 'BEGIN{printf "%.1f\n",('$swap_sz'/1024/1024)}'` GB"
total_sz="`awk 'BEGIN{printf "%.1f\n",('$total_sz'/1024/1024)}'` GB"
else
mem_sz=`free -h |grep -i mem |awk -F' ' '{print $2}'|xargs`
swap_sz=`free -h |grep -i swap |awk -F' ' '{print $2}'|xargs`
total_sz=`free -th |grep -i total |tail -n 1| awk -F' ' '{print $2}'|xargs`
fi
echo "Memory Size : $mem_sz"
echo "Swap Memory Size : $swap_sz"
echo "Total Memory Size : $total_sz"
echo "Max Memory Capacity : `dmidecode |grep -i \"maximum capacity\"|sort -u|awk -F':' '{print $2}'|xargs`"
# DIMMs frequency
clock_speeds=`dmidecode | grep -i "Configured Clock Speed" | grep -i "Hz" |sort -u | awk -F':' '{print $2}'|xargs`
echo "Configured Clock Speed : $clock_speeds"
num_clock_type=`dmidecode | grep -i "Configured Clock Speed" | grep -i "Hz" |sort -u | wc -l`
if [ $num_clock_type -ne 1 ]; then
echo "Warning: Have more than 1 speed type, all DIMMs should have same fequency: $clock_speeds"
fi
echo "-------------------------- Turbo Information --------------------------"
scaling_drive=`cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_driver`
echo "Scaling Driver : $scaling_drive"
if [ $scaling_drive == "intel_pstate" ] && [ -e /sys/devices/system/cpu/intel_pstate/no_turbo ]; then
turbo=`cat /sys/devices/system/cpu/intel_pstate/no_turbo`
if [ $turbo -eq 1 ]; then
echo "Turbo Status : OFF"
else
echo "Turbo Status : ON"
fi
else
echo "Warning: Scaling driver is not intel_pstarte, maybe should enable it in BIOS"
echo "Turbo Status : Unknown"
fi
# cpu frequency
num_max_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq| sort -u |wc -l`
num_min_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq| sort -u |wc -l`
if [ $num_max_freq -ne 1 ]; then
echo "Error: the max_frequency of all CPU should be equal"
fi
if [ $num_min_freq -ne 1 ]; then
echo "Error: the min_frequency of all CPU should be equal"
fi
max_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq| uniq|xargs` # kHz
max_freq=`awk 'BEGIN{printf "%.2f",('$max_freq' / 1000000)}'` # GHz
min_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq| uniq|xargs` # kHz
min_freq=`awk 'BEGIN{printf "%.2f",('$min_freq' / 1000000)}'` # GHz
echo "CPU Max Frequency : $max_freq GHz"
echo "CPU Min Frequency : $min_freq GHz"
# cpu governor
num_governor=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor| sort -u |wc -l`
if [ $num_governor -ne 1 ]; then
echo "Error: the governor of all CPU should be the same"
fi
governor=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor| sort -u |uniq`
echo "CPU Freq Governor : $governor"
echo "========================= Software Information ========================="
echo "BIOS Release Date : `dmidecode | grep "Release Date"|awk -F ':' '{print $2}'|xargs`"
echo "OS Version : `cat /etc/redhat-release`"
echo "Kernel Release Version : `uname -r`"
echo "Kernel Patch Version : `uname -v`"
echo "GCC Version :`gcc --version | head -n 1|awk -F '\\\(GCC\\\)' '{print $2}'`"
if command -v cmake >/dev/null 2>&1; then
cmake_ver=`cmake --version | head -n 1 | awk -F 'version' '{print $2}'`
else
cmake_ver=" Not installed"
fi
echo "CMake Version :$cmake_ver"
echo "------------------ Environment Variables Information -------------------"
kmp_affinity=`env | grep KMP_AFFINITY`
omp_dynamic=`env | grep OMP_DYNAMIC`
omp_nested=`env | grep OMP_NESTED`
omp_num_threads=`env | grep OMP_NUM_THREADS`
mkl_num_threads=`env | grep MKL_NUM_THREADS`
mkl_dynamic=`env | grep MKL_DYNAMIC`
if [ ! $kmp_affinity ]; then kmp_affinity="unset"; fi
if [ ! $omp_dynamic ]; then omp_dynamic="unset"; fi
if [ ! $omp_nested ]; then omp_nested="unset"; fi
if [ ! $omp_num_threads ]; then omp_num_threads="unset"; fi
if [ ! $mkl_num_threads ]; then mkl_num_threads="unset"; fi
if [ ! $mkl_dynamic ]; then mkl_dynamic="unset"; fi
echo "KMP_AFFINITY : $kmp_affinity"
echo "OMP_DYNAMIC : $omp_dynamic"
echo "OMP_NESTED : $omp_nested"
echo "OMP_NUM_THREADS : $omp_num_threads"
echo "MKL_NUM_THREADS : $mkl_num_threads"
echo "MKL_DYNAMIC : $mkl_dynamic"
# Check if any MKL related libraries have been installed in LD_LIBRARY_PATH
for path in `echo $LD_LIBRARY_PATH | awk -F ':' '{for(i=1;i<=NF;++i)print $i}'`; do
mkldnn_found=`find $path -name "libmkldnn.so"`
if [ "$mkldnn_found" ]; then
echo "Found MKL-DNN : $mkldnn_found"
fi
mklml_found=`find $path -name "libmklml_intel.so"`
if [ "$mklml_found" ]; then
echo "Found MKLML : $mklml_found"
fi
iomp_found=`find $path -name "libiomp5.so"`
if [ "$iomp_found" ]; then
echo "Found IOMP : $iomp_found"
fi
done
# dump all details for fully check
lscpu > lscpu.dump
dmidecode > dmidecode.dump
# The expected result would be like:
# ========================= Hardware Information =========================
# CPU Name : Intel(R) Xeon(R) Gold 6148M CPU @ 2.40GHz
# CPU Family : 6
# Socket Number : 2
# Cores Per Socket : 20
# Total Physical Cores : 40
# Total Virtual Cores : 40
# Hyper Threading : OFF
# NUMA Nodes : 2
# -------------------------- Memory Information --------------------------
# Installed DIMM number : 12
# Installed DIMMs Locator:
# CPU1_DIMM_A1
# CPU1_DIMM_B1
# CPU1_DIMM_C1
# CPU1_DIMM_D1
# CPU1_DIMM_E1
# CPU1_DIMM_F1
# CPU2_DIMM_A1
# CPU2_DIMM_B1
# CPU2_DIMM_C1
# CPU2_DIMM_D1
# CPU2_DIMM_E1
# CPU2_DIMM_F1
# Not installed DIMMs :
# CPU1_DIMM_A2
# CPU1_DIMM_B2
# CPU1_DIMM_C2
# CPU1_DIMM_D2
# CPU1_DIMM_E2
# CPU1_DIMM_F2
# CPU2_DIMM_A2
# CPU2_DIMM_B2
# CPU2_DIMM_C2
# CPU2_DIMM_D2
# CPU2_DIMM_E2
# CPU2_DIMM_F2
# DIMMs max slots : 24
# Memory Size : 376G
# Swap Memory Size : 4.0G
# Total Memory Size : 380G
# Max Memory Capacity : 2304 GB
# Configed Clock Speed : 2666 MHz
# -------------------------- Turbo Information --------------------------
# Scaling Driver : intel_pstate
# Turbo Status : ON
# CPU Max Frequency : 3.70 GHz
# CPU Min Frequency : 1.00 GHz
# CPU Freq Governor : performance
# ========================= Software Information =========================
# BIOS Release Date : 03/10/2017
# OS Version : CentOS Linux release 7.3.1611 (Core)
# Kernel Release Version : 3.10.0-514.el7.x86_64
# Kernel Patch Version : #1 SMP Tue Nov 22 16:42:41 UTC 2016
# GCC Version : 4.8.5 20150623 (Red Hat 4.8.5-11)
# CMake Version : 3.5.2
# ------------------ Environment Variables Information -------------------
# KMP_AFFINITY : unset
# OMP_DYNAMIC : unset
# OMP_NESTED : unset
# OMP_NUM_THREADS : unset
# MKL_NUM_THREADS : unset
# MKL_DYNAMIC : unset
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import cProfile
import time
import os
import traceback
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.profiler as profiler
import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler
from args import *
def append_nccl2_prepare(trainer_id, startup_prog):
if trainer_id >= 0:
# append gen_nccl_id at the end of startup program
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
port = os.getenv("PADDLE_PSERVER_PORT")
worker_ips = os.getenv("PADDLE_TRAINER_IPS")
worker_endpoints = []
for ip in worker_ips.split(","):
worker_endpoints.append(':'.join([ip, port]))
num_trainers = len(worker_endpoints)
current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port
worker_endpoints.remove(current_endpoint)
nccl_id_var = startup_prog.global_block().create_var(
name="NCCLID",
persistable=True,
type=fluid.core.VarDesc.VarType.RAW)
startup_prog.global_block().append_op(
type="gen_nccl_id",
inputs={},
outputs={"NCCLID": nccl_id_var},
attrs={
"endpoint": current_endpoint,
"endpoint_list": worker_endpoints,
"trainer_id": trainer_id
})
return nccl_id_var, num_trainers, trainer_id
else:
raise Exception("must set positive PADDLE_TRAINER_ID env variables for "
"nccl-based dist train.")
def dist_transpile(trainer_id, args, train_prog, startup_prog):
if trainer_id < 0:
return None, None
# the port of all pservers, needed by both trainer and pserver
port = os.getenv("PADDLE_PSERVER_PORT", "6174")
# comma separated ips of all pservers, needed by trainer and
# pserver
pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
eplist = []
for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, port]))
pserver_endpoints = ",".join(eplist)
# total number of workers/trainers in the job, needed by
# trainer and pserver
trainers = int(os.getenv("PADDLE_TRAINERS"))
# the IP of the local machine, needed by pserver only
current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
# the role, should be either PSERVER or TRAINER
training_role = os.getenv("PADDLE_TRAINING_ROLE")
config = fluid.DistributeTranspilerConfig()
config.slice_var_up = not args.no_split_var
config.min_block_size = 1048576
t = distribute_transpiler.DistributeTranspiler(config=config)
t.transpile(
trainer_id,
# NOTE: *MUST* use train_prog, for we are using with guard to
# generate different program for train and test.
program=train_prog,
pservers=pserver_endpoints,
trainers=trainers,
sync_mode=not args.async_mode,
startup_program=startup_prog)
if training_role == "PSERVER":
pserver_program = t.get_pserver_program(current_endpoint)
pserver_startup_program = t.get_startup_program(
current_endpoint, pserver_program, startup_program=startup_prog)
return pserver_program, pserver_startup_program
elif training_role == "TRAINER":
train_program = t.get_trainer_program()
return train_program, startup_prog
else:
raise ValueError(
'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
)
def test_parallel(exe, test_args, args, test_prog, feeder):
acc_evaluators = []
for i in xrange(len(test_args[2])):
acc_evaluators.append(fluid.metrics.Accuracy())
to_fetch = [v.name for v in test_args[2]]
if args.use_reader_op:
test_args[4].start()
while True:
try:
acc_rets = exe.run(fetch_list=to_fetch)
for i, e in enumerate(acc_evaluators):
e.update(
value=np.array(acc_rets[i]), weight=args.batch_size)
except fluid.core.EOFException as eof:
test_args[4].reset()
break
else:
for batch_id, data in enumerate(test_args[3]()):
acc_rets = exe.run(feed=feeder.feed(data), fetch_list=to_fetch)
for i, e in enumerate(acc_evaluators):
e.update(value=np.array(acc_rets[i]), weight=len(data))
return [e.eval() for e in acc_evaluators]
# NOTE: only need to benchmark using parallelexe
def train_parallel(train_args, test_args, args, train_prog, test_prog,
startup_prog, nccl_id_var, num_trainers, trainer_id):
over_all_start = time.time()
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
feeder = None
if not args.use_reader_op:
feed_var_list = [
var for var in train_prog.global_block().vars.itervalues()
if var.is_data
]
feeder = fluid.DataFeeder(feed_var_list, place)
# generate fake:
if args.use_fake_data:
for var in feed_var_list:
v = startup_prog.global_block()._clone_variable(var)
var.persistable = True
v.persistable = True
real_shape = list(var.shape)
real_shape[0] = args.batch_size / args.gpus
startup_prog.global_block().append_op(
outputs={"Out": v},
type="fill_constant",
attrs={"shape": real_shape,
"value": 1.0,
"dtype": var.dtype})
if nccl_id_var and trainer_id == 0:
#FIXME(wuyi): wait other trainer to start listening
time.sleep(30)
startup_exe = fluid.Executor(place)
startup_exe.run(startup_prog)
strategy = fluid.ExecutionStrategy()
strategy.num_threads = args.cpus
strategy.allow_op_delay = False
build_strategy = fluid.BuildStrategy()
if args.reduce_strategy == "reduce":
build_strategy.reduce_strategy = fluid.BuildStrategy(
).ReduceStrategy.Reduce
else:
build_strategy.reduce_strategy = fluid.BuildStrategy(
).ReduceStrategy.AllReduce
avg_loss = train_args[0]
if args.update_method == "pserver":
# parameter server mode distributed training, merge
# gradients on local server, do not initialize
# ParallelExecutor with multi server all-reduce mode.
num_trainers = 1
trainer_id = 0
exe = fluid.ParallelExecutor(
True,
avg_loss.name,
main_program=train_prog,
exec_strategy=strategy,
build_strategy=build_strategy,
num_trainers=num_trainers,
trainer_id=trainer_id)
if not args.no_test:
if args.update_method == "pserver":
test_scope = None
else:
# NOTE: use an empty scope to avoid test exe using NCCLID
test_scope = fluid.Scope()
test_exe = fluid.ParallelExecutor(
True, main_program=test_prog, share_vars_from=exe)
for pass_id in range(args.pass_num):
num_samples = 0
iters = 0
start_time = time.time()
if not args.use_reader_op:
reader_generator = train_args[3]() #train_reader
batch_id = 0
data = None
if args.use_reader_op:
train_args[4].start()
while True:
if not args.use_reader_op:
data = next(reader_generator, None)
if data == None:
break
if args.profile and batch_id == 5:
profiler.start_profiler("All")
profiler.reset_profiler()
elif args.profile and batch_id == 10:
print("profiling total time: ", time.time() - start_time)
profiler.stop_profiler("total", "/tmp/profile_%d_pass%d" %
(trainer_id, pass_id))
if iters == args.iterations:
reader_generator.close()
break
if iters == args.skip_batch_num:
start_time = time.time()
num_samples = 0
fetch_list = [avg_loss.name]
acc_name_list = [v.name for v in train_args[2]]
fetch_list.extend(acc_name_list)
if args.use_fake_data or args.use_reader_op:
try:
fetch_ret = exe.run(fetch_list)
except fluid.core.EOFException as eof:
break
except fluid.core.EnforceNotMet as ex:
traceback.print_exc()
break
else:
fetch_ret = exe.run(fetch_list, feed=feeder.feed(data))
if args.use_reader_op:
num_samples += args.batch_size * args.gpus
else:
num_samples += len(data)
iters += 1
if batch_id % 1 == 0:
fetched_data = [np.mean(np.array(d)) for d in fetch_ret]
print("Pass %d, batch %d, loss %s, accucacys: %s" %
(pass_id, batch_id, fetched_data[0], fetched_data[1:]))
batch_id += 1
print_train_time(start_time, time.time(), num_samples)
if args.use_reader_op:
train_args[4].reset() # reset reader handle
else:
del reader_generator
if not args.no_test and test_args[2]:
test_feeder = None
if not args.use_reader_op:
test_feed_var_list = [
var for var in test_prog.global_block().vars.itervalues()
if var.is_data
]
test_feeder = fluid.DataFeeder(test_feed_var_list, place)
test_ret = test_parallel(test_exe, test_args, args, test_prog,
test_feeder)
print("Pass: %d, Test Accuracy: %s\n" %
(pass_id, [np.mean(np.array(v)) for v in test_ret]))
print("total train time: ", time.time() - over_all_start)
def print_arguments(args):
vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
vars(args)['device'] == 'GPU')
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def print_train_time(start_time, end_time, num_samples):
train_elapsed = end_time - start_time
examples_per_sec = num_samples / train_elapsed
print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
(num_samples, train_elapsed, examples_per_sec))
def print_paddle_envs():
print('----------- Configuration envs -----------')
for k in os.environ:
if "PADDLE_" in k:
print "ENV %s:%s" % (k, os.environ[k])
print('------------------------------------------------')
def main():
args = parse_args()
print_arguments(args)
print_paddle_envs()
if args.no_random:
fluid.default_startup_program().random_seed = 1
# the unique trainer id, starting from 0, needed by trainer
# only
nccl_id_var, num_trainers, trainer_id = (
None, 1, int(os.getenv("PADDLE_TRAINER_ID", "0")))
if args.use_cprof:
pr = cProfile.Profile()
pr.enable()
model_def = __import__("models.%s" % args.model, fromlist=["models"])
train_prog = fluid.Program()
test_prog = fluid.Program()
startup_prog = fluid.Program()
train_args = list(model_def.get_model(args, True, train_prog, startup_prog))
test_args = list(model_def.get_model(args, False, test_prog, startup_prog))
all_args = [train_args, test_args, args]
if args.update_method == "pserver":
train_prog, startup_prog = dist_transpile(trainer_id, args, train_prog,
startup_prog)
if not train_prog:
raise Exception(
"Must configure correct environments to run dist train.")
all_args.extend([train_prog, test_prog, startup_prog])
if args.gpus > 1 and os.getenv("PADDLE_TRAINING_ROLE") == "TRAINER":
all_args.extend([nccl_id_var, num_trainers, trainer_id])
train_parallel(*all_args)
elif os.getenv("PADDLE_TRAINING_ROLE") == "PSERVER":
# start pserver with Executor
server_exe = fluid.Executor(fluid.CPUPlace())
server_exe.run(startup_prog)
server_exe.run(train_prog)
exit(0)
# for other update methods, use default programs
all_args.extend([train_prog, test_prog, startup_prog])
if args.update_method == "nccl2":
nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(
trainer_id, startup_prog)
if args.device == "CPU":
raise Exception("Only support GPU perf with parallel exe")
all_args.extend([nccl_id_var, num_trainers, trainer_id])
train_parallel(*all_args)
if __name__ == "__main__":
main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import math
import random
import functools
import numpy as np
from threading import Thread
import subprocess
import time
from Queue import Queue
import paddle
from PIL import Image, ImageEnhance
random.seed(0)
DATA_DIM = 224
THREAD = int(os.getenv("PREPROCESS_THREADS", "10"))
BUF_SIZE = 5120
DATA_DIR = '/mnt/ImageNet'
TRAIN_LIST = '/mnt/ImageNet/train.txt'
TEST_LIST = '/mnt/ImageNet/val.txt'
img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
def resize_short(img, target_size):
percent = float(target_size) / min(img.size[0], img.size[1])
resized_width = int(round(img.size[0] * percent))
resized_height = int(round(img.size[1] * percent))
img = img.resize((resized_width, resized_height), Image.LANCZOS)
return img
def crop_image(img, target_size, center):
width, height = img.size
size = target_size
if center == True:
w_start = (width - size) / 2
h_start = (height - size) / 2
else:
w_start = random.randint(0, width - size)
h_start = random.randint(0, height - size)
w_end = w_start + size
h_end = h_start + size
img = img.crop((w_start, h_start, w_end, h_end))
return img
def random_crop(img, size, scale=[0.08, 1.0], ratio=[3. / 4., 4. / 3.]):
aspect_ratio = math.sqrt(random.uniform(*ratio))
w = 1. * aspect_ratio
h = 1. / aspect_ratio
bound = min((float(img.size[0]) / img.size[1]) / (w**2),
(float(img.size[1]) / img.size[0]) / (h**2))
scale_max = min(scale[1], bound)
scale_min = min(scale[0], bound)
target_area = img.size[0] * img.size[1] * random.uniform(scale_min,
scale_max)
target_size = math.sqrt(target_area)
w = int(target_size * w)
h = int(target_size * h)
i = random.randint(0, img.size[0] - w)
j = random.randint(0, img.size[1] - h)
img = img.crop((i, j, i + w, j + h))
img = img.resize((size, size), Image.LANCZOS)
return img
def rotate_image(img):
angle = random.randint(-10, 10)
img = img.rotate(angle)
return img
def distort_color(img):
def random_brightness(img, lower=0.5, upper=1.5):
e = random.uniform(lower, upper)
return ImageEnhance.Brightness(img).enhance(e)
def random_contrast(img, lower=0.5, upper=1.5):
e = random.uniform(lower, upper)
return ImageEnhance.Contrast(img).enhance(e)
def random_color(img, lower=0.5, upper=1.5):
e = random.uniform(lower, upper)
return ImageEnhance.Color(img).enhance(e)
ops = [random_brightness, random_contrast, random_color]
random.shuffle(ops)
img = ops[0](img)
img = ops[1](img)
img = ops[2](img)
return img
def process_image(sample, mode, color_jitter, rotate):
img_path = sample[0]
img = Image.open(img_path)
if mode == 'train':
if rotate: img = rotate_image(img)
img = random_crop(img, DATA_DIM)
else:
img = resize_short(img, target_size=256)
img = crop_image(img, target_size=DATA_DIM, center=True)
if mode == 'train':
if color_jitter:
img = distort_color(img)
if random.randint(0, 1) == 1:
img = img.transpose(Image.FLIP_LEFT_RIGHT)
if img.mode != 'RGB':
img = img.convert('RGB')
img = np.array(img).astype('float32').transpose((2, 0, 1)) / 255
img -= img_mean
img /= img_std
if mode == 'train' or mode == 'val':
return img, sample[1]
elif mode == 'test':
return [img]
class XmapEndSignal():
pass
def xmap_readers(mapper,
reader,
process_num,
buffer_size,
order=False,
print_queue_state=True):
end = XmapEndSignal()
# define a worker to read samples from reader to in_queue
def read_worker(reader, in_queue):
for i in reader():
in_queue.put(i)
in_queue.put(end)
# define a worker to read samples from reader to in_queue with order flag
def order_read_worker(reader, in_queue, file_queue):
in_order = 0
for i in reader():
in_queue.put((in_order, i))
in_order += 1
in_queue.put(end)
# define a worker to handle samples from in_queue by mapper
# and put mapped samples into out_queue
def handle_worker(in_queue, out_queue, mapper):
sample = in_queue.get()
while not isinstance(sample, XmapEndSignal):
r = mapper(sample)
out_queue.put(r)
sample = in_queue.get()
in_queue.put(end)
out_queue.put(end)
# define a worker to handle samples from in_queue by mapper
# and put mapped samples into out_queue by order
def order_handle_worker(in_queue, out_queue, mapper, out_order):
ins = in_queue.get()
while not isinstance(ins, XmapEndSignal):
order, sample = ins
r = mapper(sample)
while order != out_order[0]:
pass
out_queue.put(r)
out_order[0] += 1
ins = in_queue.get()
in_queue.put(end)
out_queue.put(end)
def xreader():
file_queue = Queue()
in_queue = Queue(buffer_size)
out_queue = Queue(buffer_size)
out_order = [0]
# start a read worker in a thread
target = order_read_worker if order else read_worker
t = Thread(target=target, args=(reader, in_queue))
t.daemon = True
t.start()
# start several handle_workers
target = order_handle_worker if order else handle_worker
args = (in_queue, out_queue, mapper, out_order) if order else (
in_queue, out_queue, mapper)
workers = []
for i in xrange(process_num):
worker = Thread(target=target, args=args)
worker.daemon = True
workers.append(worker)
for w in workers:
w.start()
sample = out_queue.get()
start_t = time.time()
while not isinstance(sample, XmapEndSignal):
yield sample
sample = out_queue.get()
if time.time() - start_t > 3:
if print_queue_state:
print("queue sizes: ", in_queue.qsize(), out_queue.qsize())
start_t = time.time()
finish = 1
while finish < process_num:
sample = out_queue.get()
if isinstance(sample, XmapEndSignal):
finish += 1
else:
yield sample
return xreader
def _reader_creator(file_list,
mode,
shuffle=False,
color_jitter=False,
rotate=False,
xmap=True):
def reader():
with open(file_list) as flist:
full_lines = [line.strip() for line in flist]
if shuffle:
random.shuffle(full_lines)
if mode == 'train':
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
trainer_count = int(os.getenv("PADDLE_TRAINERS"))
per_node_lines = len(full_lines) / trainer_count
lines = full_lines[trainer_id * per_node_lines:(trainer_id + 1)
* per_node_lines]
print(
"read images from %d, length: %d, lines length: %d, total: %d"
% (trainer_id * per_node_lines, per_node_lines, len(lines),
len(full_lines)))
else:
lines = full_lines
for line in lines:
if mode == 'train':
img_path, label = line.split()
img_path = img_path.replace("JPEG", "jpeg")
img_path = os.path.join(DATA_DIR, "train", img_path)
yield (img_path, int(label))
elif mode == 'val':
img_path, label = line.split()
img_path = img_path.replace("JPEG", "jpeg")
img_path = os.path.join(DATA_DIR, "val", img_path)
yield (img_path, int(label))
elif mode == 'test':
img_path = os.path.join(DATA_DIR, line)
yield [img_path]
mapper = functools.partial(
process_image, mode=mode, color_jitter=color_jitter, rotate=rotate)
return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE)
def load_raw_image_uint8(sample):
img_arr = np.array(Image.open(sample[0])).astype('int64')
return img_arr, int(sample[1])
def train_raw(file_list=TRAIN_LIST, shuffle=True):
def reader():
with open(file_list) as flist:
full_lines = [line.strip() for line in flist]
if shuffle:
random.shuffle(full_lines)
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
trainer_count = int(os.getenv("PADDLE_TRAINERS"))
per_node_lines = len(full_lines) / trainer_count
lines = full_lines[trainer_id * per_node_lines:(trainer_id + 1) *
per_node_lines]
print("read images from %d, length: %d, lines length: %d, total: %d"
% (trainer_id * per_node_lines, per_node_lines, len(lines),
len(full_lines)))
for line in lines:
img_path, label = line.split()
img_path = img_path.replace("JPEG", "jpeg")
img_path = os.path.join(DATA_DIR, "train", img_path)
yield (img_path, int(label))
return paddle.reader.xmap_readers(load_raw_image_uint8, reader, THREAD,
BUF_SIZE)
def train(file_list=TRAIN_LIST, xmap=True):
return _reader_creator(
file_list,
'train',
shuffle=True,
color_jitter=False,
rotate=False,
xmap=xmap)
def val(file_list=TEST_LIST, xmap=True):
return _reader_creator(file_list, 'val', shuffle=False, xmap=xmap)
def test(file_list=TEST_LIST):
return _reader_creator(file_list, 'test', shuffle=False)
if __name__ == "__main__":
c = 0
start_t = time.time()
for d in train()():
c += 1
if c >= 10000:
break
spent = time.time() - start_t
print("read 10000 speed: ", 10000 / spent, spent)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import yaml
import copy
import argparse
import random
import os
import copy
from kube_templates import pserver, trainer, envs
def parse_args():
parser = argparse.ArgumentParser(description='Generate dist job yamls.')
parser.add_argument(
'--jobname', default="paddlejob", help='unique job name')
parser.add_argument(
'--cpu', default=1, type=int, help='CPU cores per trainer node')
parser.add_argument(
'--pscpu', default=1, type=int, help='CPU cores per pserver node')
parser.add_argument(
'--gpu', default=0, type=int, help='num of GPUs per node')
parser.add_argument(
'--image',
default="bootstrapper:5000/fluid_benchmark:gpu",
help='num of GPUs per node')
parser.add_argument(
'--pservers', default=1, type=int, help='num of pservers')
parser.add_argument(
'--trainers', default=1, type=int, help='num of trainers')
parser.add_argument('--memory', default=1, type=int, help='trainer memory')
parser.add_argument(
'--psmemory', default=1, type=int, help='pserver memory')
parser.add_argument(
'--port', default=30236, type=int, help='num of trainers')
parser.add_argument(
'--entry', default="python train.py", help='command to run')
parser.add_argument(
'--fluid', default=1, type=int, help='whether is fluid job')
parser.add_argument(
'--rdma', action='store_true', help='whether mount rdma libs')
parser.add_argument(
'--disttype',
default="pserver",
type=str,
choices=['pserver', 'nccl2', 'local'],
help='pserver or nccl2 or local')
args = parser.parse_args()
return args
def gen_job():
ps = pserver
tn = trainer
args = parse_args()
ps_container = ps["spec"]["template"]["spec"]["containers"][0]
tn_container = tn["spec"]["template"]["spec"]["containers"][0]
if args.fluid == 1:
ps_container["command"] = \
["paddle_k8s", "start_fluid"]
tn_container["command"] = \
["paddle_k8s", "start_fluid"]
ps["metadata"]["name"] = args.jobname + "-pserver"
ps["spec"]["template"]["metadata"]["labels"][
"paddle-job-pserver"] = args.jobname
tn["metadata"]["name"] = args.jobname + "-trainer"
tn["spec"]["template"]["metadata"]["labels"]["paddle-job"] = args.jobname
ps_container["image"] = args.image
tn_container["image"] = args.image
ps_container["resources"]["requests"]["cpu"] = str(args.pscpu)
ps_container["resources"]["requests"]["memory"] = str(args.psmemory) + "Gi"
ps_container["resources"]["limits"]["cpu"] = str(args.pscpu)
ps_container["resources"]["limits"]["memory"] = str(args.psmemory) + "Gi"
tn_container["resources"]["requests"]["cpu"] = str(args.cpu)
tn_container["resources"]["requests"]["memory"] = str(args.memory) + "Gi"
tn_container["resources"]["limits"]["cpu"] = str(args.cpu)
tn_container["resources"]["limits"]["memory"] = str(args.memory) + "Gi"
if args.gpu > 0:
tn_container["resources"]["requests"][
"alpha.kubernetes.io/nvidia-gpu"] = str(args.gpu)
tn_container["resources"]["limits"][
"alpha.kubernetes.io/nvidia-gpu"] = str(args.gpu)
ps["spec"]["replicas"] = int(args.pservers)
tn["spec"]["parallelism"] = int(args.trainers)
tn["spec"]["completions"] = int(args.trainers)
ps_container["ports"][0]["name"] = "jobport-" + str(args.port)
ps_container["ports"][0]["containerPort"] = args.port
spreadport = random.randint(40000, 60000)
tn_container["ports"][0]["name"] = "spr-" + str(spreadport)
tn_container["ports"][0]["containerPort"] = spreadport
envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname})
envs.append({"name": "PADDLE_TRAINERS", "value": str(args.trainers)})
envs.append({"name": "PADDLE_PSERVERS", "value": str(args.pservers)})
envs.append({"name": "ENTRY", "value": args.entry})
envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
# NOTE: these directories below are cluster specific, please modify
# this settings before you run on your own cluster.
envs.append({
"name": "LD_LIBRARY_PATH",
"value":
"/usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind"
})
volumes = [{
"name": "nvidia-driver",
"hostPath": {
"path": "/usr/local/nvidia/lib64"
}
}]
volumeMounts = [{
"mountPath": "/usr/local/nvidia/lib64",
"name": "nvidia-driver"
}]
if args.rdma:
volumes.extend([{
"name": "ibetc",
"hostPath": {
"path": "/etc/libibverbs.d"
}
}, {
"name": "iblibs",
"hostPath": {
"path": "/usr/local/rdma"
}
}, {
"name": "valgrind",
"hostPath": {
"path": "/usr/lib64/mlnx_ofed/valgrind"
}
}])
volumeMounts.extend([{
"mountPath": "/etc/libibverbs.d",
"name": "ibetc"
}, {
"mountPath": "/usr/local/rdma",
"name": "iblibs"
}, {
"mountPath": "/usr/lib64/mlnx_ofed/valgrind",
"name": "valgrind"
}])
# append shm for NCCL2
volumes.append({"name": "dshm", "emptyDir": {"medium": "Memory"}})
volumeMounts.append({"mountPath": "/dev/shm", "name": "dshm"})
# add ceph volumes
volumes.append({
"name": "ceph-data",
"cephfs": {
"monitors": ["192.168.16.23:6789"],
"secretRef": {
"name": "ceph-secret"
},
"user": "admin",
}
})
volumeMounts.append({"mountPath": "/mnt/data", "name": "ceph-data"})
tn["spec"]["template"]["spec"]["volumes"] = volumes
tn_container["volumeMounts"] = volumeMounts
ps_container["env"] = copy.deepcopy(envs)
ps_container["env"].append({
"name": "PADDLE_TRAINING_ROLE",
"value": "PSERVER"
})
tn_container["env"] = envs
if args.disttype == "pserver":
tn_container["env"].append({
"name": "PADDLE_TRAINING_ROLE",
"value": "TRAINER"
})
elif args.disttype == "nccl2" or args.disttype == "local":
# NCCL2 have no training role, set to plain WORKER
tn_container["env"].append({
"name": "PADDLE_TRAINING_ROLE",
"value": "WORKER"
})
os.mkdir(args.jobname)
if args.disttype == "pserver":
with open("%s/pserver.yaml" % args.jobname, "w") as fn:
yaml.dump(ps, fn)
with open("%s/trainer.yaml" % args.jobname, "w") as fn:
yaml.dump(tn, fn)
if __name__ == "__main__":
gen_job()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pserver import pserver
from trainer import trainer
__all__ = ["pserver", "trainer", "envs"]
envs = [
# envs that don't need to change
{
"name": "GLOG_v",
"value": "0"
},
{
"name": "GLOG_logtostderr",
"value": "1"
},
{
"name": "TOPOLOGY",
"value": ""
},
{
"name": "TRAINER_PACKAGE",
"value": "/workspace"
},
{
"name": "PADDLE_INIT_NICS",
"value": "eth2"
},
{
"name": "NAMESPACE",
"valueFrom": {
"fieldRef": {
"fieldPath": "metadata.namespace"
}
}
},
{
"name": "POD_IP",
"valueFrom": {
"fieldRef": {
"fieldPath": "status.podIP"
}
}
},
{
"name": "PADDLE_CURRENT_IP",
"valueFrom": {
"fieldRef": {
"fieldPath": "status.podIP"
}
}
}
]
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
pserver = {
"apiVersion": "extensions/v1beta1",
"kind": "ReplicaSet",
"metadata": {
"name": "jobname-pserver"
},
"spec": {
"replicas": 1,
"template": {
"metadata": {
"labels": {
"paddle-job-pserver": "jobname"
}
},
"spec": {
"hostNetwork": True,
"imagePullSecrets": [{
"name": "job-registry-secret"
}],
"containers": [{
"name": "pserver",
"image": "",
"imagePullPolicy": "Always",
"ports": [{
"name": "jobport-1",
"containerPort": 1
}],
"env": [],
"command": ["paddle_k8s", "start_pserver"],
"resources": {
"requests": {
"memory": "10Gi",
"cpu": "4"
},
"limits": {
"memory": "10Gi",
"cpu": "4"
}
}
}]
}
}
}
}
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
trainer = {
"apiVersion": "batch/v1",
"kind": "Job",
"metadata": {
"name": "jobname-pserver"
},
"spec": {
"parallelism": 4,
"completions": 4,
"template": {
"metadata": {
"labels": {
"paddle-job": "jobname"
}
},
"spec": {
"hostNetwork": True,
"imagePullSecrets": [{
"name": "job-registry-secret"
}],
"restartPolicy": "Never",
"containers": [{
"name": "trainer",
"image": "",
"imagePullPolicy": "Always",
# to let container set rlimit
"securityContext": {
"privileged": True
# TODO(wuyi): use below specific cap instead of privileged,
# using privileged will cause all GPU device are visible
# in the container.
# "capabilities": {
# "add": ["SYS_RESOURCE"]
# }
},
"ports": [{
"name": "jobport-1",
"containerPort": 1
}],
"env": [],
"command": ["paddle_k8s", "start_trainer", "v2"],
"resources": {
"requests": {
"memory": "10Gi",
"cpu": "4",
},
"limits": {
"memory": "10Gi",
"cpu": "4",
}
}
}]
}
}
}
}
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = [
"machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm",
"resnet_with_preprocess"
]
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""seq2seq model for fluid."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import argparse
import time
import distutils.util
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.framework as framework
from paddle.fluid.executor import Executor
def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
def linear(inputs):
return fluid.layers.fc(input=inputs, size=size, bias_attr=True)
forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t]))
cell_t = fluid.layers.sums(input=[
fluid.layers.elementwise_mul(
x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul(
x=input_gate, y=cell_tilde)
])
hidden_t = fluid.layers.elementwise_mul(
x=output_gate, y=fluid.layers.tanh(x=cell_t))
return hidden_t, cell_t
def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
target_dict_dim, is_generating, beam_size, max_length):
"""Construct a seq2seq network."""
def bi_lstm_encoder(input_seq, gate_size):
# Linear transformation part for input gate, output gate, forget gate
# and cell activation vectors need be done outside of dynamic_lstm.
# So the output size is 4 times of gate_size.
input_forward_proj = fluid.layers.fc(input=input_seq,
size=gate_size * 4,
act=None,
bias_attr=False)
forward, _ = fluid.layers.dynamic_lstm(
input=input_forward_proj, size=gate_size * 4, use_peepholes=False)
input_reversed_proj = fluid.layers.fc(input=input_seq,
size=gate_size * 4,
act=None,
bias_attr=False)
reversed, _ = fluid.layers.dynamic_lstm(
input=input_reversed_proj,
size=gate_size * 4,
is_reverse=True,
use_peepholes=False)
return forward, reversed
src_word_idx = fluid.layers.data(
name='source_sequence', shape=[1], dtype='int64', lod_level=1)
src_embedding = fluid.layers.embedding(
input=src_word_idx,
size=[source_dict_dim, embedding_dim],
dtype='float32')
src_forward, src_reversed = bi_lstm_encoder(
input_seq=src_embedding, gate_size=encoder_size)
encoded_vector = fluid.layers.concat(
input=[src_forward, src_reversed], axis=1)
encoded_proj = fluid.layers.fc(input=encoded_vector,
size=decoder_size,
bias_attr=False)
backward_first = fluid.layers.sequence_pool(
input=src_reversed, pool_type='first')
decoder_boot = fluid.layers.fc(input=backward_first,
size=decoder_size,
bias_attr=False,
act='tanh')
def lstm_decoder_with_attention(target_embedding, encoder_vec, encoder_proj,
decoder_boot, decoder_size):
def simple_attention(encoder_vec, encoder_proj, decoder_state):
decoder_state_proj = fluid.layers.fc(input=decoder_state,
size=decoder_size,
bias_attr=False)
decoder_state_expand = fluid.layers.sequence_expand(
x=decoder_state_proj, y=encoder_proj)
concated = fluid.layers.concat(
input=[encoder_proj, decoder_state_expand], axis=1)
attention_weights = fluid.layers.fc(input=concated,
size=1,
act='tanh',
bias_attr=False)
attention_weights = fluid.layers.sequence_softmax(
input=attention_weights)
weigths_reshape = fluid.layers.reshape(
x=attention_weights, shape=[-1])
scaled = fluid.layers.elementwise_mul(
x=encoder_vec, y=weigths_reshape, axis=0)
context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
return context
rnn = fluid.layers.DynamicRNN()
cell_init = fluid.layers.fill_constant_batch_size_like(
input=decoder_boot,
value=0.0,
shape=[-1, decoder_size],
dtype='float32')
cell_init.stop_gradient = False
with rnn.block():
current_word = rnn.step_input(target_embedding)
encoder_vec = rnn.static_input(encoder_vec)
encoder_proj = rnn.static_input(encoder_proj)
hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
cell_mem = rnn.memory(init=cell_init)
context = simple_attention(encoder_vec, encoder_proj, hidden_mem)
decoder_inputs = fluid.layers.concat(
input=[context, current_word], axis=1)
h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, decoder_size)
rnn.update_memory(hidden_mem, h)
rnn.update_memory(cell_mem, c)
out = fluid.layers.fc(input=h,
size=target_dict_dim,
bias_attr=True,
act='softmax')
rnn.output(out)
return rnn()
if not is_generating:
trg_word_idx = fluid.layers.data(
name='target_sequence', shape=[1], dtype='int64', lod_level=1)
trg_embedding = fluid.layers.embedding(
input=trg_word_idx,
size=[target_dict_dim, embedding_dim],
dtype='float32')
prediction = lstm_decoder_with_attention(trg_embedding, encoded_vector,
encoded_proj, decoder_boot,
decoder_size)
label = fluid.layers.data(
name='label_sequence', shape=[1], dtype='int64', lod_level=1)
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(x=cost)
feeding_list = ["source_sequence", "target_sequence", "label_sequence"]
return avg_cost, feeding_list
def lodtensor_to_ndarray(lod_tensor):
dims = lod_tensor.get_dims()
ndarray = np.zeros(shape=dims).astype('float32')
for i in xrange(np.product(dims)):
ndarray.ravel()[i] = lod_tensor.get_float_element(i)
return ndarray
def get_model(args, is_train, main_prog, startup_prog):
if args.use_reader_op:
raise Exception("machine_translation do not support reader op for now.")
embedding_dim = 512
encoder_size = 512
decoder_size = 512
dict_size = 30000
beam_size = 3
max_length = 250
with fluid.program_guard(main_prog, startup_prog):
with fluid.unique_name.guard():
avg_cost, feeding_list = seq_to_seq_net(
embedding_dim,
encoder_size,
decoder_size,
dict_size,
dict_size,
False,
beam_size=beam_size,
max_length=max_length)
if is_train:
optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
optimizer.minimize(avg_cost)
batch_generator = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(dict_size)
if is_train else paddle.dataset.wmt14.test(dict_size),
buf_size=1000),
batch_size=args.batch_size * args.gpus)
return avg_cost, optimizer, [], batch_generator, None
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import argparse
import time
import cProfile
import os
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
SEED = 1
DTYPE = "float32"
# random seed must set before configuring the network.
# fluid.default_startup_program().random_seed = SEED
def cnn_model(data):
conv_pool_1 = fluid.nets.simple_img_conv_pool(
input=data,
filter_size=5,
num_filters=20,
pool_size=2,
pool_stride=2,
act="relu")
conv_pool_2 = fluid.nets.simple_img_conv_pool(
input=conv_pool_1,
filter_size=5,
num_filters=50,
pool_size=2,
pool_stride=2,
act="relu")
# TODO(dzhwinter) : refine the initializer and random seed settting
SIZE = 10
input_shape = conv_pool_2.shape
param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
predict = fluid.layers.fc(
input=conv_pool_2,
size=SIZE,
act="softmax",
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale)))
return predict
def get_model(args, is_train, main_prog, startup_prog):
# NOTE: mnist is small, we don't implement data sharding yet.
opt = None
data_file_handle = None
with fluid.program_guard(main_prog, startup_prog):
if args.use_reader_op:
filelist = [
os.path.join(args.data_path, f)
for f in os.listdir(args.data_path)
]
data_file_handle = fluid.layers.open_files(
filenames=filelist,
shapes=[[-1, 1, 28, 28], (-1, 1)],
lod_levels=[0, 0],
dtypes=["float32", "int64"],
thread_num=1,
pass_num=1)
data_file = fluid.layers.double_buffer(
fluid.layers.batch(
data_file_handle, batch_size=args.batch_size))
with fluid.unique_name.guard():
if args.use_reader_op:
input, label = fluid.layers.read_file(data_file)
else:
images = fluid.layers.data(
name='pixel', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
predict = cnn_model(images)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
batch_acc = fluid.layers.accuracy(input=predict, label=label)
# Optimization
if is_train:
opt = fluid.optimizer.AdamOptimizer(
learning_rate=0.001, beta1=0.9, beta2=0.999)
opt.minimize(avg_cost)
if args.memory_optimize:
fluid.memory_optimize(main_prog)
# Reader
if is_train:
reader = paddle.dataset.mnist.train()
else:
reader = paddle.dataset.mnist.test()
batched_reader = paddle.batch(
reader, batch_size=args.batch_size * args.gpus)
return avg_cost, opt, [batch_acc], batched_reader, data_file_handle
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import numpy as np
import time
import os
import math
import cProfile, pstats, StringIO
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.profiler as profiler
from imagenet_reader import train, val
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": 256,
"epochs": [30, 60, 90],
"steps": [0.1, 0.01, 0.001, 0.0001]
}
}
class ResNet():
def __init__(self, layers=50, is_train=True):
self.params = train_parameters
self.layers = layers
self.is_train = is_train
def net(self, input, class_dim=1000):
layers = self.layers
supported_layers = [50, 101, 152]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
if layers == 50:
depth = [3, 4, 6, 3]
elif layers == 101:
depth = [3, 4, 23, 3]
elif layers == 152:
depth = [3, 8, 36, 3]
num_filters = [64, 128, 256, 512]
conv = self.conv_bn_layer(
input=input, num_filters=64, filter_size=7, stride=2, act='relu')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
for block in range(len(depth)):
for i in range(depth[block]):
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1)
pool = fluid.layers.pool2d(
input=conv, pool_size=7, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
out = fluid.layers.fc(input=pool,
size=class_dim,
act='softmax',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv,
stdv)))
return out
def conv_bn_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
bias_attr=False)
return fluid.layers.batch_norm(
input=conv, act=act, is_test=not self.is_train)
def shortcut(self, input, ch_out, stride):
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1:
return self.conv_bn_layer(input, ch_out, 1, stride)
else:
return input
def bottleneck_block(self, input, num_filters, stride):
conv0 = self.conv_bn_layer(
input=input, num_filters=num_filters, filter_size=1, act='relu')
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
act='relu')
conv2 = self.conv_bn_layer(
input=conv1, num_filters=num_filters * 4, filter_size=1, act=None)
short = self.shortcut(input, num_filters * 4, stride)
return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
def _model_reader_dshape_classdim(args, is_train):
model = None
reader = None
if args.data_set == "flowers":
class_dim = 102
if args.data_format == 'NCHW':
dshape = [3, 224, 224]
else:
dshape = [224, 224, 3]
if is_train:
reader = paddle.dataset.flowers.train()
else:
reader = paddle.dataset.flowers.test()
elif args.data_set == "imagenet":
class_dim = 1000
if args.data_format == 'NCHW':
dshape = [3, 224, 224]
else:
dshape = [224, 224, 3]
if not args.data_path:
raise Exception(
"Must specify --data_path when training with imagenet")
if not args.use_reader_op:
if is_train:
reader = train()
else:
reader = val()
else:
if is_train:
reader = train(xmap=False)
else:
reader = val(xmap=False)
return reader, dshape, class_dim
def get_model(args, is_train, main_prog, startup_prog):
reader, dshape, class_dim = _model_reader_dshape_classdim(args, is_train)
pyreader = None
trainer_count = int(os.getenv("PADDLE_TRAINERS"))
with fluid.program_guard(main_prog, startup_prog):
with fluid.unique_name.guard():
if args.use_reader_op:
pyreader = fluid.layers.py_reader(
capacity=args.batch_size * args.gpus,
shapes=([-1] + dshape, (-1, 1)),
dtypes=('float32', 'int64'),
name="train_reader" if is_train else "test_reader",
use_double_buffer=True)
input, label = fluid.layers.read_file(pyreader)
else:
input = fluid.layers.data(
name='data', shape=dshape, dtype='float32')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
model = ResNet(is_train=is_train)
predict = model.net(input, class_dim=class_dim)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
batch_acc1 = fluid.layers.accuracy(input=predict, label=label, k=1)
batch_acc5 = fluid.layers.accuracy(input=predict, label=label, k=5)
# configure optimize
optimizer = None
if is_train:
total_images = 1281167 / trainer_count
step = int(total_images / (args.batch_size * args.gpus) + 1)
epochs = [30, 60, 90]
bd = [step * e for e in epochs]
base_lr = args.learning_rate
lr = []
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
optimizer.minimize(avg_cost)
if args.memory_optimize:
fluid.memory_optimize(main_prog)
# config readers
if not args.use_reader_op:
batched_reader = paddle.batch(
reader if args.no_random else paddle.reader.shuffle(
reader, buf_size=5120),
batch_size=args.batch_size * args.gpus,
drop_last=True)
else:
batched_reader = None
pyreader.decorate_paddle_reader(
paddle.batch(
reader if args.no_random else paddle.reader.shuffle(
reader, buf_size=5120),
batch_size=args.batch_size))
return avg_cost, optimizer, [batch_acc1,
batch_acc5], batched_reader, pyreader
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import numpy as np
import time
import os
import cProfile, pstats, StringIO
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.profiler as profiler
# from recordio_converter import imagenet_train, imagenet_test
from imagenet_reader import train_raw, val
def conv_bn_layer(input,
ch_out,
filter_size,
stride,
padding,
act='relu',
is_train=True):
conv1 = fluid.layers.conv2d(
input=input,
filter_size=filter_size,
num_filters=ch_out,
stride=stride,
padding=padding,
act=None,
bias_attr=False)
return fluid.layers.batch_norm(input=conv1, act=act, is_test=not is_train)
def shortcut(input, ch_out, stride, is_train=True):
ch_in = input.shape[1] # if args.data_format == 'NCHW' else input.shape[-1]
if ch_in != ch_out:
return conv_bn_layer(
input, ch_out, 1, stride, 0, None, is_train=is_train)
else:
return input
def basicblock(input, ch_out, stride, is_train=True):
short = shortcut(input, ch_out, stride, is_train=is_train)
conv1 = conv_bn_layer(input, ch_out, 3, stride, 1, is_train=is_train)
conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None, is_train=is_train)
return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
def bottleneck(input, ch_out, stride, is_train=True):
short = shortcut(input, ch_out * 4, stride, is_train=is_train)
conv1 = conv_bn_layer(input, ch_out, 1, stride, 0, is_train=is_train)
conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, is_train=is_train)
conv3 = conv_bn_layer(
conv2, ch_out * 4, 1, 1, 0, act=None, is_train=is_train)
return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
def layer_warp(block_func, input, ch_out, count, stride):
res_out = block_func(input, ch_out, stride)
for i in range(1, count):
res_out = block_func(res_out, ch_out, 1)
return res_out
def resnet_imagenet(input,
class_dim,
depth=50,
data_format='NCHW',
is_train=True):
cfg = {
18: ([2, 2, 2, 1], basicblock),
34: ([3, 4, 6, 3], basicblock),
50: ([3, 4, 6, 3], bottleneck),
101: ([3, 4, 23, 3], bottleneck),
152: ([3, 8, 36, 3], bottleneck)
}
stages, block_func = cfg[depth]
conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3)
pool1 = fluid.layers.pool2d(
input=conv1, pool_type='avg', pool_size=3, pool_stride=2)
res1 = layer_warp(block_func, pool1, 64, stages[0], 1)
res2 = layer_warp(block_func, res1, 128, stages[1], 2)
res3 = layer_warp(block_func, res2, 256, stages[2], 2)
res4 = layer_warp(block_func, res3, 512, stages[3], 2)
pool2 = fluid.layers.pool2d(
input=res4,
pool_size=7,
pool_type='avg',
pool_stride=1,
global_pooling=True)
out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax')
return out
def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
assert (depth - 2) % 6 == 0
n = (depth - 2) // 6
conv1 = conv_bn_layer(
input=input, ch_out=16, filter_size=3, stride=1, padding=1)
res1 = layer_warp(basicblock, conv1, 16, n, 1)
res2 = layer_warp(basicblock, res1, 32, n, 2)
res3 = layer_warp(basicblock, res2, 64, n, 2)
pool = fluid.layers.pool2d(
input=res3, pool_size=8, pool_type='avg', pool_stride=1)
out = fluid.layers.fc(input=pool, size=class_dim, act='softmax')
return out
def _model_reader_dshape_classdim(args, is_train):
model = resnet_cifar10
reader = None
if args.data_set == "cifar10":
class_dim = 10
if args.data_format == 'NCHW':
dshape = [3, 32, 32]
else:
dshape = [32, 32, 3]
model = resnet_cifar10
if is_train:
reader = paddle.dataset.cifar.train10()
else:
reader = paddle.dataset.cifar.test10()
elif args.data_set == "flowers":
class_dim = 102
if args.data_format == 'NCHW':
dshape = [3, 224, 224]
else:
dshape = [224, 224, 3]
model = resnet_imagenet
if is_train:
reader = paddle.dataset.flowers.train()
else:
reader = paddle.dataset.flowers.test()
elif args.data_set == "imagenet":
class_dim = 1000
if args.data_format == 'NCHW':
dshape = [3, 224, 224]
else:
dshape = [224, 224, 3]
model = resnet_imagenet
if not args.data_path:
raise Exception(
"Must specify --data_path when training with imagenet")
if not args.use_reader_op:
if is_train:
reader = train_raw()
else:
reader = val()
else:
if is_train:
reader = train_raw()
else:
reader = val(xmap=False)
return model, reader, dshape, class_dim
def get_model(args, is_train, main_prog, startup_prog):
model, reader, dshape, class_dim = _model_reader_dshape_classdim(args,
is_train)
pyreader = None
trainer_count = int(os.getenv("PADDLE_TRAINERS"))
with fluid.program_guard(main_prog, startup_prog):
with fluid.unique_name.guard():
if args.use_reader_op:
pyreader = fluid.layers.py_reader(
capacity=args.batch_size * args.gpus,
shapes=([-1] + dshape, (-1, 1)),
dtypes=('uint8', 'int64'),
name="train_reader" if is_train else "test_reader",
use_double_buffer=True)
input, label = fluid.layers.read_file(pyreader)
else:
input = fluid.layers.data(
name='data', shape=dshape, dtype='uint8')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
# add imagenet preprocessors
random_crop = fluid.layers.random_crop(input, dshape)
casted = fluid.layers.cast(random_crop, 'float32')
# input is HWC
trans = fluid.layers.transpose(casted, [0, 3, 1, 2]) / 255.0
img_mean = fluid.layers.tensor.assign(
np.array([0.485, 0.456, 0.406]).astype('float32').reshape((3, 1,
1)))
img_std = fluid.layers.tensor.assign(
np.array([0.229, 0.224, 0.225]).astype('float32').reshape((3, 1,
1)))
h1 = fluid.layers.elementwise_sub(trans, img_mean, axis=1)
h2 = fluid.layers.elementwise_div(h1, img_std, axis=1)
# pre_out = (trans - img_mean) / img_std
predict = model(h2, class_dim, is_train=is_train)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
batch_acc1 = fluid.layers.accuracy(input=predict, label=label, k=1)
batch_acc5 = fluid.layers.accuracy(input=predict, label=label, k=5)
# configure optimize
optimizer = None
if is_train:
total_images = 1281167 / trainer_count
step = int(total_images / args.batch_size + 1)
epochs = [30, 60, 80, 90]
bd = [step * e for e in epochs]
base_lr = args.learning_rate
lr = []
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.Momentum(
learning_rate=base_lr,
#learning_rate=fluid.layers.piecewise_decay(
# boundaries=bd, values=lr),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
optimizer.minimize(avg_cost)
if args.memory_optimize:
fluid.memory_optimize(main_prog)
# config readers
if not args.use_reader_op:
batched_reader = paddle.batch(
reader if args.no_random else paddle.reader.shuffle(
reader, buf_size=5120),
batch_size=args.batch_size * args.gpus,
drop_last=True)
else:
batched_reader = None
pyreader.decorate_paddle_reader(
paddle.batch(
# reader if args.no_random else paddle.reader.shuffle(
# reader, buf_size=5120),
reader,
batch_size=args.batch_size))
return avg_cost, optimizer, [batch_acc1,
batch_acc5], batched_reader, pyreader
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.fluid as fluid
import math
import os
from imagenet_reader import train, val
__all__ = [
"SE_ResNeXt", "SE_ResNeXt50_32x4d", "SE_ResNeXt101_32x4d",
"SE_ResNeXt152_32x4d", "get_model"
]
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": 256,
"epochs": [30, 60, 90],
"steps": [0.1, 0.01, 0.001, 0.0001]
}
}
class SE_ResNeXt():
def __init__(self, layers=50, is_train=True):
self.params = train_parameters
self.layers = layers
self.is_train = is_train
def net(self, input, class_dim=1000):
layers = self.layers
supported_layers = [50, 101, 152]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
if layers == 50:
cardinality = 32
reduction_ratio = 16
depth = [3, 4, 6, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act='relu')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
elif layers == 101:
cardinality = 32
reduction_ratio = 16
depth = [3, 4, 23, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act='relu')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
elif layers == 152:
cardinality = 64
reduction_ratio = 16
depth = [3, 8, 36, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=3,
stride=2,
act='relu')
conv = self.conv_bn_layer(
input=conv, num_filters=64, filter_size=3, stride=1, act='relu')
conv = self.conv_bn_layer(
input=conv,
num_filters=128,
filter_size=3,
stride=1,
act='relu')
conv = fluid.layers.pool2d(
input=conv, pool_size=3, pool_stride=2, pool_padding=1, \
pool_type='max')
for block in range(len(depth)):
for i in range(depth[block]):
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
cardinality=cardinality,
reduction_ratio=reduction_ratio)
pool = fluid.layers.pool2d(
input=conv, pool_size=7, pool_type='avg', global_pooling=True)
drop = fluid.layers.dropout(x=pool, dropout_prob=0.5)
stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0)
out = fluid.layers.fc(input=drop,
size=class_dim,
act='softmax',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv,
stdv)))
return out
def shortcut(self, input, ch_out, stride):
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1:
filter_size = 1
return self.conv_bn_layer(input, ch_out, filter_size, stride)
else:
return input
def bottleneck_block(self, input, num_filters, stride, cardinality,
reduction_ratio):
conv0 = self.conv_bn_layer(
input=input, num_filters=num_filters, filter_size=1, act='relu')
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
groups=cardinality,
act='relu')
conv2 = self.conv_bn_layer(
input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
scale = self.squeeze_excitation(
input=conv2,
num_channels=num_filters * 2,
reduction_ratio=reduction_ratio)
short = self.shortcut(input, num_filters * 2, stride)
return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
def conv_bn_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) / 2,
groups=groups,
act=None,
bias_attr=False)
return fluid.layers.batch_norm(
input=conv, act=act, is_test=not self.is_train)
def squeeze_excitation(self, input, num_channels, reduction_ratio):
pool = fluid.layers.pool2d(
input=input, pool_size=0, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
squeeze = fluid.layers.fc(input=pool,
size=num_channels / reduction_ratio,
act='relu',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(
-stdv, stdv)))
stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0)
excitation = fluid.layers.fc(input=squeeze,
size=num_channels,
act='sigmoid',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(
-stdv, stdv)))
scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
return scale
def SE_ResNeXt50_32x4d():
model = SE_ResNeXt(layers=50)
return model
def SE_ResNeXt101_32x4d():
model = SE_ResNeXt(layers=101)
return model
def SE_ResNeXt152_32x4d():
model = SE_ResNeXt(layers=152)
return model
def get_model(args, is_train, main_prog, startup_prog):
model = SE_ResNeXt(layers=50)
batched_reader = None
pyreader = None
trainer_count = int(os.getenv("PADDLE_TRAINERS"))
dshape = train_parameters["input_size"]
with fluid.program_guard(main_prog, startup_prog):
with fluid.unique_name.guard():
if args.use_reader_op:
pyreader = fluid.layers.py_reader(
capacity=10,
shapes=([-1] + dshape, (-1, 1)),
dtypes=('float32', 'int64'),
name="train_reader" if is_train else "test_reader",
use_double_buffer=True)
input, label = fluid.layers.read_file(pyreader)
else:
input = fluid.layers.data(
name='data', shape=dshape, dtype='float32')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
out = model.net(input=input)
cost = fluid.layers.cross_entropy(input=out, label=label)
avg_cost = fluid.layers.mean(x=cost)
acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
optimizer = None
if is_train:
total_images = 1281167 / trainer_count
step = int(total_images / args.batch_size + 1)
epochs = [40, 80, 100]
bd = [step * e for e in epochs]
base_lr = args.learning_rate
lr = []
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.Momentum(
# learning_rate=base_lr,
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
optimizer.minimize(avg_cost)
if args.memory_optimize:
fluid.memory_optimize(main_prog)
# config readers
if is_train:
reader = train()
else:
reader = val()
if not args.use_reader_op:
batched_reader = paddle.batch(
reader, batch_size=args.batch_size * args.gpus, drop_last=True)
else:
pyreader.decorate_paddle_reader(
paddle.batch(
reader, batch_size=args.batch_size))
return avg_cost, optimizer, [acc_top1, acc_top5], batched_reader, pyreader
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import cPickle
import os
import random
import time
import numpy
import paddle
import paddle.dataset.imdb as imdb
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
word_dict = imdb.word_dict()
def crop_sentence(reader, crop_size):
unk_value = word_dict['<unk>']
def __impl__():
for item in reader():
if len([x for x in item[0] if x != unk_value]) < crop_size:
yield item
return __impl__
def lstm_net(sentence, lstm_size):
sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh')
rnn = fluid.layers.DynamicRNN()
with rnn.block():
word = rnn.step_input(sentence)
prev_hidden = rnn.memory(value=0.0, shape=[lstm_size])
prev_cell = rnn.memory(value=0.0, shape=[lstm_size])
def gate_common(
ipt,
hidden,
size, ):
gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True)
gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False)
gate = fluid.layers.sums(input=[gate0, gate1])
return gate
forget_gate = fluid.layers.sigmoid(
x=gate_common(word, prev_hidden, lstm_size))
input_gate = fluid.layers.sigmoid(
x=gate_common(word, prev_hidden, lstm_size))
output_gate = fluid.layers.sigmoid(
x=gate_common(word, prev_hidden, lstm_size))
cell_gate = fluid.layers.tanh(
x=gate_common(word, prev_hidden, lstm_size))
cell = fluid.layers.sums(input=[
fluid.layers.elementwise_mul(
x=forget_gate, y=prev_cell), fluid.layers.elementwise_mul(
x=input_gate, y=cell_gate)
])
hidden = fluid.layers.elementwise_mul(
x=output_gate, y=fluid.layers.tanh(x=cell))
rnn.update_memory(prev_cell, cell)
rnn.update_memory(prev_hidden, hidden)
rnn.output(hidden)
last = fluid.layers.sequence_pool(rnn(), 'last')
logit = fluid.layers.fc(input=last, size=2, act='softmax')
return logit
def get_model(args, is_train, main_prog, startup_prog):
if args.use_reader_op:
raise Exception(
"stacked_dynamic_lstm do not support reader op for now.")
lstm_size = 512
emb_dim = 512
crop_size = 1500
with fluid.program_guard(main_prog, startup_prog):
with fluid.unique_name.guard():
data = fluid.layers.data(
name="words", shape=[1], lod_level=1, dtype='int64')
sentence = fluid.layers.embedding(
input=data, size=[len(word_dict), emb_dim])
logit = lstm_net(sentence, lstm_size)
loss = fluid.layers.cross_entropy(
input=logit,
label=fluid.layers.data(
name='label', shape=[1], dtype='int64'))
loss = fluid.layers.mean(x=loss)
# add acc
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
shape=[1], dtype='int64'), total=batch_size_tensor)
if is_train:
adam = fluid.optimizer.Adam()
adam.minimize(loss)
if is_train:
reader = crop_sentence(imdb.train(word_dict), crop_size)
else:
reader = crop_sentence(imdb.test(word_dict), crop_size)
batched_reader = paddle.batch(
paddle.reader.shuffle(
reader, buf_size=25000),
batch_size=args.batch_size * args.gpus)
return loss, adam, [batch_acc], batched_reader, None
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""VGG16 benchmark in Fluid"""
from __future__ import print_function
import sys
import time
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import argparse
import functools
import os
def vgg16_bn_drop(input, is_train=True):
def conv_block(input, num_filter, groups, dropouts):
return fluid.nets.img_conv_group(
input=input,
pool_size=2,
pool_stride=2,
conv_num_filter=[num_filter] * groups,
conv_filter_size=3,
conv_act='relu',
conv_with_batchnorm=True,
conv_batchnorm_drop_rate=dropouts,
pool_type='max')
conv1 = conv_block(input, 64, 2, [0.3, 0])
conv2 = conv_block(conv1, 128, 2, [0.4, 0])
conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
fc1 = fluid.layers.fc(input=drop, size=512, act=None)
bn = fluid.layers.batch_norm(input=fc1, act='relu', is_test=not is_train)
drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
return fc2
def get_model(args, is_train, main_prog, startup_prog):
if args.data_set == "cifar10":
classdim = 10
if args.data_format == 'NCHW':
data_shape = [3, 32, 32]
else:
data_shape = [32, 32, 3]
else:
classdim = 102
if args.data_format == 'NCHW':
data_shape = [3, 224, 224]
else:
data_shape = [224, 224, 3]
filelist = [
os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
]
with fluid.program_guard(main_prog, startup_prog):
if args.use_reader_op:
data_file_handle = fluid.layers.open_files(
filenames=filelist,
shapes=[[-1] + data_shape, (-1, 1)],
lod_levels=[0, 0],
dtypes=["float32", "int64"],
thread_num=1,
pass_num=1)
data_file = fluid.layers.double_buffer(
fluid.layers.batch(
data_file_handle, batch_size=args.batch_size))
with fluid.unique_name.guard():
if args.use_reader_op:
images, label = fluid.layers.read_file(data_file)
else:
images = fluid.layers.data(
name='data', shape=data_shape, dtype='float32')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
# Train program
net = vgg16_bn_drop(images, is_train=is_train)
predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(
input=predict, label=label, total=batch_size_tensor)
# Optimization
if is_train:
optimizer = fluid.optimizer.Adam(
learning_rate=args.learning_rate)
optimizer.minimize(avg_cost)
# data reader
if is_train:
reader = paddle.dataset.cifar.train10() \
if args.data_set == 'cifar10' else paddle.dataset.flowers.train()
else:
reader = paddle.dataset.cifar.test10() \
if args.data_set == 'cifar10' else paddle.dataset.flowers.test()
batched_reader = paddle.batch(
paddle.reader.shuffle(
reader, buf_size=5120),
batch_size=args.batch_size * args.gpus)
return avg_cost, optimizer, [batch_acc], batched_reader, data_file_handle
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.dataset import mnist, cifar, flowers, image
def convert_2_recordio(py_reader, outfilepath, batch_size, shape_data,
shape_label):
num_batches = 0
with fluid.program_guard(fluid.Program(), fluid.Program()):
reader = paddle.batch(py_reader(), batch_size=batch_size)
feeder = fluid.DataFeeder(
feed_list=[ # order is image and label
fluid.layers.data(
name='image', shape=shape_data),
fluid.layers.data(
name='label', shape=shape_label, dtype='int64'),
],
place=fluid.CPUPlace())
num_batches = fluid.recordio_writer.convert_reader_to_recordio_file(
outfilepath, reader, feeder)
return num_batches
def prepare_mnist(outpath, batch_size):
outfilepath = os.path.join(outpath, "mnist.recordio")
convert_2_recordio(mnist.train, outfilepath, batch_size, [784], [1])
def prepare_cifar10(outpath, batch_size):
outfilepath = os.path.join(outpath, "cifar.recordio")
convert_2_recordio(cifar.train10, outfilepath, batch_size, [3, 32, 32], [1])
def prepare_flowers(outpath, batch_size):
outfilepath = os.path.join(outpath, "flowers.recordio")
convert_2_recordio(flowers.train, outfilepath, batch_size, [3, 224, 224],
[1])
def default_mapper(sample):
img, label = sample
img = image.simple_transform(
img, 256, 224, True, mean=[103.94, 116.78, 123.68])
return img.flatten().astype('float32'), label
def imagenet_train(data_dir):
contents = os.listdir(data_dir)
if set(contents) != set(
["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]):
raise Exception("Imagenet data contents error!")
img2label = dict()
imgfilelist = []
with open(os.path.join(data_dir, "train.txt")) as fn:
while 1:
l = fn.readline()
if not l:
break
img, lbl = l[:-1].split(" ")
img2label[img] = int(lbl)
imgfilelist.append(img)
# shuffle all, this is slow
random.shuffle(imgfilelist)
def train_reader():
for idx, imgfile in enumerate(imgfilelist):
data = image.load_image(
os.path.join(data_dir, "train", imgfile.lower()))
label = [img2label[imgfile], ]
yield [data, label]
return paddle.reader.map_readers(default_mapper, train_reader)
def imagenet_test(data_dir):
contents = os.listdir(data_dir)
if set(contents) != set(
["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]):
raise Exception("Imagenet data contents error!")
img2label = dict()
imgfilelist = []
with open(os.path.join(data_dir, "val.txt")) as fn:
while 1:
l = fn.readline()
if not l:
break
img, lbl = l[:-1].split(" ")
img2label[img] = int(lbl)
imgfilelist.append(img)
def test_reader():
for idx, imgfile in enumerate(imgfilelist):
base_path = os.path.join(data_dir, "val", imgfile.split(".")[0])
image_path = ".".join([base_path, "jpeg"])
data = image.load_image(image_path)
label = [img2label[imgfile], ]
yield [data, label]
return paddle.reader.map_readers(default_mapper, test_reader)
# FIXME(wuyi): delete this when https://github.com/PaddlePaddle/Paddle/pull/11066 is merged
def convert_reader_to_recordio_files(
filename,
batch_per_file,
reader_creator,
feeder,
compressor=core.RecordIOWriter.Compressor.Snappy,
max_num_records=1000,
feed_order=None):
if feed_order is None:
feed_order = feeder.feed_names
f_name, f_ext = os.path.splitext(filename)
assert (f_ext == ".recordio")
lines = []
f_idx = 0
counter = 0
for idx, batch in enumerate(reader_creator()):
lines.append(batch)
if idx >= batch_per_file and idx % batch_per_file == 0:
filename = "%s-%05d%s" % (f_name, f_idx, f_ext)
with fluid.recordio_writer.create_recordio_writer(
filename, compressor, max_num_records) as writer:
for l in lines:
res = feeder.feed(l)
for each in feed_order:
writer.append_tensor(res[each])
writer.complete_append_tensor()
counter += 1
lines = []
f_idx += 1
print("written file: ", filename)
return counter
def prepare_imagenet(inpath, outpath, batch_size):
r = paddle.batch(imagenet_train(inpath), batch_size=batch_size)
feeder = fluid.DataFeeder(
feed_list=[
fluid.layers.data(
name="image", shape=[3, 224, 224]), fluid.layers.data(
name="label", shape=[1], dtype='int64')
],
place=fluid.CPUPlace())
outpath = os.path.join(outpath, "imagenet.recordio")
convert_reader_to_recordio_files(outpath, 10000, r, feeder)
#!/bin/bash
# This script benchmarking the PaddlePaddle Fluid on
# single thread single GPU.
mkdir -p logs
#export FLAGS_fraction_of_gpu_memory_to_use=0.0
export CUDNN_PATH=/paddle/cudnn_v5
# disable openmp and mkl parallel
#https://github.com/PaddlePaddle/Paddle/issues/7199
export MKL_NUM_THREADS=1
export OMP_NUM_THREADS=1
ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs`
if [ $ht -eq 1 ]; then # HT is OFF
if [ -z "$KMP_AFFINITY" ]; then
export KMP_AFFINITY="granularity=fine,compact,0,0"
fi
if [ -z "$OMP_DYNAMIC" ]; then
export OMP_DYNAMIC="FALSE"
fi
else # HT is ON
if [ -z "$KMP_AFFINITY" ]; then
export KMP_AFFINITY="granularity=fine,compact,1,0"
fi
fi
# disable multi-gpu if have more than one
export CUDA_VISIBLE_DEVICES=0
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH
# only query the gpu used
nohup stdbuf -oL nvidia-smi \
--id=${CUDA_VISIBLE_DEVICES} \
--query-gpu=timestamp \
--query-compute-apps=pid,process_name,used_memory \
--format=csv \
--filename=mem.log \
-l 1 &
# mnist
# mnist gpu mnist 128
FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--model=mnist \
--device=GPU \
--batch_size=128 \
--skip_batch_num=5 \
--iterations=500 \
2>&1 | tee -a logs/mnist_gpu_128.log
# vgg16
# gpu cifar10 128
FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--model=vgg16 \
--device=GPU \
--batch_size=128 \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a logs/vgg16_gpu_128.log
# flowers gpu 128
FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--model=vgg16 \
--device=GPU \
--batch_size=32 \
--data_set=flowers \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a logs/vgg16_gpu_flowers_32.log
# resnet50
# resnet50 gpu cifar10 128
FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--model=resnet \
--device=GPU \
--batch_size=128 \
--data_set=cifar10 \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a logs/resnet50_gpu_128.log
# resnet50 gpu flowers 64
FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--model=resnet \
--device=GPU \
--batch_size=64 \
--data_set=flowers \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a logs/resnet50_gpu_flowers_64.log
# lstm
# lstm gpu imdb 32 # tensorflow only support batch=32
FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--model=stacked_dynamic_lstm \
--device=GPU \
--batch_size=32 \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a logs/lstm_gpu_32.log
# seq2seq
# seq2seq gpu wmb 128
FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--model=machine_translation \
--device=GPU \
--batch_size=128 \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a logs/lstm_gpu_128.log
#!/bin/bash
PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device CPU --update_method pserver --iterations=10000 &
sleep 15
CUDA_VISIBLE_DEVICES=0,1 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 &
CUDA_VISIBLE_DEVICES=2,3 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=1 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 &
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from six.moves import xrange # pylint: disable=redefined-builtin
from datetime import datetime
import math
import time
import tensorflow.python.platform
import tensorflow as tf
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('batch_size', 128, """Batch size.""")
tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
tf.app.flags.DEFINE_boolean('forward_only', False,
"""Only run the forward pass.""")
tf.app.flags.DEFINE_boolean('forward_backward_only', False,
"""Only run the forward-forward pass.""")
tf.app.flags.DEFINE_string('data_format', 'NCHW',
"""The data format for Convnet operations.
Can be either NHWC or NCHW.
""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
"""Whether to log device placement.""")
def _conv(name, inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.0005):
with tf.name_scope(name) as scope:
kernel = tf.get_variable(
name + '_w', [kH, kW, nIn, nOut],
initializer=tf.truncated_normal_initializer(
stddev=0.01, dtype=tf.float32),
dtype=tf.float32)
if wd is not None and wd > 0:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
if FLAGS.data_format == 'NCHW':
strides = [1, 1, dH, dW]
else:
strides = [1, dH, dW, 1]
conv = tf.nn.conv2d(
inpOp,
kernel,
strides,
padding=padType,
data_format=FLAGS.data_format)
biases = tf.get_variable(
name=name + '_b',
shape=[nOut],
initializer=tf.constant_initializer(
value=0.0, dtype=tf.float32),
dtype=tf.float32)
bias = tf.reshape(
tf.nn.bias_add(
conv, biases, data_format=FLAGS.data_format),
conv.get_shape())
conv1 = tf.nn.relu(bias, name=scope)
return conv1
def _affine(name, inpOp, nIn, nOut, wd=0.0005, act=True, drop=None):
with tf.name_scope(name) as scope:
kernel = tf.get_variable(
name + '_w', [nIn, nOut],
initializer=tf.truncated_normal_initializer(
stddev=0.01, dtype=tf.float32),
dtype=tf.float32)
if wd is not None and wd > 0:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
biases = tf.get_variable(
name + '_b', [nOut],
initializer=tf.constant_initializer(
value=0.0, dtype=tf.float32),
dtype=tf.float32,
trainable=True)
affine1 = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \
tf.matmul(inpOp, kernel) + biases
output = tf.nn.dropout(affine1, drop) if drop else affine1
return output
def _mpool(name, inpOp, kH, kW, dH, dW):
if FLAGS.data_format == 'NCHW':
ksize = [1, 1, kH, kW]
strides = [1, 1, dH, dW]
else:
ksize = [1, kH, kW, 1]
strides = [1, dH, dW, 1]
return tf.nn.max_pool(
inpOp,
ksize=ksize,
strides=strides,
padding='VALID',
data_format=FLAGS.data_format,
name=name)
def _norm(name, l_input, lsize=4):
return tf.nn.lrn(l_input,
lsize,
bias=1.0,
alpha=0.001 / 9.0,
beta=0.75,
name=name)
def loss(logits, labels):
labels = tf.cast(labels, tf.int64)
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
logits, labels, name='cross_entropy_per_example')
cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
tf.add_to_collection('losses', cross_entropy_mean)
# The total loss is defined as the cross entropy loss plus all of the weight
# decay terms (L2 loss).
return tf.add_n(tf.get_collection('losses'), name='total_loss')
def get_incoming_shape(incoming):
""" Returns the incoming data shape """
if isinstance(incoming, tf.Tensor):
return incoming.get_shape().as_list()
elif type(incoming) in [np.array, list, tuple]:
return np.shape(incoming)
else:
raise Exception("Invalid incoming layer.")
def inference(images):
conv1 = _conv('conv1', images, 3, 96, 11, 11, 4, 4, 'VALID')
pool1 = _mpool('pool1', conv1, 3, 3, 2, 2)
norm1 = _norm('norm1', pool1, lsize=5)
conv2 = _conv('conv2', norm1, 96, 256, 5, 5, 1, 1, 'SAME')
pool2 = _mpool('pool2', conv2, 3, 3, 2, 2)
norm2 = _norm('norm2', pool2, lsize=5)
conv3 = _conv('conv3', norm2, 256, 384, 3, 3, 1, 1, 'SAME')
conv4 = _conv('conv4', conv3, 384, 384, 3, 3, 1, 1, 'SAME')
conv5 = _conv('conv5', conv4, 384, 256, 3, 3, 1, 1, 'SAME')
pool5 = _mpool('pool5', conv5, 3, 3, 2, 2)
resh1 = tf.reshape(pool5, [-1, 256 * 6 * 6])
affn1 = _affine('fc6', resh1, 256 * 6 * 6, 4096, 0.5)
affn2 = _affine('fc7', affn1, 4096, 4096, 0.5)
affn3 = _affine('fc8', affn2, 4096, 1000, wd=None, act=False) # last fc
return affn3
def time_tensorflow_run(session, target, info_string):
num_steps_burn_in = 10
total_duration = 0.0
total_duration_squared = 0.0
if not isinstance(target, list):
target = [target]
target_op = tf.group(*target)
for i in xrange(FLAGS.num_batches + num_steps_burn_in):
start_time = time.time()
_ = session.run(target_op)
duration = time.time() - start_time
if i > num_steps_burn_in:
if not i % 10:
print('%s: step %d, duration = %.3f' %
(datetime.now(), i - num_steps_burn_in, duration))
total_duration += duration
total_duration_squared += duration * duration
mn = total_duration / FLAGS.num_batches
vr = total_duration_squared / FLAGS.num_batches - mn * mn
sd = math.sqrt(vr)
print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' %
(datetime.now(), info_string, FLAGS.num_batches, mn, sd))
def _add_loss_summaries(total_loss):
"""
Generates moving average for all losses and associated summaries for
visualizing the performance of the network.
Args:
total_loss: Total loss from loss().
Returns:
loss_averages_op: op for generating moving averages of losses.
"""
# Compute the moving average of all individual losses and the total loss.
loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
losses = tf.get_collection('losses')
loss_averages_op = loss_averages.apply(losses + [total_loss])
# Attach a scalar summary to all individual losses and the total loss; do the
# same for the averaged version of the losses.
for l in losses + [total_loss]:
# Name each loss as '(raw)' and name the moving average version of the loss
# as the original loss name.
tf.scalar_summary(l.op.name + ' (raw)', l)
tf.scalar_summary(l.op.name, loss_averages.average(l))
return loss_averages_op
def run_benchmark():
with tf.Graph().as_default():
with tf.device('/gpu:0'):
# Generate some dummy images.
image_size = 224
# Note that our padding definition is slightly different the cuda-convnet.
# In order to force the model to start with the same activations sizes,
# we add 3 to the image_size and employ VALID padding above.
if FLAGS.data_format == 'NCHW':
image_shape = [
FLAGS.batch_size, 3, image_size + 3, image_size + 3
]
else:
image_shape = [
FLAGS.batch_size, image_size + 3, image_size + 3, 3
]
images = tf.get_variable(
'image',
image_shape,
initializer=tf.truncated_normal_initializer(
stddev=0.1, dtype=tf.float32),
dtype=tf.float32,
trainable=False)
labels = tf.get_variable(
'label', [FLAGS.batch_size],
initializer=tf.constant_initializer(1),
dtype=tf.int32,
trainable=False)
# Build a Graph that computes the logits predictions from the
# inference model.
last_layer = inference(images)
objective = loss(last_layer, labels)
# Compute the gradient with respect to all the parameters.
# Compute gradients.
# opt = tf.train.GradientDescentOptimizer(0.001)
opt = tf.train.MomentumOptimizer(0.001, 0.9)
grads = opt.compute_gradients(objective)
global_step = tf.get_variable(
'global_step', [],
initializer=tf.constant_initializer(
0.0, dtype=tf.float32),
trainable=False,
dtype=tf.float32)
apply_gradient_op = opt.apply_gradients(
grads, global_step=global_step)
# Track the moving averages of all trainable variables.
variable_averages = tf.train.ExponentialMovingAverage(0.9,
global_step)
variables_averages_op = variable_averages.apply(
tf.trainable_variables())
# Build an initialization operation.
init = tf.initialize_all_variables()
# Start running operations on the Graph.
sess = tf.Session(config=tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=FLAGS.log_device_placement))
sess.run(init)
run_forward = True
run_forward_backward = True
if FLAGS.forward_only and FLAGS.forward_backward_only:
raise ValueError("Cannot specify --forward_only and "
"--forward_backward_only at the same time.")
if FLAGS.forward_only:
run_forward_backward = False
elif FLAGS.forward_backward_only:
run_forward = False
if run_forward:
time_tensorflow_run(sess, last_layer, "Forward")
if run_forward_backward:
with tf.control_dependencies(
[apply_gradient_op, variables_averages_op]):
train_op = tf.no_op(name='train')
time_tensorflow_run(sess, [train_op, objective],
"Forward-backward")
def main(_):
run_benchmark()
if __name__ == '__main__':
tf.app.run()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from six.moves import xrange # pylint: disable=redefined-builtin
from datetime import datetime
import math
import re
import time
import tensorflow.python.platform
import tensorflow as tf
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('batch_size', 64, """Batch size.""")
tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
tf.app.flags.DEFINE_string('data_format', 'NCHW',
"""The data format for Convnet operations.
Can be either NHWC or NCHW.
""")
tf.app.flags.DEFINE_string('train_dir', '/train_model',
"""Directory where to write event logs """
"""and checkpoint.""")
tf.app.flags.DEFINE_integer('num_gpus', 4, """How many GPUs to use.""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
"""Whether to log device placement.""")
NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
NUM_EPOCHS_PER_DECAY = 50
INITIAL_LEARNING_RATE = 0.1
LEARNING_RATE_DECAY_FACTOR = 0.1
TOWER_NAME = 'tower'
def _conv(name, inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.005):
with tf.name_scope(name) as scope:
kernel = tf.get_variable(
name + '_w', [kH, kW, nIn, nOut],
initializer=tf.truncated_normal_initializer(
stddev=0.01, dtype=tf.float32),
dtype=tf.float32)
if wd is not None:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
if FLAGS.data_format == 'NCHW':
strides = [1, 1, dH, dW]
else:
strides = [1, dH, dW, 1]
conv = tf.nn.conv2d(
inpOp,
kernel,
strides,
padding=padType,
data_format=FLAGS.data_format)
biases = tf.get_variable(
name=name + '_b',
shape=[nOut],
initializer=tf.constant_initializer(
value=0.0, dtype=tf.float32),
dtype=tf.float32)
bias = tf.reshape(
tf.nn.bias_add(
conv, biases, data_format=FLAGS.data_format),
conv.get_shape())
conv1 = tf.nn.relu(bias, name=scope)
return conv1
def _affine(name, inpOp, nIn, nOut, wd=0.005, act=True):
with tf.name_scope(name) as scope:
kernel = tf.get_variable(
name + '_w', [nIn, nOut],
initializer=tf.truncated_normal_initializer(
stddev=0.01, dtype=tf.float32),
dtype=tf.float32)
if wd is not None:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
biases = tf.get_variable(
name + '_b', [nOut],
initializer=tf.constant_initializer(
value=0.0, dtype=tf.float32),
dtype=tf.float32,
trainable=True)
affine1 = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \
tf.matmul(inpOp, kernel) + biases
return affine1
def _mpool(name, inpOp, kH, kW, dH, dW):
if FLAGS.data_format == 'NCHW':
ksize = [1, 1, kH, kW]
strides = [1, 1, dH, dW]
else:
ksize = [1, kH, kW, 1]
strides = [1, dH, dW, 1]
return tf.nn.max_pool(
inpOp,
ksize=ksize,
strides=strides,
padding='VALID',
data_format=FLAGS.data_format,
name=name)
def _norm(name, l_input, lsize=4):
return tf.nn.lrn(l_input,
lsize,
bias=1.0,
alpha=0.001 / 9.0,
beta=0.75,
name=name)
def loss(logits, labels):
labels = tf.cast(labels, tf.int64)
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
logits, labels, name='cross_entropy_per_example')
cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
tf.add_to_collection('losses', cross_entropy_mean)
# The total loss is defined as the cross entropy loss plus all of the weight
# decay terms (L2 loss).
return tf.add_n(tf.get_collection('losses'), name='total_loss')
def get_incoming_shape(incoming):
""" Returns the incoming data shape """
if isinstance(incoming, tf.Tensor):
return incoming.get_shape().as_list()
elif type(incoming) in [np.array, list, tuple]:
return np.shape(incoming)
else:
raise Exception("Invalid incoming layer.")
def inference(images):
conv1 = _conv('conv1', images, 3, 96, 11, 11, 4, 4, 'VALID')
pool1 = _mpool('pool1', conv1, 3, 3, 2, 2)
norm1 = _norm('norm1', pool1, lsize=5)
conv2 = _conv('conv2', norm1, 96, 256, 5, 5, 1, 1, 'SAME')
pool2 = _mpool('pool2', conv2, 3, 3, 2, 2)
norm2 = _norm('norm2', pool2, lsize=5)
conv3 = _conv('conv3', norm2, 256, 384, 3, 3, 1, 1, 'SAME')
conv4 = _conv('conv4', conv3, 384, 384, 3, 3, 1, 1, 'SAME')
conv5 = _conv('conv5', conv4, 384, 256, 3, 3, 1, 1, 'SAME')
pool5 = _mpool('pool5', conv5, 3, 3, 2, 2)
resh1 = tf.reshape(pool5, [-1, 256 * 6 * 6])
affn1 = _affine('fc6', resh1, 256 * 6 * 6, 4096)
affn2 = _affine('fc7', affn1, 4096, 4096)
affn3 = _affine('fc8', affn2, 4096, 1000, wd=None, act=False) # last fc
return affn3
def tower_loss(scope):
"""Calculate the total loss on a single tower running the model.
Args:
scope: unique prefix string identifying the tower, e.g. 'tower_0'
Returns:
Tensor of shape [] containing the total loss for a batch of data
"""
image_size = 224
if FLAGS.data_format == 'NCHW':
image_shape = [FLAGS.batch_size, 3, image_size + 3, image_size + 3]
else:
image_shape = [FLAGS.batch_size, image_size + 3, image_size + 3, 3]
images = tf.get_variable(
'image',
image_shape,
initializer=tf.truncated_normal_initializer(
stddev=0.1, dtype=tf.float32),
dtype=tf.float32,
trainable=False)
labels = tf.get_variable(
'label', [FLAGS.batch_size],
initializer=tf.constant_initializer(1),
dtype=tf.int32,
trainable=False)
# Build a Graph that computes the logits predictions from the
# inference model.
last_layer = inference(images)
# Build the portion of the Graph calculating the losses. Note that we will
# assemble the total_loss using a custom function below.
_ = loss(last_layer, labels)
# Assemble all of the losses for the current tower only.
losses = tf.get_collection('losses', scope)
# Calculate the total loss for the current tower.
total_loss = tf.add_n(losses, name='total_loss')
# Compute the moving average of all individual losses and the total loss.
loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
loss_averages_op = loss_averages.apply(losses + [total_loss])
# Attach a scalar summary to all individual losses and the total loss; do the
# same for the averaged version of the losses.
for l in losses + [total_loss]:
# Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
# session. This helps the clarity of presentation on tensorboard.
loss_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', l.op.name)
# Name each loss as '(raw)' and name the moving average version of the loss
# as the original loss name.
tf.scalar_summary(loss_name + ' (raw)', l)
tf.scalar_summary(loss_name, loss_averages.average(l))
with tf.control_dependencies([loss_averages_op]):
total_loss = tf.identity(total_loss)
return total_loss
def average_gradients(tower_grads):
"""Calculate the average gradient for each shared variable across all towers.
Note that this function provides a synchronization point across all towers.
Args:
tower_grads: List of lists of (gradient, variable) tuples. The outer list
is over individual gradients. The inner list is over the gradient
calculation for each tower.
Returns:
List of pairs of (gradient, variable) where the gradient has been averaged
across all towers.
"""
average_grads = []
for grad_and_vars in zip(*tower_grads):
# Note that each grad_and_vars looks like the following:
# ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
grads = []
for g, _ in grad_and_vars:
# Add 0 dimension to the gradients to represent the tower.
expanded_g = tf.expand_dims(g, 0)
# Append on a 'tower' dimension which we will average over below.
grads.append(expanded_g)
# Average over the 'tower' dimension.
grad = tf.concat(0, grads)
grad = tf.reduce_mean(grad, 0)
# Keep in mind that the Variables are redundant because they are shared
# across towers. So .. we will just return the first tower's pointer to
# the Variable.
v = grad_and_vars[0][1]
grad_and_var = (grad, v)
average_grads.append(grad_and_var)
return average_grads
def time_tensorflow_run(session, target):
num_steps_burn_in = 50
total_duration = 0.0
total_duration_squared = 0.0
for i in xrange(FLAGS.num_batches + num_steps_burn_in):
start_time = time.time()
_, loss_value = session.run(target)
duration = time.time() - start_time
if i > num_steps_burn_in:
if not i % 10:
num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
examples_per_sec = num_examples_per_step / duration
sec_per_batch = duration
format_str = (
'%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
'sec/batch batch_size = %d)')
print(format_str %
(datetime.now(), i - num_steps_burn_in, loss_value,
duration, sec_per_batch, num_examples_per_step))
total_duration += duration
total_duration_squared += duration * duration
mn = total_duration / FLAGS.num_batches
vr = total_duration_squared / FLAGS.num_batches - mn * mn
sd = math.sqrt(vr)
print('%s: FwdBwd across %d steps, %.3f +/- %.3f sec / batch' %
(datetime.now(), FLAGS.num_batches, mn, sd))
def run_benchmark():
with tf.Graph().as_default(), tf.device('/cpu:0'):
# Create a variable to count the number of train() calls. This equals the
# number of batches processed * FLAGS.num_gpus.
global_step = tf.get_variable(
'global_step', [],
initializer=tf.constant_initializer(0),
trainable=False)
# Calculate the learning rate schedule.
num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
FLAGS.batch_size)
decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
# Decay the learning rate exponentially based on the number of steps.
lr = tf.train.exponential_decay(
INITIAL_LEARNING_RATE,
global_step,
decay_steps,
LEARNING_RATE_DECAY_FACTOR,
staircase=True)
# Create an optimizer that performs gradient descent.
opt = tf.train.MomentumOptimizer(lr, 0.9)
# Calculate the gradients for each model tower.
tower_grads = []
for i in xrange(FLAGS.num_gpus):
with tf.device('/gpu:%d' % i):
with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
# Calculate the loss for one tower of the model. This function
# constructs the entire model but shares the variables across
# all towers.
loss = tower_loss(scope)
# Reuse variables for the next tower.
tf.get_variable_scope().reuse_variables()
# Retain the summaries from the final tower.
summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
# Calculate the gradients for the batch of data on this tower.
grads = opt.compute_gradients(loss)
# Keep track of the gradients across all towers.
tower_grads.append(grads)
# We must calculate the mean of each gradient. Note that this is the
# synchronization point across all towers.
grads = average_gradients(tower_grads)
# Apply the gradients to adjust the shared variables.
apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
# Group all updates to into a single train op.
train_op = tf.group(apply_gradient_op)
# Build an initialization operation.
init = tf.initialize_all_variables()
# Start running operations on the Graph. allow_soft_placement must be set to
# True to build towers on GPU, as some of the ops do not have GPU
# implementations.
sess = tf.Session(config=tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=FLAGS.log_device_placement))
sess.run(init)
time_tensorflow_run(sess, [train_op, loss])
def main(_):
run_benchmark()
if __name__ == '__main__':
tf.app.run()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from six.moves import xrange
from datetime import datetime
import math
import time
import tensorflow.python.platform
import tensorflow as tf
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('batch_size', 128, """Batch size.""")
tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
tf.app.flags.DEFINE_boolean('forward_only', False,
"""Only run the forward pass.""")
tf.app.flags.DEFINE_boolean('forward_backward_only', False,
"""Only run the forward-forward pass.""")
tf.app.flags.DEFINE_string('data_format', 'NCHW',
"""The data format for Convnet operations.
Can be either NHWC or NCHW.
""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
"""Whether to log device placement.""")
parameters = []
conv_counter = 1
pool_counter = 1
affine_counter = 1
def _conv(inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.0005):
global conv_counter
global parameters
name = 'conv' + str(conv_counter)
conv_counter += 1
with tf.name_scope(name) as scope:
kernel = tf.Variable(
tf.truncated_normal(
[kH, kW, nIn, nOut], dtype=tf.float32, stddev=1e-1),
name='weights')
if wd is not None and wd > 0:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
if FLAGS.data_format == 'NCHW':
strides = [1, 1, dH, dW]
else:
strides = [1, dH, dW, 1]
conv = tf.nn.conv2d(
inpOp,
kernel,
strides,
padding=padType,
data_format=FLAGS.data_format)
biases = tf.Variable(
tf.constant(
0.0, shape=[nOut], dtype=tf.float32),
trainable=True,
name='biases')
bias = tf.reshape(
tf.nn.bias_add(
conv, biases, data_format=FLAGS.data_format),
conv.get_shape())
conv1 = tf.nn.relu(bias, name=scope)
parameters += [kernel, biases]
return conv1
def _affine(inpOp, nIn, nOut, act=True, wd=0.0005):
global affine_counter
global parameters
name = 'affine' + str(affine_counter)
affine_counter += 1
with tf.name_scope(name) as scope:
kernel = tf.Variable(
tf.truncated_normal(
[nIn, nOut], dtype=tf.float32, stddev=1e-1),
name='weights')
if wd is not None and wd > 0:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
biases = tf.Variable(
tf.constant(
0.0, shape=[nOut], dtype=tf.float32),
trainable=True,
name='biases')
affine1 = tf.nn.relu_layer(
inpOp, kernel, biases,
name=name) if act else tf.matmul(inpOp, kernel) + biases
parameters += [kernel, biases]
return affine1
def _mpool(inpOp, kH, kW, dH, dW, padding):
global pool_counter
global parameters
name = 'pool' + str(pool_counter)
pool_counter += 1
if FLAGS.data_format == 'NCHW':
ksize = [1, 1, kH, kW]
strides = [1, 1, dH, dW]
else:
ksize = [1, kH, kW, 1]
strides = [1, dH, dW, 1]
return tf.nn.max_pool(
inpOp,
ksize=ksize,
strides=strides,
padding=padding,
data_format=FLAGS.data_format,
name=name)
def _apool(inpOp, kH, kW, dH, dW, padding):
global pool_counter
global parameters
name = 'pool' + str(pool_counter)
pool_counter += 1
if FLAGS.data_format == 'NCHW':
ksize = [1, 1, kH, kW]
strides = [1, 1, dH, dW]
else:
ksize = [1, kH, kW, 1]
strides = [1, dH, dW, 1]
return tf.nn.avg_pool(
inpOp,
ksize=ksize,
strides=strides,
padding=padding,
data_format=FLAGS.data_format,
name=name)
def _inception(inp, inSize, o1s, o2s1, o2s2, o3s1, o3s2, o4s1, o4s2):
conv1 = _conv(inp, inSize, o1s, 1, 1, 1, 1, 'VALID')
conv3_ = _conv(inp, inSize, o2s1, 1, 1, 1, 1, 'VALID')
conv3 = _conv(conv3_, o2s1, o2s2, 3, 3, 1, 1, 'SAME')
conv5_ = _conv(inp, inSize, o3s1, 1, 1, 1, 1, 'VALID')
conv5 = _conv(conv5_, o3s1, o3s2, 5, 5, 1, 1, 'SAME')
pool_ = _mpool(inp, o4s1, o4s1, 1, 1, 'SAME')
pool = _conv(pool_, inSize, o4s2, 1, 1, 1, 1, 'VALID')
if FLAGS.data_format == 'NCHW':
channel_dim = 1
else:
channel_dim = 3
incept = tf.concat(channel_dim, [conv1, conv3, conv5, pool])
return incept
def loss(logits, labels):
batch_size = tf.size(labels)
labels = tf.expand_dims(labels, 1)
indices = tf.expand_dims(tf.range(0, batch_size, 1), 1)
concated = tf.concat(1, [indices, labels])
onehot_labels = tf.sparse_to_dense(concated,
tf.pack([batch_size, 1000]), 1.0, 0.0)
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
logits, onehot_labels, name='xentropy')
loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')
return loss
def inference(images):
# stage 1
conv1 = _conv(images, 3, 64, 7, 7, 2, 2, 'SAME')
pool1 = _mpool(conv1, 3, 3, 2, 2, 'SAME')
# stage 2
conv2 = _conv(pool1, 64, 64, 1, 1, 1, 1, 'VALID')
conv3 = _conv(conv2, 64, 192, 3, 3, 1, 1, 'SAME')
pool3 = _mpool(conv3, 3, 3, 2, 2, 'SAME')
# stage 3
incept3a = _inception(pool3, 192, 64, 96, 128, 16, 32, 3, 32)
incept3b = _inception(incept3a, 256, 128, 128, 192, 32, 96, 3, 64)
pool4 = _mpool(incept3b, 3, 3, 2, 2, 'SAME')
# stage 4
incept4a = _inception(pool4, 480, 192, 96, 208, 16, 48, 3, 64)
incept4b = _inception(incept4a, 512, 160, 112, 224, 24, 64, 3, 64)
incept4c = _inception(incept4b, 512, 128, 128, 256, 24, 64, 3, 64)
incept4d = _inception(incept4c, 512, 112, 144, 288, 32, 64, 3, 64)
incept4e = _inception(incept4d, 528, 256, 160, 320, 32, 128, 3, 128)
pool5 = _mpool(incept4e, 3, 3, 2, 2, 'SAME')
# stage 5
incept5a = _inception(pool5, 832, 256, 160, 320, 32, 128, 3, 128)
incept5b = _inception(incept5a, 832, 384, 192, 384, 48, 128, 3, 128)
pool6 = _apool(incept5b, 7, 7, 1, 1, 'VALID')
# output 1
resh1 = tf.reshape(pool6, [-1, 1024])
drop = tf.nn.dropout(resh1, 0.4)
affn1 = _affine(resh1, 1024, 1000, act=False)
return affn1
def time_tensorflow_run(session, target, info_string):
num_steps_burn_in = 10
total_duration = 0.0
total_duration_squared = 0.0
if not isinstance(target, list):
target = [target]
target_op = tf.group(*target)
for i in range(FLAGS.num_batches + num_steps_burn_in):
start_time = time.time()
_ = session.run(target_op)
duration = time.time() - start_time
if i > num_steps_burn_in:
if not i % 10:
print('%s: step %d, duration = %.3f' %
(datetime.now(), i - num_steps_burn_in, duration))
total_duration += duration
total_duration_squared += duration * duration
mn = total_duration / FLAGS.num_batches
vr = total_duration_squared / FLAGS.num_batches - mn * mn
sd = math.sqrt(vr)
print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' %
(datetime.now(), info_string, FLAGS.num_batches, mn, sd))
def run_benchmark():
global parameters
with tf.Graph().as_default():
# Generate some dummy images.
image_size = 224
if FLAGS.data_format == 'NCHW':
image_shape = [FLAGS.batch_size, 3, image_size, image_size]
else:
image_shape = [FLAGS.batch_size, image_size, image_size, 3]
images = tf.get_variable(
'image',
image_shape,
initializer=tf.truncated_normal_initializer(
stddev=0.1, dtype=tf.float32),
dtype=tf.float32,
trainable=False)
labels = tf.get_variable(
'label', [FLAGS.batch_size],
initializer=tf.constant_initializer(1),
dtype=tf.int32,
trainable=False)
# Build a Graph that computes the logits predictions from the
# inference model.
last_layer = inference(images)
objective = loss(last_layer, labels)
# Compute gradients.
# opt = tf.train.GradientDescentOptimizer(0.001)
opt = tf.train.MomentumOptimizer(0.001, 0.9)
grads = opt.compute_gradients(objective)
global_step = tf.get_variable(
'global_step', [],
initializer=tf.constant_initializer(
0.0, dtype=tf.float32),
trainable=False,
dtype=tf.float32)
apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
# Track the moving averages of all trainable variables.
variable_averages = tf.train.ExponentialMovingAverage(0.9, global_step)
variables_averages_op = variable_averages.apply(tf.trainable_variables(
))
# Build an initialization operation.
init = tf.initialize_all_variables()
# Start running operations on the Graph.
sess = tf.Session(config=tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=FLAGS.log_device_placement))
sess.run(init)
run_forward = True
run_forward_backward = True
if FLAGS.forward_only and FLAGS.forward_backward_only:
raise ValueError("Cannot specify --forward_only and "
"--forward_backward_only at the same time.")
if FLAGS.forward_only:
run_forward_backward = False
elif FLAGS.forward_backward_only:
run_forward = False
if run_forward:
# Run the forward benchmark.
time_tensorflow_run(sess, last_layer, "Forward")
if run_forward_backward:
with tf.control_dependencies(
[apply_gradient_op, variables_averages_op]):
train_op = tf.no_op(name='train')
time_tensorflow_run(sess, [train_op, objective], "Forward-backward")
def main(_):
run_benchmark()
if __name__ == '__main__':
tf.app.run()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from six.moves import xrange # pylint: disable=redefined-builtin
from datetime import datetime
import math
import re
import time
import tensorflow.python.platform
import tensorflow as tf
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('batch_size', 64, """Batch size.""")
tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
tf.app.flags.DEFINE_string('data_format', 'NCHW',
"""The data format for Convnet operations.
Can be either NHWC or NCHW.
""")
tf.app.flags.DEFINE_string('train_dir', '/train_model',
"""Directory where to write event logs """
"""and checkpoint.""")
tf.app.flags.DEFINE_integer('num_gpus', 4, """How many GPUs to use.""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
"""Whether to log device placement.""")
NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
NUM_EPOCHS_PER_DECAY = 50
INITIAL_LEARNING_RATE = 0.1
LEARNING_RATE_DECAY_FACTOR = 0.1
TOWER_NAME = 'tower'
def _conv(name, inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.005):
with tf.name_scope(name) as scope:
kernel = tf.get_variable(
name + '_w', [kH, kW, nIn, nOut],
initializer=tf.truncated_normal_initializer(
stddev=0.01, dtype=tf.float32),
dtype=tf.float32)
if wd is not None:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
if FLAGS.data_format == 'NCHW':
strides = [1, 1, dH, dW]
else:
strides = [1, dH, dW, 1]
conv = tf.nn.conv2d(
inpOp,
kernel,
strides,
padding=padType,
data_format=FLAGS.data_format)
biases = tf.get_variable(
name=name + '_b',
shape=[nOut],
initializer=tf.constant_initializer(
value=0.0, dtype=tf.float32),
dtype=tf.float32)
bias = tf.reshape(
tf.nn.bias_add(
conv, biases, data_format=FLAGS.data_format),
conv.get_shape())
conv1 = tf.nn.relu(bias, name=scope)
return conv1
def _affine(name, inpOp, nIn, nOut, wd=0.005, act=True):
with tf.name_scope(name) as scope:
kernel = tf.get_variable(
name + '_w', [nIn, nOut],
initializer=tf.truncated_normal_initializer(
stddev=0.01, dtype=tf.float32),
dtype=tf.float32)
if wd is not None:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
biases = tf.get_variable(
name + '_b', [nOut],
initializer=tf.constant_initializer(
value=0.0, dtype=tf.float32),
dtype=tf.float32,
trainable=True)
affine1 = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \
tf.matmul(inpOp, kernel) + biases
return affine1
def _mpool(name, inpOp, kH, kW, dH, dW, padding):
if FLAGS.data_format == 'NCHW':
ksize = [1, 1, kH, kW]
strides = [1, 1, dH, dW]
else:
ksize = [1, kH, kW, 1]
strides = [1, dH, dW, 1]
return tf.nn.max_pool(
inpOp,
ksize=ksize,
strides=strides,
padding=padding,
data_format=FLAGS.data_format,
name=name)
def _apool(name, inpOp, kH, kW, dH, dW, padding):
if FLAGS.data_format == 'NCHW':
ksize = [1, 1, kH, kW]
strides = [1, 1, dH, dW]
else:
ksize = [1, kH, kW, 1]
strides = [1, dH, dW, 1]
return tf.nn.avg_pool(
inpOp,
ksize=ksize,
strides=strides,
padding=padding,
data_format=FLAGS.data_format,
name=name)
def loss(logits, labels):
labels = tf.cast(labels, tf.int64)
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
logits, labels, name='cross_entropy_per_example')
cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
tf.add_to_collection('losses', cross_entropy_mean)
# The total loss is defined as the cross entropy loss plus all of the weight
# decay terms (L2 loss).
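    # Illustrative example (hypothetical collection contents): if 'losses'
    # holds [conv1_weight_loss, fc_out_weight_loss, cross_entropy], then
    # total_loss = conv1_weight_loss + fc_out_weight_loss + cross_entropy.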
return tf.add_n(tf.get_collection('losses'), name='total_loss')
def get_incoming_shape(incoming):
""" Returns the incoming data shape """
if isinstance(incoming, tf.Tensor):
return incoming.get_shape().as_list()
    elif isinstance(incoming, (np.ndarray, list, tuple)):
return np.shape(incoming)
else:
raise Exception("Invalid incoming layer.")
def _inception(name, inp, inSize, o1s, o2s1, o2s2, o3s1, o3s2, o4s1, o4s2):
conv1 = _conv(name + '_1', inp, inSize, o1s, 1, 1, 1, 1, 'VALID')
conv3_ = _conv(name + '_3r', inp, inSize, o2s1, 1, 1, 1, 1, 'VALID')
conv3 = _conv(name + '_3', conv3_, o2s1, o2s2, 3, 3, 1, 1, 'SAME')
conv5_ = _conv(name + '_5r', inp, inSize, o3s1, 1, 1, 1, 1, 'VALID')
    conv5 = _conv(name + '_5', conv5_, o3s1, o3s2, 5, 5, 1, 1, 'SAME')
    pool_ = _mpool(name + '_pool', inp, o4s1, o4s1, 1, 1, 'SAME')
    pool = _conv(name + '_proj', pool_, inSize, o4s2, 1, 1, 1, 1, 'VALID')
if FLAGS.data_format == 'NCHW':
channel_dim = 1
else:
channel_dim = 3
incept = tf.concat(channel_dim, [conv1, conv3, conv5, pool])
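    # e.g. 'ince3a' below concatenates 64 + 128 + 32 + 32 = 256 channels,
    # which is the inSize of 256 passed to 'ince3b'.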
return incept
def inference(images):
# stage 1
conv1 = _conv('conv1', images, 3, 64, 7, 7, 2, 2, 'SAME')
pool1 = _mpool('pool1', conv1, 3, 3, 2, 2, 'SAME')
# stage 2
conv2 = _conv('conv2', pool1, 64, 64, 1, 1, 1, 1, 'VALID')
conv3 = _conv('conv3', conv2, 64, 192, 3, 3, 1, 1, 'SAME')
pool3 = _mpool('pool3', conv3, 3, 3, 2, 2, 'SAME')
# stage 3
incept3a = _inception('ince3a', pool3, 192, 64, 96, 128, 16, 32, 3, 32)
incept3b = _inception('ince3b', incept3a, 256, 128, 128, 192, 32, 96, 3, 64)
pool4 = _mpool('pool4', incept3b, 3, 3, 2, 2, 'SAME')
# stage 4
incept4a = _inception('ince4a', pool4, 480, 192, 96, 208, 16, 48, 3, 64)
incept4b = _inception('ince4b', incept4a, 512, 160, 112, 224, 24, 64, 3, 64)
incept4c = _inception('ince4c', incept4b, 512, 128, 128, 256, 24, 64, 3, 64)
incept4d = _inception('ince4d', incept4c, 512, 112, 144, 288, 32, 64, 3, 64)
incept4e = _inception('ince4e', incept4d, 528, 256, 160, 320, 32, 128, 3,
128)
pool5 = _mpool('pool5', incept4e, 3, 3, 2, 2, 'SAME')
# stage 5
incept5a = _inception('ince5a', pool5, 832, 256, 160, 320, 32, 128, 3, 128)
incept5b = _inception('ince5b', incept5a, 832, 384, 192, 384, 48, 128, 3,
128)
pool6 = _apool('pool6', incept5b, 7, 7, 1, 1, 'VALID')
# output 1
resh1 = tf.reshape(pool6, [-1, 1024])
drop = tf.nn.dropout(resh1, 0.4)
    affn1 = _affine('fc_out', drop, 1024, 1000, act=False)
return affn1
def tower_loss(scope):
"""Calculate the total loss on a single tower running the model.
Args:
scope: unique prefix string identifying the tower, e.g. 'tower_0'
Returns:
Tensor of shape [] containing the total loss for a batch of data
"""
image_size = 224
if FLAGS.data_format == 'NCHW':
image_shape = [FLAGS.batch_size, 3, image_size, image_size]
else:
image_shape = [FLAGS.batch_size, image_size, image_size, 3]
images = tf.get_variable(
'image',
image_shape,
initializer=tf.truncated_normal_initializer(
stddev=0.1, dtype=tf.float32),
dtype=tf.float32,
trainable=False)
labels = tf.get_variable(
'label', [FLAGS.batch_size],
initializer=tf.constant_initializer(1),
dtype=tf.int32,
trainable=False)
# Build a Graph that computes the logits predictions from the
# inference model.
last_layer = inference(images)
# Build the portion of the Graph calculating the losses. Note that we will
# assemble the total_loss using a custom function below.
_ = loss(last_layer, labels)
# Assemble all of the losses for the current tower only.
losses = tf.get_collection('losses', scope)
# Calculate the total loss for the current tower.
total_loss = tf.add_n(losses, name='total_loss')
# Compute the moving average of all individual losses and the total loss.
loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
loss_averages_op = loss_averages.apply(losses + [total_loss])
# Attach a scalar summary to all individual losses and the total loss; do the
# same for the averaged version of the losses.
for l in losses + [total_loss]:
# Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
# session. This helps the clarity of presentation on tensorboard.
loss_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', l.op.name)
# Name each loss as '(raw)' and name the moving average version of the loss
# as the original loss name.
tf.scalar_summary(loss_name + ' (raw)', l)
tf.scalar_summary(loss_name, loss_averages.average(l))
with tf.control_dependencies([loss_averages_op]):
total_loss = tf.identity(total_loss)
return total_loss
def average_gradients(tower_grads):
"""Calculate the average gradient for each shared variable across all towers.
Note that this function provides a synchronization point across all towers.
Args:
tower_grads: List of lists of (gradient, variable) tuples. The outer list
is over individual gradients. The inner list is over the gradient
calculation for each tower.
Returns:
List of pairs of (gradient, variable) where the gradient has been averaged
across all towers.
"""
average_grads = []
for grad_and_vars in zip(*tower_grads):
# Note that each grad_and_vars looks like the following:
# ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
grads = []
for g, _ in grad_and_vars:
# Add 0 dimension to the gradients to represent the tower.
expanded_g = tf.expand_dims(g, 0)
# Append on a 'tower' dimension which we will average over below.
grads.append(expanded_g)
# Average over the 'tower' dimension.
grad = tf.concat(0, grads)
grad = tf.reduce_mean(grad, 0)
# Keep in mind that the Variables are redundant because they are shared
# across towers. So .. we will just return the first tower's pointer to
# the Variable.
v = grad_and_vars[0][1]
grad_and_var = (grad, v)
average_grads.append(grad_and_var)
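    # Usage sketch (hypothetical tensors, two towers sharing one variable v):
    #   tower_grads = [[(g_gpu0, v)], [(g_gpu1, v)]]
    #   average_gradients(tower_grads) -> [((g_gpu0 + g_gpu1) / 2, v)]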
return average_grads
def time_tensorflow_run(session, target):
num_steps_burn_in = 50
total_duration = 0.0
total_duration_squared = 0.0
for i in xrange(FLAGS.num_batches + num_steps_burn_in):
start_time = time.time()
_, loss_value = session.run(target)
duration = time.time() - start_time
        if i >= num_steps_burn_in:
if not i % 10:
num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
examples_per_sec = num_examples_per_step / duration
sec_per_batch = duration
format_str = (
'%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
'sec/batch batch_size = %d)')
                print(format_str %
                      (datetime.now(), i - num_steps_burn_in, loss_value,
                       examples_per_sec, sec_per_batch, num_examples_per_step))
total_duration += duration
total_duration_squared += duration * duration
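    # Mean and variance over the timed steps via the shortcut
    # Var[d] = E[d^2] - (E[d])^2.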
mn = total_duration / FLAGS.num_batches
vr = total_duration_squared / FLAGS.num_batches - mn * mn
sd = math.sqrt(vr)
print('%s: FwdBwd across %d steps, %.3f +/- %.3f sec / batch' %
(datetime.now(), FLAGS.num_batches, mn, sd))
def run_benchmark():
with tf.Graph().as_default(), tf.device('/cpu:0'):
# Create a variable to count the number of train() calls. This equals the
# number of batches processed * FLAGS.num_gpus.
global_step = tf.get_variable(
'global_step', [],
initializer=tf.constant_initializer(0),
trainable=False)
# Calculate the learning rate schedule.
num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
FLAGS.batch_size)
decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
# Decay the learning rate exponentially based on the number of steps.
lr = tf.train.exponential_decay(
INITIAL_LEARNING_RATE,
global_step,
decay_steps,
LEARNING_RATE_DECAY_FACTOR,
staircase=True)
# Create an optimizer that performs gradient descent.
opt = tf.train.MomentumOptimizer(lr, 0.9)
# Calculate the gradients for each model tower.
tower_grads = []
for i in xrange(FLAGS.num_gpus):
with tf.device('/gpu:%d' % i):
with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
# Calculate the loss for one tower of the model. This function
# constructs the entire model but shares the variables across
# all towers.
loss = tower_loss(scope)
# Reuse variables for the next tower.
tf.get_variable_scope().reuse_variables()
# Retain the summaries from the final tower.
summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
# Calculate the gradients for the batch of data on this tower.
grads = opt.compute_gradients(loss)
# Keep track of the gradients across all towers.
tower_grads.append(grads)
# We must calculate the mean of each gradient. Note that this is the
# synchronization point across all towers.
grads = average_gradients(tower_grads)
# Apply the gradients to adjust the shared variables.
apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
        # Group all updates into a single train op.
train_op = tf.group(apply_gradient_op)
# Build an initialization operation.
init = tf.initialize_all_variables()
# Start running operations on the Graph. allow_soft_placement must be set to
# True to build towers on GPU, as some of the ops do not have GPU
# implementations.
sess = tf.Session(config=tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=FLAGS.log_device_placement))
sess.run(init)
time_tensorflow_run(sess, [train_op, loss])
def main(_):
run_benchmark()
if __name__ == '__main__':
tf.app.run()
#!/bin/bash
set -e
function test() {
cfg=$1
batch_size=$2
prefix=$3
python $cfg --batch_size=$batch_size > logs/${prefix}-1gpu-${batch_size}.log 2>&1
}
if [ ! -d "logs" ]; then
mkdir logs
fi
# alexnet
test alexnet.py 64 alexnet
test alexnet.py 128 alexnet
test alexnet.py 256 alexnet
test alexnet.py 512 alexnet
# googlenet
test googlenet.py 64 googlenet
test googlenet.py 128 googlenet
# smallnet
test smallnet_mnist_cifar.py 64 smallnet
test smallnet_mnist_cifar.py 128 smallnet
test smallnet_mnist_cifar.py 256 smallnet
test smallnet_mnist_cifar.py 512 smallnet
#!/bin/bash
set -e
function test() {
cfg=$1
num_gpu=$2
batch_size=$3
batch_per_gpu=`expr ${batch_size} / ${num_gpu}`
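  # e.g. batch_size=512 with num_gpu=4 gives batch_per_gpu=128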
prefix=$4
python $cfg --num_gpus=$num_gpu --batch_size=${batch_per_gpu} > logs/${prefix}-4gpu-${batch_size}.log 2>&1
}
if [ ! -d "logs" ]; then
mkdir logs
fi
# alexnet
test alexnet_multi_gpu.py 4 512 alexnet
test alexnet_multi_gpu.py 4 1024 alexnet
# googlenet
test googlenet_multi_gpu.py 4 512 googlenet
test googlenet_multi_gpu.py 4 1024 googlenet
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from six.moves import xrange # pylint: disable=redefined-builtin
from datetime import datetime
import math
import numpy as np
import time
import tensorflow.python.platform
import tensorflow as tf
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('batch_size', 128, """Batch size.""")
tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
tf.app.flags.DEFINE_boolean('forward_only', False,
"""Only run the forward pass.""")
tf.app.flags.DEFINE_boolean('forward_backward_only', False,
"""Only run the forward-forward pass.""")
tf.app.flags.DEFINE_string('data_format', 'NCHW',
"""The data format for Convnet operations.
Can be either NHWC or NCHW.
""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
"""Whether to log device placement.""")
parameters = []
conv_counter = 1
pool_counter = 1
affine_counter = 1
def _conv(inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.005, act=True):
global conv_counter
global parameters
name = 'conv' + str(conv_counter)
conv_counter += 1
with tf.name_scope(name) as scope:
kernel = tf.Variable(
tf.truncated_normal(
[kH, kW, nIn, nOut], dtype=tf.float32, stddev=1e-1),
name='weights')
if wd is not None:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
if FLAGS.data_format == 'NCHW':
strides = [1, 1, dH, dW]
else:
strides = [1, dH, dW, 1]
conv = tf.nn.conv2d(
inpOp,
kernel,
strides,
padding=padType,
data_format=FLAGS.data_format)
biases = tf.Variable(
tf.constant(
0.0, shape=[nOut], dtype=tf.float32),
trainable=True,
name='biases')
bias = tf.reshape(
tf.nn.bias_add(
conv, biases, data_format=FLAGS.data_format),
conv.get_shape())
conv1 = tf.nn.relu(bias, name=scope) if act else bias
parameters += [kernel, biases]
return conv1
def _affine(inpOp, nIn, nOut, wd=None, act=True):
global affine_counter
global parameters
name = 'affine' + str(affine_counter)
affine_counter += 1
with tf.name_scope(name) as scope:
kernel = tf.Variable(
tf.truncated_normal(
[nIn, nOut], dtype=tf.float32, stddev=1e-1),
name='weights')
if wd is not None:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
biases = tf.Variable(
tf.constant(
0.0, shape=[nOut], dtype=tf.float32),
trainable=True,
name='biases')
affine1 = tf.nn.relu_layer(
inpOp, kernel, biases,
name=name) if act else tf.matmul(inpOp, kernel) + biases
parameters += [kernel, biases]
return affine1
def _mpool(inpOp, kH, kW, dH, dW, padding):
global pool_counter
global parameters
name = 'pool' + str(pool_counter)
pool_counter += 1
if FLAGS.data_format == 'NCHW':
ksize = [1, 1, kH, kW]
strides = [1, 1, dH, dW]
else:
ksize = [1, kH, kW, 1]
strides = [1, dH, dW, 1]
return tf.nn.max_pool(
inpOp,
ksize=ksize,
strides=strides,
padding=padding,
data_format=FLAGS.data_format,
name=name)
def _apool(inpOp, kH, kW, dH, dW, padding):
global pool_counter
global parameters
name = 'pool' + str(pool_counter)
pool_counter += 1
if FLAGS.data_format == 'NCHW':
ksize = [1, 1, kH, kW]
strides = [1, 1, dH, dW]
else:
ksize = [1, kH, kW, 1]
strides = [1, dH, dW, 1]
return tf.nn.avg_pool(
inpOp,
ksize=ksize,
strides=strides,
padding=padding,
data_format=FLAGS.data_format,
name=name)
def _norm(name, l_input, lsize=4):
return tf.nn.lrn(l_input,
lsize,
bias=1.0,
alpha=0.001 / 9.0,
beta=0.75,
name=name)
def loss(logits, labels):
batch_size = tf.size(labels)
labels = tf.expand_dims(labels, 1)
indices = tf.expand_dims(tf.range(0, batch_size, 1), 1)
concated = tf.concat(1, [indices, labels])
onehot_labels = tf.sparse_to_dense(concated,
tf.pack([batch_size, 10]), 1.0, 0.0)
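    # e.g. with batch_size=2 and labels=[2, 0], 'concated' is [[0, 2], [1, 0]]
    # and 'onehot_labels' is [[0, 0, 1, 0, ...], [1, 0, 0, ...]] of shape [2, 10].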
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
logits, onehot_labels, name='xentropy')
loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')
return loss
def get_incoming_shape(incoming):
""" Returns the incoming data shape """
if isinstance(incoming, tf.Tensor):
return incoming.get_shape().as_list()
    elif isinstance(incoming, (np.ndarray, list, tuple)):
return np.shape(incoming)
else:
raise Exception("Invalid incoming layer.")
def inference(images):
conv1 = _conv(images, 3, 32, 5, 5, 1, 1, 'SAME')
pool1 = _mpool(conv1, 3, 3, 2, 2, 'SAME')
conv2 = _conv(pool1, 32, 32, 5, 5, 1, 1, 'SAME')
pool2 = _apool(conv2, 3, 3, 2, 2, 'SAME')
conv3 = _conv(pool2, 32, 64, 5, 5, 1, 1, 'SAME')
pool3 = _apool(conv3, 3, 3, 2, 2, 'SAME')
resh1 = tf.reshape(pool3, [-1, 64 * 4 * 4])
affn1 = _affine(resh1, 64 * 4 * 4, 64)
affn2 = _affine(affn1, 64, 10, act=False)
print('conv1:', get_incoming_shape(conv1))
print('pool1:', get_incoming_shape(pool1))
print('conv2:', get_incoming_shape(conv2))
print('pool2:', get_incoming_shape(pool2))
print('conv3:', get_incoming_shape(conv3))
print('pool3:', get_incoming_shape(pool3))
return affn2
def time_tensorflow_run(session, target, info_string):
num_steps_burn_in = 10
total_duration = 0.0
total_duration_squared = 0.0
if not isinstance(target, list):
target = [target]
target_op = tf.group(*target)
for i in xrange(FLAGS.num_batches + num_steps_burn_in):
start_time = time.time()
_ = session.run(target_op)
duration = time.time() - start_time
        if i >= num_steps_burn_in:
if not i % 10:
print('%s: step %d, duration = %.3f' %
(datetime.now(), i - num_steps_burn_in, duration))
total_duration += duration
total_duration_squared += duration * duration
mn = total_duration / FLAGS.num_batches
vr = total_duration_squared / FLAGS.num_batches - mn * mn
sd = math.sqrt(vr)
print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' %
(datetime.now(), info_string, FLAGS.num_batches, mn, sd))
def run_benchmark():
global parameters
with tf.Graph().as_default():
# Generate some dummy images.
image_size = 32
        # Note that our padding definition is slightly different from
        # cuda-convnet's.
# In order to force the model to start with the same activations sizes,
# we add 3 to the image_size and employ VALID padding above.
if FLAGS.data_format == 'NCHW':
image_shape = [FLAGS.batch_size, 3, image_size, image_size]
else:
image_shape = [FLAGS.batch_size, image_size, image_size, 3]
images = tf.get_variable(
'image',
image_shape,
initializer=tf.truncated_normal_initializer(
stddev=0.1, dtype=tf.float32),
dtype=tf.float32,
trainable=False)
labels = tf.get_variable(
'label', [FLAGS.batch_size],
initializer=tf.constant_initializer(1),
dtype=tf.int32,
trainable=False)
# Build a Graph that computes the logits predictions from the
# inference model.
last_layer = inference(images)
objective = loss(last_layer, labels)
# Compute gradients.
opt = tf.train.MomentumOptimizer(0.001, 0.9)
grads = opt.compute_gradients(objective)
global_step = tf.get_variable(
'global_step', [],
initializer=tf.constant_initializer(
0.0, dtype=tf.float32),
trainable=False,
dtype=tf.float32)
apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
# Track the moving averages of all trainable variables.
variable_averages = tf.train.ExponentialMovingAverage(0.9, global_step)
variables_averages_op = variable_averages.apply(tf.trainable_variables(
))
# Build an initialization operation.
init = tf.initialize_all_variables()
# Start running operations on the Graph.
sess = tf.Session(config=tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=FLAGS.log_device_placement))
sess.run(init)
run_forward = True
run_forward_backward = True
if FLAGS.forward_only and FLAGS.forward_backward_only:
raise ValueError("Cannot specify --forward_only and "
"--forward_backward_only at the same time.")
if FLAGS.forward_only:
run_forward_backward = False
elif FLAGS.forward_backward_only:
run_forward = False
if run_forward:
# Run the forward benchmark.
time_tensorflow_run(sess, last_layer, "Forward")
if run_forward_backward:
with tf.control_dependencies(
[apply_gradient_op, variables_averages_op]):
train_op = tf.no_op(name='train')
time_tensorflow_run(sess, [train_op, objective], "Forward-backward")
def main(_):
run_benchmark()
if __name__ == '__main__':
tf.app.run()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from tensorflow.python.framework import dtypes
from tensorflow.python.layers.core import Dense
from tensorflow.python.ops import check_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.framework import ops
from tensorflow.python.ops import rnn_cell_impl
from tensorflow.python.ops.rnn_cell_impl import RNNCell, BasicLSTMCell
from tensorflow.python.ops.rnn_cell_impl import LSTMStateTuple, _zero_state_tensors
from tensorflow.contrib.rnn.python.ops import core_rnn_cell
from tensorflow.python.ops import array_ops
from tensorflow.python.util import nest
import tensorflow.contrib.seq2seq as seq2seq
from tensorflow.contrib.seq2seq.python.ops import beam_search_decoder
import numpy as np
import os
import argparse
import logging
import time
import paddle
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--embedding_dim",
type=int,
default=512,
help="The dimension of embedding table. (default: %(default)d)")
parser.add_argument(
"--encoder_size",
type=int,
default=512,
help="The size of encoder bi-rnn unit. (default: %(default)d)")
parser.add_argument(
"--decoder_size",
type=int,
default=512,
help="The size of decoder rnn unit. (default: %(default)d)")
parser.add_argument(
"--batch_size",
type=int,
default=128,
help="The sequence number of a mini-batch data. (default: %(default)d)")
parser.add_argument(
"--dict_size",
type=int,
default=30000,
help="The dictionary capacity. Dictionaries of source sequence and "
"target dictionary have same capacity. (default: %(default)d)")
parser.add_argument(
"--max_time_steps",
type=int,
default=81,
help="Max number of time steps for sequence. (default: %(default)d)")
parser.add_argument(
"--pass_num",
type=int,
default=10,
help="The pass number to train. (default: %(default)d)")
parser.add_argument(
"--learning_rate",
type=float,
default=0.0002,
help="Learning rate used to train the model. (default: %(default)f)")
parser.add_argument(
"--infer_only", action='store_true', help="If set, run forward only.")
parser.add_argument(
"--beam_size",
type=int,
default=3,
help="The width for beam searching. (default: %(default)d)")
parser.add_argument(
"--max_generation_length",
type=int,
default=250,
help="The maximum length of sequence when doing generation. "
"(default: %(default)d)")
parser.add_argument(
"--save_freq",
type=int,
default=500,
help="Save model checkpoint every this interation. (default: %(default)d)")
parser.add_argument(
"--model_dir",
type=str,
default='./checkpoint',
help="Path to save model checkpoints. (default: %(default)d)")
_Linear = core_rnn_cell._Linear # pylint: disable=invalid-name
START_TOKEN_IDX = 0
END_TOKEN_IDX = 1
class LSTMCellWithSimpleAttention(RNNCell):
"""Add attention mechanism to BasicLSTMCell.
This class is a wrapper based on tensorflow's `BasicLSTMCell`.
"""
def __init__(self,
num_units,
encoder_vector,
encoder_proj,
source_sequence_length,
forget_bias=1.0,
state_is_tuple=True,
activation=None,
reuse=None):
super(LSTMCellWithSimpleAttention, self).__init__(_reuse=reuse)
if not state_is_tuple:
logging.warn("%s: Using a concatenated state is slower and will "
"soon be deprecated. Use state_is_tuple=True.", self)
self._num_units = num_units
# set padding part to 0
self._encoder_vector = self._reset_padding(encoder_vector,
source_sequence_length)
self._encoder_proj = self._reset_padding(encoder_proj,
source_sequence_length)
self._forget_bias = forget_bias
self._state_is_tuple = state_is_tuple
self._activation = activation or math_ops.tanh
self._linear = None
@property
def state_size(self):
return (LSTMStateTuple(self._num_units, self._num_units) \
if self._state_is_tuple else 2 * self._num_units)
@property
def output_size(self):
return self._num_units
def zero_state(self, batch_size, dtype):
state_size = self.state_size
if hasattr(self, "_last_zero_state"):
(last_state_size, last_batch_size, last_dtype,
last_output) = getattr(self, "_last_zero_state")
if (last_batch_size == batch_size and last_dtype == dtype and
last_state_size == state_size):
return last_output
with ops.name_scope(
type(self).__name__ + "ZeroState", values=[batch_size]):
output = _zero_state_tensors(state_size, batch_size, dtype)
self._last_zero_state = (state_size, batch_size, dtype, output)
return output
def call(self, inputs, state):
sigmoid = math_ops.sigmoid
# Parameters of gates are concatenated into one multiply for efficiency.
if self._state_is_tuple:
c, h = state
else:
c, h = array_ops.split(value=state, num_or_size_splits=2, axis=1)
# get context from encoder outputs
context = self._simple_attention(self._encoder_vector,
self._encoder_proj, h)
if self._linear is None:
self._linear = _Linear([inputs, context, h], 4 * self._num_units,
True)
# i = input_gate, j = new_input, f = forget_gate, o = output_gate
i, j, f, o = array_ops.split(
value=self._linear([inputs, context, h]),
num_or_size_splits=4,
axis=1)
new_c = (c * sigmoid(f + self._forget_bias) + sigmoid(i) *
self._activation(j))
new_h = self._activation(new_c) * sigmoid(o)
if self._state_is_tuple:
new_state = LSTMStateTuple(new_c, new_h)
else:
new_state = array_ops.concat([new_c, new_h], 1)
return new_h, new_state
def _simple_attention(self, encoder_vec, encoder_proj, decoder_state):
"""Implement the attention function.
        The implementation follows the same logic as the fluid decoder.
"""
decoder_state_proj = tf.contrib.layers.fully_connected(
inputs=decoder_state,
num_outputs=self._num_units,
activation_fn=None,
biases_initializer=None)
decoder_state_expand = tf.tile(
tf.expand_dims(
input=decoder_state_proj, axis=1),
[1, tf.shape(encoder_proj)[1], 1])
concated = tf.concat([decoder_state_expand, encoder_proj], axis=2)
        # project each concatenated (state, encoder_proj) pair to a scalar score
attention_weights = tf.contrib.layers.fully_connected(
inputs=tf.reshape(
concated, shape=[-1, self._num_units * 2]),
num_outputs=1,
activation_fn=tf.nn.tanh,
biases_initializer=None)
attention_weights_reshaped = tf.reshape(
attention_weights, shape=[tf.shape(encoder_vec)[0], -1, 1])
# normalize the attention weights using softmax
attention_weights_normed = tf.nn.softmax(
attention_weights_reshaped, dim=1)
scaled = tf.multiply(attention_weights_normed, encoder_vec)
context = tf.reduce_sum(scaled, axis=1)
return context
def _reset_padding(self,
memory,
memory_sequence_length,
check_inner_dims_defined=True):
"""Reset the padding part for encoder inputs.
        This function is adapted from TensorFlow's `_prepare_memory` function.
"""
memory = nest.map_structure(
lambda m: ops.convert_to_tensor(m, name="memory"), memory)
if memory_sequence_length is not None:
memory_sequence_length = ops.convert_to_tensor(
memory_sequence_length, name="memory_sequence_length")
if check_inner_dims_defined:
def _check_dims(m):
if not m.get_shape()[2:].is_fully_defined():
raise ValueError(
"Expected memory %s to have fully defined inner dims, "
"but saw shape: %s" % (m.name, m.get_shape()))
nest.map_structure(_check_dims, memory)
if memory_sequence_length is None:
seq_len_mask = None
else:
seq_len_mask = array_ops.sequence_mask(
memory_sequence_length,
maxlen=array_ops.shape(nest.flatten(memory)[0])[1],
dtype=nest.flatten(memory)[0].dtype)
seq_len_batch_size = (memory_sequence_length.shape[0].value or
array_ops.shape(memory_sequence_length)[0])
def _maybe_mask(m, seq_len_mask):
rank = m.get_shape().ndims
rank = rank if rank is not None else array_ops.rank(m)
extra_ones = array_ops.ones(rank - 2, dtype=dtypes.int32)
m_batch_size = m.shape[0].value or array_ops.shape(m)[0]
if memory_sequence_length is not None:
message = ("memory_sequence_length and memory tensor "
"batch sizes do not match.")
with ops.control_dependencies([
check_ops.assert_equal(
seq_len_batch_size, m_batch_size, message=message)
]):
seq_len_mask = array_ops.reshape(
seq_len_mask,
array_ops.concat(
(array_ops.shape(seq_len_mask), extra_ones), 0))
return m * seq_len_mask
else:
return m
return nest.map_structure(lambda m: _maybe_mask(m, seq_len_mask),
memory)
def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
target_dict_dim, is_generating, beam_size,
max_generation_length):
src_word_idx = tf.placeholder(tf.int32, shape=[None, None])
src_sequence_length = tf.placeholder(tf.int32, shape=[None, ])
src_embedding_weights = tf.get_variable("source_word_embeddings",
[source_dict_dim, embedding_dim])
src_embedding = tf.nn.embedding_lookup(src_embedding_weights, src_word_idx)
src_forward_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
src_reversed_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
# no peephole
encoder_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
cell_fw=src_forward_cell,
cell_bw=src_reversed_cell,
inputs=src_embedding,
sequence_length=src_sequence_length,
dtype=tf.float32)
# concat the forward outputs and backward outputs
encoded_vec = tf.concat(encoder_outputs, axis=2)
# project the encoder outputs to size of decoder lstm
    encoded_proj = tf.contrib.layers.fully_connected(
        inputs=tf.reshape(
            encoded_vec, shape=[-1, encoder_size * 2]),
        num_outputs=decoder_size,
        activation_fn=None,
        biases_initializer=None)
encoded_proj_reshape = tf.reshape(
encoded_proj, shape=[-1, tf.shape(encoded_vec)[1], decoder_size])
# get init state for decoder lstm's H
    backward_first = tf.slice(encoder_outputs[1], [0, 0, 0], [-1, 1, -1])
    decoder_boot = tf.contrib.layers.fully_connected(
        inputs=tf.reshape(
            backward_first, shape=[-1, encoder_size]),
num_outputs=decoder_size,
activation_fn=tf.nn.tanh,
biases_initializer=None)
# prepare the initial state for decoder lstm
cell_init = tf.zeros(tf.shape(decoder_boot), tf.float32)
initial_state = LSTMStateTuple(cell_init, decoder_boot)
# create decoder lstm cell
decoder_cell = LSTMCellWithSimpleAttention(
decoder_size,
encoded_vec
if not is_generating else seq2seq.tile_batch(encoded_vec, beam_size),
encoded_proj_reshape if not is_generating else
seq2seq.tile_batch(encoded_proj_reshape, beam_size),
src_sequence_length if not is_generating else
seq2seq.tile_batch(src_sequence_length, beam_size),
forget_bias=0.0)
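    # When generating, the encoder memories and source lengths are tiled
    # beam_size times with seq2seq.tile_batch so each beam hypothesis attends
    # to its own copy of the source; e.g. batch_size=2 with beam_size=3
    # yields 6 memory rows.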
output_layer = Dense(target_dict_dim, name='output_projection')
if not is_generating:
trg_word_idx = tf.placeholder(tf.int32, shape=[None, None])
trg_sequence_length = tf.placeholder(tf.int32, shape=[None, ])
trg_embedding_weights = tf.get_variable(
"target_word_embeddings", [target_dict_dim, embedding_dim])
trg_embedding = tf.nn.embedding_lookup(trg_embedding_weights,
trg_word_idx)
training_helper = seq2seq.TrainingHelper(
inputs=trg_embedding,
sequence_length=trg_sequence_length,
time_major=False,
name='training_helper')
training_decoder = seq2seq.BasicDecoder(
cell=decoder_cell,
helper=training_helper,
initial_state=initial_state,
output_layer=output_layer)
# get the max length of target sequence
max_decoder_length = tf.reduce_max(trg_sequence_length)
decoder_outputs_train, _, _ = seq2seq.dynamic_decode(
decoder=training_decoder,
output_time_major=False,
impute_finished=True,
maximum_iterations=max_decoder_length)
decoder_logits_train = tf.identity(decoder_outputs_train.rnn_output)
decoder_pred_train = tf.argmax(
decoder_logits_train, axis=-1, name='decoder_pred_train')
masks = tf.sequence_mask(
lengths=trg_sequence_length,
maxlen=max_decoder_length,
dtype=tf.float32,
name='masks')
# place holder of label sequence
lbl_word_idx = tf.placeholder(tf.int32, shape=[None, None])
# compute the loss
loss = seq2seq.sequence_loss(
logits=decoder_logits_train,
targets=lbl_word_idx,
weights=masks,
average_across_timesteps=True,
average_across_batch=True)
# return feeding list and loss operator
return {
'src_word_idx': src_word_idx,
'src_sequence_length': src_sequence_length,
'trg_word_idx': trg_word_idx,
'trg_sequence_length': trg_sequence_length,
'lbl_word_idx': lbl_word_idx
}, loss
else:
start_tokens = tf.ones([tf.shape(src_word_idx)[0], ],
tf.int32) * START_TOKEN_IDX
# share the same embedding weights with target word
trg_embedding_weights = tf.get_variable(
"target_word_embeddings", [target_dict_dim, embedding_dim])
inference_decoder = beam_search_decoder.BeamSearchDecoder(
cell=decoder_cell,
embedding=lambda tokens: tf.nn.embedding_lookup(trg_embedding_weights, tokens),
start_tokens=start_tokens,
end_token=END_TOKEN_IDX,
initial_state=tf.nn.rnn_cell.LSTMStateTuple(
tf.contrib.seq2seq.tile_batch(initial_state[0], beam_size),
tf.contrib.seq2seq.tile_batch(initial_state[1], beam_size)),
beam_width=beam_size,
output_layer=output_layer)
decoder_outputs_decode, _, _ = seq2seq.dynamic_decode(
decoder=inference_decoder,
output_time_major=False,
            # impute_finished=True,  # enabling this raises an error with BeamSearchDecoder
maximum_iterations=max_generation_length)
predicted_ids = decoder_outputs_decode.predicted_ids
return {
'src_word_idx': src_word_idx,
'src_sequence_length': src_sequence_length
}, predicted_ids
def print_arguments(args):
print('----------- Configuration Arguments -----------')
for arg, value in vars(args).iteritems():
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def padding_data(data, padding_size, value):
data = data + [value] * padding_size
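    # e.g. padding_data([3, 5], 4, END_TOKEN_IDX) -> [3, 5, 1, 1], while
    # padding_data([3, 5, 7, 9, 11], 4, END_TOKEN_IDX) truncates to [3, 5, 7, 9].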
return data[:padding_size]
def save(sess, path, var_list=None, global_step=None):
saver = tf.train.Saver(var_list)
save_path = saver.save(sess, save_path=path, global_step=global_step)
    print('Model saved at %s' % save_path)
def restore(sess, path, var_list=None):
# var_list = None returns the list of all saveable variables
saver = tf.train.Saver(var_list)
saver.restore(sess, save_path=path)
print('model restored from %s' % path)
def adapt_batch_data(data):
src_seq = map(lambda x: x[0], data)
trg_seq = map(lambda x: x[1], data)
lbl_seq = map(lambda x: x[2], data)
src_sequence_length = np.array(
[len(seq) for seq in src_seq]).astype('int32')
src_seq_maxlen = np.max(src_sequence_length)
trg_sequence_length = np.array(
[len(seq) for seq in trg_seq]).astype('int32')
trg_seq_maxlen = np.max(trg_sequence_length)
src_seq = np.array(
[padding_data(seq, src_seq_maxlen, END_TOKEN_IDX)
for seq in src_seq]).astype('int32')
trg_seq = np.array(
[padding_data(seq, trg_seq_maxlen, END_TOKEN_IDX)
for seq in trg_seq]).astype('int32')
lbl_seq = np.array(
[padding_data(seq, trg_seq_maxlen, END_TOKEN_IDX)
for seq in lbl_seq]).astype('int32')
return {
'src_word_idx': src_seq,
'src_sequence_length': src_sequence_length,
'trg_word_idx': trg_seq,
'trg_sequence_length': trg_sequence_length,
'lbl_word_idx': lbl_seq
}
def train():
feeding_dict, loss = seq_to_seq_net(
embedding_dim=args.embedding_dim,
encoder_size=args.encoder_size,
decoder_size=args.decoder_size,
source_dict_dim=args.dict_size,
target_dict_dim=args.dict_size,
is_generating=False,
beam_size=args.beam_size,
max_generation_length=args.max_generation_length)
global_step = tf.Variable(0, trainable=False, name='global_step')
trainable_params = tf.trainable_variables()
optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
gradients = tf.gradients(loss, trainable_params)
    # clip gradients by global norm to stabilize training
    clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0)
    updates = optimizer.apply_gradients(
        zip(clipped_gradients, trainable_params), global_step=global_step)
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(args.dict_size)
train_batch_generator = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
batch_size=args.batch_size)
test_batch_generator = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
batch_size=args.batch_size)
    def do_validation():
total_loss = 0.0
count = 0
for batch_id, data in enumerate(test_batch_generator()):
adapted_batch_data = adapt_batch_data(data)
outputs = sess.run([loss],
feed_dict={
item[1]: adapted_batch_data[item[0]]
for item in feeding_dict.items()
})
total_loss += outputs[0]
count += 1
return total_loss / count
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_l)
sess.run(init_g)
for pass_id in xrange(args.pass_num):
pass_start_time = time.time()
words_seen = 0
for batch_id, data in enumerate(train_batch_generator()):
adapted_batch_data = adapt_batch_data(data)
words_seen += np.sum(adapted_batch_data['src_sequence_length'])
words_seen += np.sum(adapted_batch_data['trg_sequence_length'])
outputs = sess.run([updates, loss],
feed_dict={
item[1]: adapted_batch_data[item[0]]
for item in feeding_dict.items()
})
print("pass_id=%d, batch_id=%d, train_loss: %f" %
(pass_id, batch_id, outputs[1]))
pass_end_time = time.time()
            test_loss = do_validation()
time_consumed = pass_end_time - pass_start_time
words_per_sec = words_seen / time_consumed
print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" %
(pass_id, test_loss, words_per_sec, time_consumed))
def infer():
feeding_dict, predicted_ids = seq_to_seq_net(
embedding_dim=args.embedding_dim,
encoder_size=args.encoder_size,
decoder_size=args.decoder_size,
source_dict_dim=args.dict_size,
target_dict_dim=args.dict_size,
is_generating=True,
beam_size=args.beam_size,
max_generation_length=args.max_generation_length)
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(args.dict_size)
test_batch_generator = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
batch_size=args.batch_size)
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
with tf.Session(config=config) as sess:
restore(sess, './checkpoint/tf_seq2seq-1500')
for batch_id, data in enumerate(test_batch_generator()):
src_seq = map(lambda x: x[0], data)
source_language_seq = [
src_dict[item] for seq in src_seq for item in seq
]
src_sequence_length = np.array(
[len(seq) for seq in src_seq]).astype('int32')
src_seq_maxlen = np.max(src_sequence_length)
src_seq = np.array([
padding_data(seq, src_seq_maxlen, END_TOKEN_IDX)
for seq in src_seq
]).astype('int32')
outputs = sess.run([predicted_ids],
feed_dict={
feeding_dict['src_word_idx']: src_seq,
feeding_dict['src_sequence_length']:
src_sequence_length
})
print("\nDecoder result comparison: ")
source_language_seq = ' '.join(source_language_seq).lstrip(
'<s>').rstrip('<e>').strip()
inference_seq = ''
print(" --> source: " + source_language_seq)
for item in outputs[0][0]:
if item[0] == END_TOKEN_IDX: break
inference_seq += ' ' + trg_dict.get(item[0], '<unk>')
print(" --> inference: " + inference_seq)
if __name__ == '__main__':
args = parser.parse_args()
print_arguments(args)
if args.infer_only:
infer()
else:
train()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import time
import numpy as np
import paddle
import tensorflow as tf
DTYPE = tf.float32
def parse_args():
parser = argparse.ArgumentParser("mnist model benchmark.")
parser.add_argument(
'--batch_size', type=int, default=128, help='The minibatch size.')
parser.add_argument(
'--iterations', type=int, default=35, help='The number of minibatches.')
parser.add_argument(
'--pass_num', type=int, default=5, help='The number of passes.')
parser.add_argument(
'--device',
type=str,
default='GPU',
choices=['CPU', 'GPU'],
help='The device type.')
args = parser.parse_args()
return args
def run_benchmark(args):
def weight_variable(dtype, shape):
initial = tf.truncated_normal(shape, stddev=0.1, dtype=dtype)
return tf.Variable(initial)
def bias_variable(dtype, shape):
initial = tf.constant(0.1, shape=shape, dtype=dtype)
return tf.Variable(initial)
device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0'
with tf.device(device):
images = tf.placeholder(DTYPE, shape=(None, 28, 28, 1))
labels = tf.placeholder(tf.int64, shape=(None, ))
# conv1, relu, pool1
conv1_weights = weight_variable(DTYPE, [5, 5, 1, 20])
conv1_bias = bias_variable(DTYPE, [20])
conv1 = tf.nn.conv2d(
images, conv1_weights, strides=[1, 1, 1, 1], padding="VALID")
relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_bias))
pool1 = tf.nn.max_pool(
relu1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
# conv2, relu, pool2
conv2_weights = weight_variable(DTYPE, [5, 5, 20, 50])
conv2_bias = bias_variable(DTYPE, [50])
conv2 = tf.nn.conv2d(
pool1, conv2_weights, strides=[1, 1, 1, 1], padding="VALID")
relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_bias))
pool2 = tf.nn.max_pool(
relu2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
# FC
pool_shape = pool2.get_shape().as_list()
hidden_dim = reduce(lambda a, b: a * b, pool_shape[1:], 1)
reshape = tf.reshape(pool2, shape=(tf.shape(pool2)[0], hidden_dim))
fc_weights = weight_variable(DTYPE, [hidden_dim, 10])
fc_bias = bias_variable(DTYPE, [10])
logits = tf.matmul(reshape, fc_weights) + fc_bias
# Get prediction
prediction = tf.nn.softmax(logits)
# Loss
one_hot_labels = tf.one_hot(labels, depth=10)
cost = -tf.reduce_sum(tf.log(prediction) * one_hot_labels, [1])
avg_cost = tf.reduce_mean(cost)
# Get accuracy
correct = tf.equal(tf.argmax(prediction, 1), labels)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
# metrics, g_accuracy
with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
g_accuracy = tf.metrics.accuracy(
labels, tf.argmax(
prediction, axis=1))
vars = tf.contrib.framework.get_variables(
scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
g_accuracy_reset_op = tf.variables_initializer(vars)
# Optimizer
opt = tf.train.AdamOptimizer(
learning_rate=0.001, beta1=0.9, beta2=0.999)
train_op = opt.minimize(avg_cost)
# train_op = tf.train.AdamOptimizer(1e-4).minimize(avg_cost)
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=args.batch_size)
def eval_test():
sess.run(g_accuracy_reset_op)
for batch_id, data in enumerate(test_reader()):
images_data = np.array(
map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32")
labels_data = np.array(map(lambda x: x[1], data)).astype("int64")
loss, acc, g_acc = sess.run(
[avg_cost, accuracy, g_accuracy],
feed_dict={images: images_data,
labels: labels_data})
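        # tf.metrics.accuracy returns (accuracy, update_op); g_acc[1] is the
        # updated running accuracy accumulated over all test batches since
        # the last g_accuracy_reset_op.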
return g_acc[1]
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_g)
sess.run(init_l)
for pass_id in range(args.pass_num):
sess.run(g_accuracy_reset_op)
pass_start = time.time()
for batch_id, data in enumerate(train_reader()):
images_data = np.array(
map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32")
labels_data = np.array(map(lambda x: x[1], data)).astype(
"int64")
start = time.time()
_, loss, acc, g_acc = sess.run(
[train_op, avg_cost, accuracy, g_accuracy],
feed_dict={images: images_data,
labels: labels_data})
end = time.time()
print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" %
(pass_id, batch_id, loss, 1 - acc, (end - start) / 1000))
pass_end = time.time()
test_avg_acc = eval_test()
print(
"pass=%d, training_avg_accuracy=%f, test_avg_acc=%f, elapse=%f"
                % (pass_id, g_acc[1], test_avg_acc,
                   pass_end - pass_start))
def print_arguments(args):
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == '__main__':
args = parse_args()
print_arguments(args)
run_benchmark(args)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
based on https://github.com/tensorflow/models/blob/master/official/resnet/resnet_model.py
Get help: python resnet.py --help
See performance on flowers: python resnet.py
Train on cifar10: python resnet.py --data=cifar10 --with_test
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import time
import numpy as np
import paddle
import tensorflow as tf
DTYPE = tf.float32
def parse_args():
parser = argparse.ArgumentParser('Convolution model benchmark.')
parser.add_argument(
'--model',
type=str,
choices=['resnet'],
default='resnet',
help='The model architecture.')
parser.add_argument(
'--batch_size', type=int, default=32, help='The minibatch size.')
parser.add_argument(
'--use_fake_data',
action='store_true',
help='use real data or fake data')
parser.add_argument(
'--skip_batch_num',
type=int,
default=5,
        help='The number of initial minibatches to skip, for more stable '
        'performance measurement.')
parser.add_argument(
'--iterations',
type=int,
default=105,
help='The number of minibatches.')
parser.add_argument(
'--pass_num', type=int, default=300, help='The number of passes.')
parser.add_argument(
'--order',
type=str,
default='NHWC',
choices=['NCHW', 'NHWC'],
        help='The data order. CPU mode supports only NHWC.')
parser.add_argument(
'--device',
type=str,
default='GPU',
choices=['CPU', 'GPU'],
help='The device type.')
parser.add_argument(
'--data',
type=str,
default='flowers102',
choices=['flowers102', 'cifar10'],
help='The kinds of data.')
parser.add_argument(
'--infer_only', action='store_true', help='If set, run forward only.')
parser.add_argument(
'--use_cprof', action='store_true', help='If set, use cProfile.')
parser.add_argument(
'--with_test',
action='store_true',
help='If set, test the testset during training.')
parser.add_argument(
'--use_nvprof',
action='store_true',
help='If set, use nvprof for CUDA.')
args = parser.parse_args()
return args
def print_arguments(args):
vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
vars(args)['device'] == 'GPU')
vars(args)['iterations'] = vars(args)['pass_num'] * 1000 if vars(args)[
'with_test'] else vars(args)['iterations']
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def fixed_padding(inputs, kernel_size, data_format):
"""Pads the input along the spatial dimensions independently of input size.
Args:
inputs: A tensor of size [batch, channels, height_in, width_in] or
[batch, height_in, width_in, channels] depending on data_format.
kernel_size: The kernel to be used in the conv2d or max_pool2d operation.
Should be a positive integer.
data_format: The input format ('channels_last' or 'channels_first').
Returns:
A tensor with the same format as the input with the data either intact
(if kernel_size == 1) or padded (if kernel_size > 1).
"""
pad_total = kernel_size - 1
pad_beg = pad_total // 2
pad_end = pad_total - pad_beg
if data_format == 'channels_first':
padded_inputs = tf.pad(inputs, [[0, 0], [0, 0], [pad_beg, pad_end],
[pad_beg, pad_end]])
else:
padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg, pad_end],
[pad_beg, pad_end], [0, 0]])
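    # e.g. kernel_size=3 pads 1 pixel on each side and kernel_size=7 pads 3,
    # matching what SAME padding would add at stride 1.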
return padded_inputs
def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format):
"""Strided 2-D convolution with explicit padding."""
# The padding is consistent and is based only on `kernel_size`, not on the
# dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).
# This is consistent with PaddlePaddle.
# In addition, the calculation for output size in TensorFlow can refer:
# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/common_shape_fns.cc
if strides > 1:
inputs = fixed_padding(inputs, kernel_size, data_format)
return tf.layers.conv2d(
inputs=inputs,
filters=filters,
kernel_size=kernel_size,
strides=strides,
padding=('SAME' if strides == 1 else 'VALID'),
use_bias=False,
kernel_initializer=tf.variance_scaling_initializer(),
data_format=data_format)
def conv_bn(inputs,
filters,
kernel_size,
strides,
is_training,
data_format,
act=True):
# def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format):
# set fused=True for a significant performance boost. See
# https://www.tensorflow.org/performance/performance_guide#common_fused_ops
inputs = conv2d_fixed_padding(
inputs=inputs,
filters=filters,
kernel_size=kernel_size,
strides=strides,
data_format=data_format)
inputs = tf.layers.batch_normalization(
inputs=inputs,
axis=1 if data_format == 'channels_first' else 3,
momentum=0.9,
epsilon=1e-05,
center=True,
scale=True,
training=is_training,
fused=True)
if act:
inputs = tf.nn.relu(inputs)
return inputs
def basicblock(inputs, filters, is_training, projection_shortcut, strides,
data_format):
shortcut = inputs
if projection_shortcut is not None:
shortcut = projection_shortcut(inputs)
inputs = conv_bn(inputs, filters, 3, strides, is_training, data_format)
inputs = conv_bn(inputs, filters, 3, 1, is_training, data_format, act=False)
inputs = inputs + shortcut
inputs = tf.nn.relu(inputs)
return inputs
def bottleneck(inputs, filters, is_training, projection_shortcut, strides,
data_format):
shortcut = inputs
if projection_shortcut is not None:
shortcut = projection_shortcut(inputs)
inputs = conv_bn(inputs, filters, 1, strides, is_training, data_format)
inputs = conv_bn(inputs, filters, 3, 1, is_training, data_format, act=False)
inputs = conv_bn(
inputs, filters * 4, 1, 1, is_training, data_format, act=False)
inputs = inputs + shortcut
inputs = tf.nn.relu(inputs)
return inputs
def block_layer(inputs, filters, block_fn, blocks, strides, is_training, name,
data_format):
# Bottleneck blocks end with 4x the number of filters as they start with
filters_out = 4 * filters if block_fn is bottleneck else filters
def projection_shortcut(inputs):
return conv2d_fixed_padding(
inputs=inputs,
filters=filters_out,
kernel_size=1,
strides=strides,
data_format=data_format)
# Only the first block per block_layer uses projection_shortcut and strides
inputs = block_fn(inputs, filters, is_training, projection_shortcut,
strides, data_format)
for _ in range(1, blocks):
inputs = block_fn(inputs, filters, is_training, None, 1, data_format)
return tf.identity(inputs, name)
def resnet_imagenet(depth, class_dim, data_format):
"""Returns the ResNet model for a given size and number of output classes."""
def resnet_generator(block_fn,
layers,
num_classes,
data_format='channels_last'):
if data_format is None:
data_format = ('channels_first'
if tf.test.is_built_with_cuda() else 'channels_last')
def model(inputs, is_training):
"""Constructs the ResNet model given the inputs."""
if data_format == 'channels_first':
# Convert the inputs from channels_last (NHWC) to channels_first (NCHW).
# This provides a large performance boost on GPU. See
# https://www.tensorflow.org/performance/performance_guide#data_formats
inputs = tf.transpose(inputs, [0, 3, 1, 2])
inputs = conv_bn(inputs, 64, 7, 2, is_training, data_format)
inputs = tf.identity(inputs, 'initial_conv')
inputs = tf.layers.max_pooling2d(
inputs=inputs,
pool_size=3,
strides=2,
padding='SAME',
data_format=data_format)
inputs = tf.identity(inputs, 'initial_max_pool')
inputs = block_layer(inputs, 64, block_fn, layers[0], 1,
is_training, 'block_layer1', data_format)
inputs = block_layer(inputs, 128, block_fn, layers[1], 2,
is_training, 'block_layer2', data_format)
inputs = block_layer(inputs, 256, block_fn, layers[2], 2,
is_training, 'block_layer3', data_format)
inputs = block_layer(inputs, 512, block_fn, layers[3], 2,
is_training, 'block_layer4', data_format)
inputs = tf.layers.average_pooling2d(
inputs=inputs,
pool_size=7,
strides=1,
padding='VALID',
data_format=data_format)
inputs = tf.identity(inputs, 'final_avg_pool')
inputs = tf.reshape(inputs,
[-1, 512 if block_fn is basicblock else 2048])
inputs = tf.layers.dense(inputs=inputs, units=num_classes)
inputs = tf.identity(inputs, 'final_dense')
return inputs
return model
model_params = {
18: {
'block': basicblock,
'layers': [2, 2, 2, 2]
},
34: {
'block': basicblock,
'layers': [3, 4, 6, 3]
},
50: {
'block': bottleneck,
'layers': [3, 4, 6, 3]
},
101: {
'block': bottleneck,
'layers': [3, 4, 23, 3]
},
152: {
'block': bottleneck,
'layers': [3, 8, 36, 3]
},
200: {
'block': bottleneck,
'layers': [3, 24, 36, 3]
}
}
if depth not in model_params:
raise ValueError('Not a valid depth:', depth)
params = model_params[depth]
return resnet_generator(params['block'], params['layers'], class_dim,
data_format)
def resnet_cifar10(depth, num_classes, data_format):
if depth % 6 != 2:
raise ValueError('depth must be 6n + 2:', depth)
num_blocks = (depth - 2) // 6
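    # e.g. depth=32 gives num_blocks=5: 3 block layers x 5 blocks x 2 convs,
    # plus the initial conv and the final dense layer, totals 32 layers.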
if data_format is None:
data_format = ('channels_first'
if tf.test.is_built_with_cuda() else 'channels_last')
def model(inputs, is_training):
inputs = conv_bn(inputs, 16, 3, 1, is_training, data_format)
inputs = tf.identity(inputs, 'initial_conv')
inputs = block_layer(inputs, 16, basicblock, num_blocks, 1, is_training,
'block_layer1', data_format)
inputs = block_layer(inputs, 32, basicblock, num_blocks, 2, is_training,
'block_layer2', data_format)
inputs = block_layer(inputs, 64, basicblock, num_blocks, 2, is_training,
'block_layer3', data_format)
inputs = tf.layers.average_pooling2d(
inputs=inputs,
pool_size=8,
strides=1,
padding='VALID',
data_format=data_format)
inputs = tf.identity(inputs, 'final_avg_pool')
inputs = tf.reshape(inputs, [-1, 64])
inputs = tf.layers.dense(inputs=inputs, units=num_classes)
inputs = tf.identity(inputs, 'final_dense')
return inputs
return model
def run_benchmark(args, data_format='channels_last', device='/cpu:0'):
"""Our model_fn for ResNet to be used with our Estimator."""
class_dim = 1000
dshape = (None, 224, 224, 3)
pdshape = (3, 224, 224)
if args.data == 'flowers102':
class_dim = 102
dshape = (None, 224, 224, 3)
pdshape = (3, 224, 224)
elif args.data == 'cifar10':
class_dim = 10
dshape = (None, 32, 32, 3)
pdshape = (3, 32, 32)
with tf.device(device):
images = tf.placeholder(DTYPE, shape=dshape)
labels = tf.placeholder(tf.int64, shape=(None, ))
is_training = tf.placeholder('bool')
onehot_labels = tf.one_hot(labels, depth=class_dim)
network = resnet_cifar10(
32, class_dim,
data_format) if args.data == 'cifar10' else resnet_imagenet(
50, class_dim, data_format)
logits = network(inputs=images, is_training=is_training)
cross_entropy = tf.losses.softmax_cross_entropy(
logits=logits, onehot_labels=onehot_labels)
avg_cost = tf.reduce_mean(cross_entropy)
correct = tf.equal(tf.argmax(logits, 1), labels)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
lr = 0.1 if args.data == 'cifar10' else 0.01
optimizer = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9)
# Batch norm requires update_ops to be added as a train_op dependency.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
train_op = optimizer.minimize(avg_cost)
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10()
if args.data == 'cifar10' else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.dataset.cifar.test10()
if args.data == 'cifar10' else paddle.dataset.flowers.test(),
batch_size=100)
def test():
test_accs = []
for batch_id, data in enumerate(test_reader()):
test_images = np.array(
map(lambda x: np.transpose(x[0].reshape(pdshape),
axes=[1, 2, 0]), data)).astype("float32")
test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
test_accs.append(
accuracy.eval(feed_dict={
images: test_images,
labels: test_labels,
is_training: False
}))
print("Pass = %d, Train performance = %f imgs/s, Test accuracy = %f\n" %
(pass_id, num_samples / train_elapsed, np.mean(test_accs)))
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_g)
sess.run(init_l)
if args.use_fake_data:
data = train_reader().next()
images_data = np.array(
map(lambda x: np.transpose(x[0].reshape(pdshape),
axes=[1, 2, 0]), data)).astype("float32")
labels_data = np.array(map(lambda x: x[1], data)).astype('int64')
iters, num_samples, start_time = 0, 0, 0.0
for pass_id in range(args.pass_num):
if iters == args.iterations:
break
train_accs = []
train_losses = []
for batch_id, data in enumerate(train_reader()):
if iters == args.skip_batch_num:
start_time = time.time()
num_samples = 0
if iters == args.iterations:
break
if not args.use_fake_data:
images_data = np.array(
map(lambda x: np.transpose(x[0].reshape(pdshape),
axes=[1, 2, 0]), data)).astype("float32")
labels_data = np.array(map(lambda x: x[1], data)).astype(
'int64')
_, loss, acc = sess.run([train_op, avg_cost, accuracy],
feed_dict={
images: images_data,
labels: labels_data,
is_training: True
})
iters += 1
train_accs.append(acc)
train_losses.append(loss)
num_samples += len(data)
print("Pass=%d, Iter=%d, Loss=%f, Accuray=%f\n" %
(pass_id, iters, loss, acc))
train_elapsed = time.time() - start_time
print("Pass=%d, Loss=%f, Accuray=%f\n" %
(pass_id, np.mean(train_losses), np.mean(train_accs)))
# evaluation
if args.with_test:
test()
if not args.with_test:
duration = time.time() - start_time
examples_per_sec = num_samples / duration
sec_per_batch = duration / (iters - args.skip_batch_num)
print('Total examples: %d, total time: %.5f' %
(num_samples, duration))
print('%.5f examples/sec, %.5f sec/batch' %
(examples_per_sec, sec_per_batch))
if __name__ == '__main__':
args = parse_args()
print_arguments(args)
if tf.test.is_built_with_cuda():
device = '/device:GPU:0'
if args.order == 'NHWC':
data_format = 'channels_last'
else:
data_format = 'channels_first'
else:
device = '/cpu:0'
if args.order == 'NHWC':
data_format = 'channels_last'
else:
            raise ValueError('Only NHWC order is supported in CPU mode')
run_benchmark(args, data_format, device)
You should also install tflearn:
```bash
pip install -r requirements.txt
```
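A quick sanity check that the dependencies import cleanly (a minimal sketch, assuming a Python 2 environment with TensorFlow and tflearn on the path):
```bash
python -c "import tensorflow; import tflearn; print(tflearn.__version__)"
```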
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path
import io
import numpy as np
import tensorflow as tf
# tflearn
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb
FLAGS = tf.app.flags.FLAGS
class DataSet(object):
def __init__(self, data, labels):
assert data.shape[0] == labels.shape[0], (
'data.shape: %s labels.shape: %s' % (data.shape, labels.shape))
self._num_examples = data.shape[0]
self._data = data
self._labels = labels
self._epochs_completed = 0
self._index_in_epoch = 0
@property
def data(self):
return self._data
@property
def labels(self):
return self._labels
@property
def num_examples(self):
return self._num_examples
@property
def epochs_completed(self):
return self._epochs_completed
def next_batch(self, batch_size):
assert batch_size <= self._num_examples
start = self._index_in_epoch
self._index_in_epoch += batch_size
if self._index_in_epoch > self._num_examples:
# Finished epoch
self._epochs_completed += 1
# Shuffle the data
perm = np.arange(self._num_examples)
np.random.shuffle(perm)
self._data = self._data[perm]
self._labels = self._labels[perm]
# Start next epoch
start = 0
self._index_in_epoch = batch_size
end = self._index_in_epoch
return self._data[start:end], self._labels[start:end]
def create_datasets(file_path, vocab_size=30000, val_fraction=0.0):
# IMDB Dataset loading
train, test, _ = imdb.load_data(
path=file_path,
n_words=vocab_size,
valid_portion=val_fraction,
sort_by_len=False)
trainX, trainY = train
testX, testY = test
# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=FLAGS.max_len, value=0.)
testX = pad_sequences(testX, maxlen=FLAGS.max_len, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)
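    # Note: the test split is padded above but not returned; this reader
    # only exposes the training set.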
train_dataset = DataSet(trainX, trainY)
return train_dataset
def main():
create_datasets('imdb.pkl')
if __name__ == "__main__":
main()
#!/usr/bin/env python
from six.moves import xrange # pylint: disable=redefined-builtin
import math
import time
import numpy as np
from datetime import datetime
import reader
import tensorflow as tf
from tensorflow.python.ops import rnn
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('batch_size', 128, """Batch size.""")
tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
tf.app.flags.DEFINE_integer('num_layers', 1, """Number of LSTM layers to stack.""")
tf.app.flags.DEFINE_integer('max_len', 100, """Maximum sequence length.""")
tf.app.flags.DEFINE_boolean('forward_only', False,
"""Only run the forward pass.""")
tf.app.flags.DEFINE_boolean('forward_backward_only', False,
"""Only run the forward-forward pass.""")
tf.app.flags.DEFINE_integer('hidden_size', 128, """Hidden size of the LSTM unit.""")
tf.app.flags.DEFINE_integer('emb_size', 128, """Dimension of the embedding table.""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
"""Whether to log device placement.""")
VOCAB_SIZE = 30000
NUM_CLASS = 2
def get_feed_dict(x_data, y_data=None):
feed_dict = {}
if y_data is not None:
feed_dict[y_input] = y_data
for i in xrange(x_data.shape[0]):
feed_dict[x_input[i]] = x_data[i, :, :]
return feed_dict
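# Note: x_input and y_input are assumed to be module-level placeholders;
# time_tensorflow_run below builds its feed_dict directly, so this helper
# is effectively unused in this script.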
def get_incoming_shape(incoming):
""" Returns the incoming data shape """
if isinstance(incoming, tf.Tensor):
return incoming.get_shape().as_list()
    elif isinstance(incoming, (np.ndarray, list, tuple)):
return np.shape(incoming)
else:
raise Exception("Invalid incoming layer.")
# Note input * W is done in LSTMCell,
# which is different from PaddlePaddle
def single_lstm(name,
incoming,
n_units,
use_peepholes=True,
return_seq=False,
return_state=False):
with tf.name_scope(name) as scope:
cell = tf.nn.rnn_cell.LSTMCell(n_units, use_peepholes=use_peepholes)
output, _cell_state = rnn.rnn(cell, incoming, dtype=tf.float32)
out = output if return_seq else output[-1]
return (out, _cell_state) if return_state else out
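# Note: rnn.rnn is the legacy static-unroll API; `incoming` must be a Python
# list of [batch_size, input_size] tensors, one per time step, which is why
# lstm() below splits a [batch, max_len, emb_size] tensor along the time axis.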
def lstm(name,
incoming,
n_units,
use_peepholes=True,
return_seq=False,
return_state=False,
num_layers=1):
with tf.name_scope(name) as scope:
lstm_cell = tf.nn.rnn_cell.LSTMCell(
n_units, use_peepholes=use_peepholes)
cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * num_layers)
initial_state = cell.zero_state(FLAGS.batch_size, dtype=tf.float32)
if not isinstance(incoming, list):
            # if the input is an embedding, the tensor shape is [None, time_step, emb_size]
incoming = [
tf.squeeze(input_, [1])
for input_ in tf.split(1, FLAGS.max_len, incoming)
]
outputs, state = tf.nn.rnn(cell,
incoming,
initial_state=initial_state,
dtype=tf.float32)
out = outputs if return_seq else outputs[-1]
        return (out, state) if return_state else out
def embedding(name, incoming, vocab_size, emb_size):
with tf.name_scope(name) as scope:
#with tf.device("/cpu:0"):
embedding = tf.get_variable(
name + '_emb', [vocab_size, emb_size], dtype=tf.float32)
out = tf.nn.embedding_lookup(embedding, incoming)
return out
def fc(name, inpOp, nIn, nOut, act=True):
with tf.name_scope(name) as scope:
kernel = tf.get_variable(
name + '_w', [nIn, nOut],
initializer=tf.truncated_normal_initializer(
stddev=0.01, dtype=tf.float32),
dtype=tf.float32)
biases = tf.get_variable(
name + '_b', [nOut],
initializer=tf.constant_initializer(
value=0.0, dtype=tf.float32),
dtype=tf.float32,
trainable=True)
net = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \
tf.matmul(inpOp, kernel) + biases
return net
def inference(seq):
net = embedding('emb', seq, VOCAB_SIZE, FLAGS.emb_size)
print "emb:", get_incoming_shape(net)
net = lstm('lstm', net, FLAGS.hidden_size, num_layers=FLAGS.num_layers)
print "lstm:", get_incoming_shape(net)
net = fc('fc1', net, FLAGS.hidden_size, 2)
return net
def loss(logits, labels):
# one label index for one sample
labels = tf.cast(labels, tf.float32)
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
logits, labels, name='cross_entropy_per_example')
cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
tf.add_to_collection('losses', cross_entropy_mean)
return tf.add_n(tf.get_collection('losses'), name='total_loss')
def time_tensorflow_run(session, target, x_input, y_input, info_string):
num_steps_burn_in = 50
total_duration = 0.0
total_duration_squared = 0.0
if not isinstance(target, list):
target = [target]
target_op = tf.group(*target)
train_dataset = reader.create_datasets("imdb.pkl", VOCAB_SIZE)
for i in xrange(FLAGS.num_batches + num_steps_burn_in):
start_time = time.time()
data, label = train_dataset.next_batch(FLAGS.batch_size)
_ = session.run(target_op, feed_dict={x_input: data, y_input: label})
duration = time.time() - start_time
        if i >= num_steps_burn_in:
if not i % 10:
print('%s: step %d, duration = %.3f' %
(datetime.now(), i - num_steps_burn_in, duration))
total_duration += duration
total_duration_squared += duration * duration
mn = total_duration / FLAGS.num_batches
vr = total_duration_squared / FLAGS.num_batches - mn * mn
sd = math.sqrt(vr)
print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' %
(datetime.now(), info_string, FLAGS.num_batches, mn, sd))
def run_benchmark():
with tf.Graph().as_default():
global_step = 0
with tf.device('/cpu:0'):
global_step = tf.Variable(0, trainable=False)
with tf.device('/gpu:0'):
#x_input = tf.placeholder(tf.int32, [None, FLAGS.max_len], name="x_input")
#y_input = tf.placeholder(tf.int32, [None, NUM_CLASS], name="y_input")
x_input = tf.placeholder(
tf.int32, [FLAGS.batch_size, FLAGS.max_len], name="x_input")
y_input = tf.placeholder(
tf.int32, [FLAGS.batch_size, NUM_CLASS], name="y_input")
            # Generate some dummy sequence.
last_layer = inference(x_input)
objective = loss(last_layer, y_input)
opt = tf.train.AdamOptimizer(0.001)
grads = opt.compute_gradients(objective)
apply_gradient_op = opt.apply_gradients(
grads, global_step=global_step)
init = tf.initialize_all_variables()
sess = tf.Session(config=tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=FLAGS.log_device_placement))
sess.run(init)
run_forward = True
run_forward_backward = True
if FLAGS.forward_only and FLAGS.forward_backward_only:
raise ValueError("Cannot specify --forward_only and "
"--forward_backward_only at the same time.")
if FLAGS.forward_only:
run_forward_backward = False
elif FLAGS.forward_backward_only:
run_forward = False
if run_forward:
time_tensorflow_run(sess, last_layer, x_input, y_input,
"Forward")
if run_forward_backward:
with tf.control_dependencies([apply_gradient_op]):
train_op = tf.no_op(name='train')
time_tensorflow_run(sess, [train_op, objective], x_input,
y_input, "Forward-backward")
def main(_):
run_benchmark()
if __name__ == '__main__':
tf.app.run()
#!/usr/bin/env python
from six.moves import xrange # pylint: disable=redefined-builtin
import re
import math
import time
import numpy as np
from datetime import datetime
import reader
import tensorflow as tf
from tensorflow.python.ops import rnn
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('batch_size', 64, """Batch size.""")
tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
tf.app.flags.DEFINE_integer('num_layers', 1, """Number of LSTM layers to stack.""")
tf.app.flags.DEFINE_integer('max_len', 100, """Maximum sequence length.""")
tf.app.flags.DEFINE_integer('hidden_size', 128, """Hidden size of the LSTM unit.""")
tf.app.flags.DEFINE_integer('emb_size', 64, """Dimension of the embedding table.""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
"""Whether to log device placement.""")
tf.app.flags.DEFINE_integer('num_gpus', 4, """How many GPUs to use.""")
VOCAB_SIZE = 30000
NUM_CLASS = 2
NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
NUM_EPOCHS_PER_DECAY = 50
INITIAL_LEARNING_RATE = 0.1
LEARNING_RATE_DECAY_FACTOR = 0.1
TOWER_NAME = 'tower'
train_dataset = reader.create_datasets("imdb.pkl", VOCAB_SIZE)
def get_incoming_shape(incoming):
""" Returns the incoming data shape """
if isinstance(incoming, tf.Tensor):
return incoming.get_shape().as_list()
    elif isinstance(incoming, (np.ndarray, list, tuple)):
return np.shape(incoming)
else:
raise Exception("Invalid incoming layer.")
# Note input * W is done in LSTMCell,
# which is different from PaddlePaddle
def single_lstm(name,
incoming,
n_units,
use_peepholes=True,
return_seq=False,
return_state=False):
with tf.name_scope(name) as scope:
cell = tf.nn.rnn_cell.LSTMCell(n_units, use_peepholes=use_peepholes)
output, _cell_state = rnn.rnn(cell, incoming, dtype=tf.float32)
out = output if return_seq else output[-1]
return (out, _cell_state) if return_state else out
def lstm(name,
incoming,
n_units,
use_peepholes=True,
return_seq=False,
return_state=False,
num_layers=1):
with tf.name_scope(name) as scope:
lstm_cell = tf.nn.rnn_cell.LSTMCell(
n_units, use_peepholes=use_peepholes)
cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * num_layers)
initial_state = cell.zero_state(FLAGS.batch_size, dtype=tf.float32)
if not isinstance(incoming, list):
            # if the input is an embedding, the tensor shape is [None, time_step, emb_size]
incoming = [
tf.squeeze(input_, [1])
for input_ in tf.split(1, FLAGS.max_len, incoming)
]
outputs, state = tf.nn.rnn(cell,
incoming,
initial_state=initial_state,
dtype=tf.float32)
out = outputs if return_seq else outputs[-1]
        return (out, state) if return_state else out
def embedding(name, incoming, vocab_size, emb_size):
with tf.name_scope(name) as scope:
#with tf.device("/cpu:0"):
embedding = tf.get_variable(
name + '_emb', [vocab_size, emb_size], dtype=tf.float32)
out = tf.nn.embedding_lookup(embedding, incoming)
return out
def fc(name, inpOp, nIn, nOut, act=True):
with tf.name_scope(name) as scope:
kernel = tf.get_variable(
name + '_w', [nIn, nOut],
initializer=tf.truncated_normal_initializer(
stddev=0.01, dtype=tf.float32),
dtype=tf.float32)
biases = tf.get_variable(
name + '_b', [nOut],
initializer=tf.constant_initializer(
value=0.0, dtype=tf.float32),
dtype=tf.float32,
trainable=True)
net = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \
tf.matmul(inpOp, kernel) + biases
return net
def inference(seq):
net = embedding('emb', seq, VOCAB_SIZE, FLAGS.emb_size)
print "emb:", get_incoming_shape(net)
net = lstm('lstm', net, FLAGS.hidden_size, num_layers=FLAGS.num_layers)
print "lstm:", get_incoming_shape(net)
net = fc('fc1', net, FLAGS.hidden_size, 2)
return net
def loss(logits, labels):
# one label index for one sample
#labels = tf.cast(labels, tf.int64)
# cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
# logits, labels, name='cross_entropy_per_example')
labels = tf.cast(labels, tf.float32)
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
logits, labels, name='cross_entropy_per_example')
cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
tf.add_to_collection('losses', cross_entropy_mean)
return tf.add_n(tf.get_collection('losses'), name='total_loss')
def tower_loss(scope):
"""Calculate the total loss on a single tower running the model.
Args:
scope: unique prefix string identifying the tower, e.g. 'tower_0'
Returns:
Tensor of shape [] containing the total loss for a batch of data
"""
data, label = train_dataset.next_batch(FLAGS.batch_size)
# Build a Graph that computes the logits predictions from the
# inference model.
last_layer = inference(data)
# Build the portion of the Graph calculating the losses. Note that we will
# assemble the total_loss using a custom function below.
    _ = loss(last_layer, label)
# Assemble all of the losses for the current tower only.
losses = tf.get_collection('losses', scope)
# Calculate the total loss for the current tower.
total_loss = tf.add_n(losses, name='total_loss')
# Compute the moving average of all individual losses and the total loss.
loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
loss_averages_op = loss_averages.apply(losses + [total_loss])
# Attach a scalar summary to all individual losses and the total loss; do the
# same for the averaged version of the losses.
for l in losses + [total_loss]:
# Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
# session. This helps the clarity of presentation on tensorboard.
loss_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', l.op.name)
# Name each loss as '(raw)' and name the moving average version of the loss
# as the original loss name.
tf.scalar_summary(loss_name + ' (raw)', l)
#tf.scalar_summary(loss_name, loss_averages.average(l))
with tf.control_dependencies([loss_averages_op]):
total_loss = tf.identity(total_loss)
return total_loss
def average_gradients(tower_grads):
"""Calculate the average gradient for each shared variable across all towers.
Note that this function provides a synchronization point across all towers.
Args:
tower_grads: List of lists of (gradient, variable) tuples. The outer list
is over individual gradients. The inner list is over the gradient
calculation for each tower.
Returns:
List of pairs of (gradient, variable) where the gradient has been averaged
across all towers.
"""
average_grads = []
for grad_and_vars in zip(*tower_grads):
# Note that each grad_and_vars looks like the following:
# ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
grads = []
for g, _ in grad_and_vars:
# Add 0 dimension to the gradients to represent the tower.
expanded_g = tf.expand_dims(g, 0)
# Append on a 'tower' dimension which we will average over below.
grads.append(expanded_g)
# Average over the 'tower' dimension.
grad = tf.concat(0, grads)
grad = tf.reduce_mean(grad, 0)
# Keep in mind that the Variables are redundant because they are shared
# across towers. So .. we will just return the first tower's pointer to
# the Variable.
v = grad_and_vars[0][1]
grad_and_var = (grad, v)
average_grads.append(grad_and_var)
return average_grads
def time_tensorflow_run(session, target):
num_steps_burn_in = 80
total_duration = 0.0
total_duration_squared = 0.0
for i in xrange(FLAGS.num_batches + num_steps_burn_in):
start_time = time.time()
        # Data is fed inside tower_loss(), so no feed_dict is needed here.
        _, loss_value = session.run(target)
duration = time.time() - start_time
        if i >= num_steps_burn_in:
if not i % 10:
num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
examples_per_sec = num_examples_per_step / duration
# sec_per_batch = duration / FLAGS.num_gpus
sec_per_batch = duration
format_str = (
'%s: step %d, loss= %.2f (%.1f examples/sec; %.3f '
'sec/batch batch_size= %d)')
                print(format_str %
                      (datetime.now(), i - num_steps_burn_in, loss_value,
                       examples_per_sec, sec_per_batch, num_examples_per_step))
total_duration += duration
total_duration_squared += duration * duration
mn = total_duration / FLAGS.num_batches
vr = total_duration_squared / FLAGS.num_batches - mn * mn
sd = math.sqrt(vr)
print('%s: FwdBwd across %d steps, %.3f +/- %.3f sec / batch' %
(datetime.now(), FLAGS.num_batches, mn, sd))
def run_benchmark():
with tf.Graph().as_default(), tf.device('/cpu:0'):
# Create a variable to count the number of train() calls. This equals the
# number of batches processed * FLAGS.num_gpus.
global_step = tf.get_variable(
'global_step', [],
initializer=tf.constant_initializer(0),
trainable=False)
# Calculate the learning rate schedule.
num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
FLAGS.batch_size)
decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
# Create an optimizer that performs gradient descent.
opt = tf.train.AdamOptimizer(0.001)
#train_dataset = reader.create_datasets("imdb.pkl", VOCAB_SIZE)
# Calculate the gradients for each model tower.
tower_grads = []
for i in xrange(FLAGS.num_gpus):
with tf.device('/gpu:%d' % i):
with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
# Calculate the loss for one tower of the model. This function
# constructs the entire model but shares the variables across
# all towers.
loss = tower_loss(scope)
# Reuse variables for the next tower.
tf.get_variable_scope().reuse_variables()
# Retain the summaries from the final tower.
# summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
# Calculate the gradients for the batch of data on this tower.
grads = opt.compute_gradients(loss)
# Keep track of the gradients across all towers.
tower_grads.append(grads)
# We must calculate the mean of each gradient. Note that this is the
# synchronization point across all towers.
grads = average_gradients(tower_grads)
# Apply the gradients to adjust the shared variables.
apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
        # Group all updates into a single train op.
train_op = tf.group(apply_gradient_op)
# Build an initialization operation.
init = tf.initialize_all_variables()
# Start running operations on the Graph. allow_soft_placement must be set to
# True to build towers on GPU, as some of the ops do not have GPU
# implementations.
sess = tf.Session(config=tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=FLAGS.log_device_placement))
sess.run(init)
time_tensorflow_run(sess, [train_op, loss])
def main(_):
run_benchmark()
if __name__ == '__main__':
tf.app.run()
#!/bin/bash
set -e
function test() {
lstm_num=$1
batch_size=$2
hid_size=$3
prefix=$4
python rnn.py --num_layers=${lstm_num} --batch_size=$batch_size \
--hidden_size=${hid_size} \
--forward_backward_only=1 \
> logs/1gpu-${lstm_num}lstm-batch${batch_size}-hid${hid_size}.log 2>&1
}
if [ ! -d "logs" ]; then
mkdir logs
fi
#--lstm_num--batch_size--hidden_size--#
test 2 64 256
test 2 64 512
test 2 64 1280
test 2 128 256
test 2 128 512
test 2 128 1280
test 2 256 256
test 2 256 512
test 2 256 1280
#!/bin/bash
set -e
function test() {
num_gpu=$1
lstm_num=$2
hid_size=$3
    batch_size=$4
    batch_per_gpu=`expr ${batch_size} / ${num_gpu}`
python rnn_multi_gpu.py --num_layers=${lstm_num} --batch_size=$batch_per_gpu \
--num_gpus=${num_gpu} \
--hidden_size=${hid_size} \
--forward_backward_only=1 \
> logs/${num_gpu}gpu-${lstm_num}lstm-hid${hid_size}-batch${batch_size}.log 2>&1
}
if [ ! -d "logs" ]; then
mkdir logs
fi
#--num_gpus--lstm_num--hidden_size--batch_size--#
test 4 2 256 128
test 4 2 256 256
test 4 2 256 512
test 4 2 512 128
test 4 2 512 256
test 4 2 512 512
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import argparse
import time
import tensorflow as tf
import paddle  # provides the paddle.batch / paddle.reader / paddle.dataset readers used below
def parse_args():
parser = argparse.ArgumentParser("LSTM model benchmark.")
parser.add_argument(
'--batch_size',
type=int,
default=32,
        help='Number of sequences in a batch of data. (default: %(default)d)')
parser.add_argument(
'--stacked_num',
type=int,
default=5,
help='Number of lstm layers to stack. (default: %(default)d)')
parser.add_argument(
'--embedding_dim',
type=int,
default=512,
help='Dimension of embedding table. (default: %(default)d)')
parser.add_argument(
'--hidden_dim',
type=int,
default=512,
help='Hidden size of lstm unit. (default: %(default)d)')
parser.add_argument(
'--pass_num',
type=int,
default=10,
help='Epoch number to train. (default: %(default)d)')
parser.add_argument(
'--learning_rate',
type=float,
default=0.0002,
help='Learning rate used to train. (default: %(default)f)')
parser.add_argument(
'--infer_only', action='store_true', help='If set, run forward only.')
args = parser.parse_args()
return args
def print_arguments(args):
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def dynamic_lstm_model(dict_size,
embedding_dim,
hidden_dim,
stacked_num,
class_num=2,
is_train=True):
word_idx = tf.placeholder(tf.int64, shape=[None, None])
sequence_length = tf.placeholder(tf.int64, shape=[None, ])
embedding_weights = tf.get_variable('word_embeddings',
[dict_size, embedding_dim])
embedding = tf.nn.embedding_lookup(embedding_weights, word_idx)
lstm_cell = tf.nn.rnn_cell.LSTMCell(
num_units=hidden_dim, use_peepholes=False)
stacked_cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * stacked_num)
# final_state [LSTMTuple(c, h), LSTMTuple(c, h) ...] total stacked_num LSTMTuples
_, final_state = tf.nn.dynamic_rnn(
cell=stacked_cell,
inputs=embedding,
dtype=tf.float32,
sequence_length=sequence_length)
w = tf.Variable(
tf.truncated_normal([hidden_dim, class_num]), dtype=tf.float32)
bias = tf.Variable(
tf.constant(
value=0.0, shape=[class_num], dtype=tf.float32))
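    # final_state[-1] is the topmost layer's LSTMStateTuple (c, h);
    # index 1 selects the hidden state h used for classification.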
prediction = tf.matmul(final_state[-1][1], w) + bias
if not is_train:
return (word_idx, sequence_length), tf.nn.softmax(prediction)
label = tf.placeholder(tf.int64, shape=[None, ])
loss = tf.nn.softmax_cross_entropy_with_logits(
labels=tf.one_hot(label, 2), logits=prediction)
avg_loss = tf.reduce_mean(loss)
correct_count = tf.equal(tf.argmax(prediction, 1), label)
acc = tf.reduce_mean(tf.cast(correct_count, tf.float32))
with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
g_acc = tf.metrics.accuracy(label, tf.argmax(prediction, axis=1))
vars = tf.contrib.framework.get_variables(
scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
reset_op = tf.variables_initializer(vars)
return (word_idx, sequence_length, label), avg_loss, acc, g_acc, reset_op
def padding_data(data, padding_size, value):
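    # Pad with `value`, then truncate, so every sequence has length exactly
    # padding_size, e.g. padding_data([3, 1], 4, 0) -> [3, 1, 0, 0].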
data = data + [value] * padding_size
return data[:padding_size]
def train(args):
word_dict = paddle.dataset.imdb.word_dict()
dict_size = len(word_dict)
feeding_list, avg_loss, acc, g_acc, reset_op = dynamic_lstm_model(
dict_size, args.embedding_dim, args.hidden_dim, args.stacked_num)
adam_optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
train_op = adam_optimizer.minimize(avg_loss)
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.imdb.train(word_dict), buf_size=25000),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.imdb.test(word_dict), buf_size=25000),
batch_size=args.batch_size)
def do_validation(sess):
sess.run(reset_op)
for batch_id, data in enumerate(test_reader()):
word_idx = map(lambda x: x[0], data)
sequence_length = np.array(
[len(seq) for seq in word_idx]).astype('int64')
maxlen = np.max(sequence_length)
word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx]
word_idx = np.array(word_idx).astype('int64')
label = np.array(map(lambda x: x[1], data)).astype('int64')
            # Validation must not run train_op; only fetch the metrics.
            loss, fetch_acc, fetch_g_acc = sess.run(
                [avg_loss, acc, g_acc],
feed_dict={
feeding_list[0]: word_idx,
feeding_list[1]: sequence_length,
feeding_list[2]: label
})
return fetch_g_acc[1]
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_l)
sess.run(init_g)
for pass_id in xrange(args.pass_num):
# clear accuracy local variable
sess.run(reset_op)
pass_start_time = time.time()
words_seen = 0
for batch_id, data in enumerate(train_reader()):
word_idx = map(lambda x: x[0], data)
sequence_length = np.array(
[len(seq) for seq in word_idx]).astype('int64')
words_seen += np.sum(sequence_length)
maxlen = np.max(sequence_length)
word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx]
word_idx = np.array(word_idx).astype('int64')
label = np.array(map(lambda x: x[1], data)).astype('int64')
_, loss, fetch_acc, fetch_g_acc = sess.run(
[train_op, avg_loss, acc, g_acc],
feed_dict={
feeding_list[0]: word_idx,
feeding_list[1]: sequence_length,
feeding_list[2]: label
})
print("pass_id=%d, batch_id=%d, loss: %f, acc: %f, avg_acc: %f"
% (pass_id, batch_id, loss, fetch_acc, fetch_g_acc[1]))
pass_end_time = time.time()
time_consumed = pass_end_time - pass_start_time
words_per_sec = words_seen / time_consumed
test_acc = do_validation(sess)
print("pass_id=%d, test_acc: %f, words/s: %f, sec/pass: %f" %
(pass_id, test_acc, words_per_sec, time_consumed))
if __name__ == '__main__':
args = parse_args()
print_arguments(args)
if args.infer_only:
pass
else:
train(args)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""VGG16 benchmark in TensorFlow"""
import tensorflow as tf
import numpy as np
import argparse
import time
import paddle  # provides the paddle.batch / paddle.reader / paddle.dataset readers used below
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
'--batch_size', type=int, default=128, help="Batch size for training.")
parser.add_argument(
'--skip_batch_num',
type=int,
default=5,
    help='Number of initial minibatches to skip, for a fairer performance test.')
parser.add_argument(
'--iterations', type=int, default=80, help='The number of minibatches.')
parser.add_argument(
'--learning_rate',
type=float,
default=1e-3,
help="Learning rate for training.")
parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
parser.add_argument(
'--device',
type=str,
default='GPU',
choices=['CPU', 'GPU'],
help="The device type.")
parser.add_argument(
'--data_format',
type=str,
default='NHWC',
choices=['NCHW', 'NHWC'],
    help='The data order, NCHW=[batch, channels, height, width]. '
    'Only NHWC is supported right now.')
parser.add_argument(
'--data_set',
type=str,
default='cifar10',
choices=['cifar10', 'flowers'],
help='Optional dataset for benchmark.')
args = parser.parse_args()
class VGG16Model(object):
def __init__(self):
self.parameters = []
def batch_norm_relu(self, inputs, is_training):
"""Performs a batch normalization followed by a ReLU."""
# We set fused=True for a significant speed boost. See
# https://www.tensorflow.org/speed/speed_guide#common_fused_ops
inputs = tf.layers.batch_normalization(
inputs=inputs,
axis=1 if args.data_format == 'NCHW' else -1,
momentum=0.9,
epsilon=1e-05,
center=True,
scale=True,
training=is_training,
fused=True)
inputs = tf.nn.relu(inputs)
return inputs
def conv_bn_layer(self,
name,
images,
kernel_shape,
is_training,
drop_rate=0.0):
with tf.name_scope(name) as scope:
kernel = tf.Variable(
tf.truncated_normal(
kernel_shape, dtype=tf.float32, stddev=1e-1),
name='weights')
conv = tf.nn.conv2d(
images,
kernel, [1, 1, 1, 1],
data_format=args.data_format,
padding='SAME')
biases = tf.Variable(
tf.constant(
0.0, shape=[kernel_shape[-1]], dtype=tf.float32),
trainable=True,
name='biases')
out = tf.nn.bias_add(conv, biases)
out = self.batch_norm_relu(out, is_training)
out = tf.layers.dropout(out, rate=drop_rate, training=is_training)
return out
def fc_layer(self, name, inputs, shape):
with tf.name_scope(name) as scope:
fc_w = tf.Variable(
tf.truncated_normal(
shape, dtype=tf.float32, stddev=1e-1),
name='weights')
fc_b = tf.Variable(
tf.constant(
0.0, shape=[shape[-1]], dtype=tf.float32),
trainable=True,
name='biases')
out = tf.nn.bias_add(tf.matmul(inputs, fc_w), fc_b)
return out
def network(self, images, class_dim, is_training):
""" VGG16 model structure.
TODO(kuke): enable this network to support the 'NCHW' data format
"""
# conv1
conv1_1 = self.conv_bn_layer(
'conv1_1', images, [3, 3, 3, 64], is_training, drop_rate=0.3)
conv1_2 = self.conv_bn_layer(
'conv1_2', conv1_1, [3, 3, 64, 64], is_training, drop_rate=0.0)
# pool1
pool1 = tf.nn.max_pool(
conv1_2,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool1')
# conv2
conv2_1 = self.conv_bn_layer(
'conv2_1', pool1, [3, 3, 64, 128], is_training, drop_rate=0.4)
conv2_2 = self.conv_bn_layer(
'conv2_2', conv2_1, [3, 3, 128, 128], is_training, drop_rate=0.0)
# pool2
pool2 = tf.nn.max_pool(
conv2_2,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool2')
# conv3
conv3_1 = self.conv_bn_layer(
'conv3_1', pool2, [3, 3, 128, 256], is_training, drop_rate=0.4)
conv3_2 = self.conv_bn_layer(
'conv3_2', conv3_1, [3, 3, 256, 256], is_training, drop_rate=0.4)
conv3_3 = self.conv_bn_layer(
'conv3_3', conv3_2, [3, 3, 256, 256], is_training, drop_rate=0.0)
# pool3
pool3 = tf.nn.max_pool(
conv3_3,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool3')
# conv4
conv4_1 = self.conv_bn_layer(
'conv4_1', pool3, [3, 3, 256, 512], is_training, drop_rate=0.4)
conv4_2 = self.conv_bn_layer(
'conv4_2', conv4_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
conv4_3 = self.conv_bn_layer(
'conv4_3', conv4_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
# pool4
pool4 = tf.nn.max_pool(
conv4_3,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool4')
# conv5
conv5_1 = self.conv_bn_layer(
'conv5_1', pool4, [3, 3, 512, 512], is_training, drop_rate=0.4)
conv5_2 = self.conv_bn_layer(
'conv5_2', conv5_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
conv5_3 = self.conv_bn_layer(
'conv5_3', conv5_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
# pool5
pool5 = tf.nn.max_pool(
conv5_3,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
            name='pool5')
# flatten
shape = int(np.prod(pool5.get_shape()[1:]))
pool5_flat = tf.reshape(pool5, [-1, shape])
# fc1
drop = tf.layers.dropout(pool5_flat, rate=0.5, training=is_training)
fc1 = self.fc_layer('fc1', drop, [shape, 512])
# fc2
bn = self.batch_norm_relu(fc1, is_training)
drop = tf.layers.dropout(bn, rate=0.5, training=is_training)
fc2 = self.fc_layer('fc2', drop, [512, 512])
fc3 = self.fc_layer('fc3', fc2, [512, class_dim])
return fc3
def run_benchmark():
"""Run benchmark on cifar10 or flowers."""
if args.data_set == "cifar10":
class_dim = 10
raw_shape = (3, 32, 32)
dat_shape = (None, 32, 32, 3) if args.data_format == 'NHWC' else (
None, 3, 32, 32)
else:
class_dim = 102
raw_shape = (3, 224, 224)
dat_shape = (None, 224, 224, 3) if args.data_format == 'NHWC' else (
None, 3, 224, 224)
device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0'
with tf.device(device):
images = tf.placeholder(tf.float32, shape=dat_shape)
labels = tf.placeholder(tf.int64, shape=(None, ))
is_training = tf.placeholder('bool')
onehot_labels = tf.one_hot(labels, depth=class_dim)
vgg16 = VGG16Model()
logits = vgg16.network(images, class_dim, is_training)
loss = tf.losses.softmax_cross_entropy(
onehot_labels=onehot_labels, logits=logits)
avg_loss = tf.reduce_mean(loss)
correct = tf.equal(tf.argmax(logits, 1), labels)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
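        # As in the ResNet script above, batch norm's moving-average updates
        # must run as a dependency of the train op.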
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
train_op = optimizer.minimize(avg_loss)
# data reader
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.test10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
buf_size=5120),
batch_size=args.batch_size)
# test
def test():
test_accs = []
for batch_id, data in enumerate(test_reader()):
test_images = np.array(
map(lambda x: np.transpose(x[0].reshape(raw_shape),
axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
test_accs.append(
accuracy.eval(feed_dict={
images: test_images,
labels: test_labels,
is_training: False
}))
return np.mean(test_accs)
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_g)
sess.run(init_l)
iters, num_samples, start_time = 0, 0, time.time()
for pass_id in range(args.num_passes):
# train
num_samples = 0
start_time = time.time()
for batch_id, data in enumerate(train_reader()):
if iters == args.skip_batch_num:
start_time = time.time()
num_samples = 0
if iters == args.iterations:
break
train_images = np.array(
map(lambda x: np.transpose(x[0].reshape(raw_shape),
axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
train_labels = np.array(map(lambda x: x[1], data)).astype(
'int64')
_, loss, acc = sess.run([train_op, avg_loss, accuracy],
feed_dict={
images: train_images,
labels: train_labels,
is_training: True
})
iters += 1
num_samples += len(data)
print("Pass = %d, Iters = %d, Loss = %f, Accuracy = %f" %
(pass_id, iters, loss, acc))
train_elapsed = time.time() - start_time
# test
pass_test_acc = test()
print("Pass = %d, Train speed = %f imgs/s, Test accuracy = %f\n" %
(pass_id, num_samples / train_elapsed, pass_test_acc))
def print_arguments():
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == '__main__':
print_arguments()
run_benchmark()