remove benchmark folder, since there is a benchmark repo already, distributed...

remove benchmark folder, since there is a benchmark repo already, distributed benchmark will be maintained in fleet repo (#18537) test=develop

remove benchmark folder, since there is a benchmark repo already, distributed...
remove benchmark folder, since there is a benchmark repo already, distributed benchmark will be maintained in fleet repo (#18537) test=develop
6f6ecbec · guru4elephant · Tao Luo · 1f1cc222 · 1f1cc222 · 1f1cc222
59 changed file
--- a/benchmark/.gitignore
+++ b/benchmark/.gitignore
-paddle/image/logs
-paddle/image/*.pyc
-paddle/image/train.list
-paddle/rnn/logs
-paddle/rnn/*.pyc
-paddle/rnn/imdb.pkl
-caffe/image/logs
-tensorflow/image/logs
-tensorflow/rnn/logs
-fluid/models/*.pyc
-fluid/logs
-fluid/nohup.out
--- a/benchmark/caffe/image/alexnet.prototxt
+++ b/benchmark/caffe/image/alexnet.prototxt
-name: "alexnet"
-input: "data"
-input_dim: 64
-input_dim: 3
-input_dim: 227
-input_dim: 227
-input: "label"
-input_dim: 64
-input_dim: 1
-input_dim: 1
-input_dim: 1 
-force_backward: true
-layer {
-  name: "conv1"
-  type: "Convolution"
-  bottom: "data"
-  top: "conv1"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 96
-    kernel_size: 11
-    stride: 4
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "relu1"
-  type: "ReLU"
-  bottom: "conv1"
-  top: "conv1"
-}
-layer {
-  name: "norm1"
-  type: "LRN"
-  bottom: "conv1"
-  top: "norm1"
-  lrn_param {
-    local_size: 5
-    alpha: 0.0001
-    beta: 0.75
-  }
-}
-layer {
-  name: "pool1"
-  type: "Pooling"
-  bottom: "norm1"
-  top: "pool1"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "conv2"
-  type: "Convolution"
-  bottom: "pool1"
-  top: "conv2"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 256
-    pad: 2
-    kernel_size: 5
-    group: 1
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu2"
-  type: "ReLU"
-  bottom: "conv2"
-  top: "conv2"
-}
-layer {
-  name: "norm2"
-  type: "LRN"
-  bottom: "conv2"
-  top: "norm2"
-  lrn_param {
-    local_size: 5
-    alpha: 0.0001
-    beta: 0.75
-  }
-}
-layer {
-  name: "pool2"
-  type: "Pooling"
-  bottom: "norm2"
-  top: "pool2"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "conv3"
-  type: "Convolution"
-  bottom: "pool2"
-  top: "conv3"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 384
-    pad: 1
-    kernel_size: 3
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "relu3"
-  type: "ReLU"
-  bottom: "conv3"
-  top: "conv3"
-}
-layer {
-  name: "conv4"
-  type: "Convolution"
-  bottom: "conv3"
-  top: "conv4"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 384
-    pad: 1
-    kernel_size: 3
-    group: 1
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu4"
-  type: "ReLU"
-  bottom: "conv4"
-  top: "conv4"
-}
-layer {
-  name: "conv5"
-  type: "Convolution"
-  bottom: "conv4"
-  top: "conv5"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 256
-    pad: 1
-    kernel_size: 3
-    group: 1
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu5"
-  type: "ReLU"
-  bottom: "conv5"
-  top: "conv5"
-}
-layer {
-  name: "pool5"
-  type: "Pooling"
-  bottom: "conv5"
-  top: "pool5"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "fc6"
-  type: "InnerProduct"
-  bottom: "pool5"
-  top: "fc6"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 4096
-    weight_filler {
-      type: "gaussian"
-      std: 0.005
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu6"
-  type: "ReLU"
-  bottom: "fc6"
-  top: "fc6"
-}
-layer {
-  name: "drop6"
-  type: "Dropout"
-  bottom: "fc6"
-  top: "fc6"
-  dropout_param {
-    dropout_ratio: 0.5
-  }
-}
-layer {
-  name: "fc7"
-  type: "InnerProduct"
-  bottom: "fc6"
-  top: "fc7"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 4096
-    weight_filler {
-      type: "gaussian"
-      std: 0.005
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu7"
-  type: "ReLU"
-  bottom: "fc7"
-  top: "fc7"
-}
-layer {
-  name: "drop7"
-  type: "Dropout"
-  bottom: "fc7"
-  top: "fc7"
-  dropout_param {
-    dropout_ratio: 0.5
-  }
-}
-layer {
-  name: "fc8"
-  type: "InnerProduct"
-  bottom: "fc7"
-  top: "fc8"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 1000
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "loss"
-  type: "SoftmaxWithLoss"
-  bottom: "fc8"
-  bottom: "label"
-  top: "loss"
-}
--- a/benchmark/caffe/image/googlenet.prototxt
+++ b/benchmark/caffe/image/googlenet.prototxt
--- a/benchmark/caffe/image/run.sh
+++ b/benchmark/caffe/image/run.sh
-set -e
-
-function test() {
-  cfg=$1
-  batch=$2
-  prefix=$3
-  sed -i "/input: \"data\"/{n;s/^input_dim.*/input_dim: $batch/g}" $cfg 
-  sed -i "/input: \"label\"/{n;s/^input_dim.*/input_dim: $batch/g}" $cfg
-  caffe time --model=$cfg --iterations=50 --gpu 0 > logs/$prefix-1gpu-batch${batch}.log 2>&1
-}
-
-if [ ! -d "logs" ]; then
-  mkdir logs
-fi
-
-# alexnet
-test alexnet.prototxt 64 alexnet 
-test alexnet.prototxt 128 alexnet 
-test alexnet.prototxt 256 alexnet 
-test alexnet.prototxt 512 alexnet 
-
-# googlenet
-test googlenet.prototxt 64 googlenet 
-test googlenet.prototxt 128 googlenet 
-
-# small net 
-test smallnet_mnist_cifar.prototxt 64 smallnet 
-test smallnet_mnist_cifar.prototxt 128 smallnet 
-test smallnet_mnist_cifar.prototxt 256 smallnet 
-test smallnet_mnist_cifar.prototxt 512 smallnet 
--- a/benchmark/caffe/image/run_multi.sh
+++ b/benchmark/caffe/image/run_multi.sh
-#!/bin/bash
-set -e
-
-function test() {
-  cfg=$1
-  batch=$2
-  prefix=$3
-  batch_per_gpu=`expr ${batch} / 4`
-  sed -i "/input: \"data\"/{n;s/^input_dim.*/input_dim: ${batch_per_gpu}/g}" $cfg 
-  sed -i "/input: \"label\"/{n;s/^input_dim.*/input_dim: ${batch_per_gpu}/g}" $cfg 
-  sed -i "1c\net : \"${cfg}\"" solver.prototxt
-  caffe train --solver=solver.prototxt -gpu 0,1,2,3 > logs/${prefix}-4gpu-batch${batch}.log 2>&1
-}
-
-if [ ! -d "logs" ]; then
-  mkdir logs
-fi
-
-# alexnet
-test alexnet.prototxt 512 alexnet 
-test alexnet.prototxt 1024 alexnet 
-
-# googlenet
-test googlenet.prototxt 512 googlenet 
--- a/benchmark/caffe/image/smallnet_mnist_cifar.prototxt
+++ b/benchmark/caffe/image/smallnet_mnist_cifar.prototxt
-name: "mnist/cifar"
-input: "data"
-input_dim: 128 
-input_dim: 3
-input_dim: 32 
-input_dim: 32 
-input: "label"
-input_dim: 128 
-input_dim: 1
-input_dim: 1
-input_dim: 1 
-layer {
-  name: "conv1"
-  type: "Convolution"
-  bottom: "data"
-  top: "conv1"
-  param {
-    lr_mult: 1
-  }
-  param {
-    lr_mult: 2
-  }
-  convolution_param {
-    num_output: 32
-    pad: 2
-    kernel_size: 5
-    stride: 1
-    weight_filler {
-      type: "gaussian"
-      std: 0.0001
-    }
-    bias_filler {
-      type: "constant"
-    }
-  }
-}
-layer {
-  name: "pool1"
-  type: "Pooling"
-  bottom: "conv1"
-  top: "pool1"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "relu1"
-  type: "ReLU"
-  bottom: "pool1"
-  top: "pool1"
-}
-layer {
-  name: "conv2"
-  type: "Convolution"
-  bottom: "pool1"
-  top: "conv2"
-  param {
-    lr_mult: 1
-  }
-  param {
-    lr_mult: 2
-  }
-  convolution_param {
-    num_output: 32
-    pad: 2
-    kernel_size: 5
-    stride: 1
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-    }
-  }
-}
-layer {
-  name: "relu2"
-  type: "ReLU"
-  bottom: "conv2"
-  top: "conv2"
-}
-layer {
-  name: "pool2"
-  type: "Pooling"
-  bottom: "conv2"
-  top: "pool2"
-  pooling_param {
-    pool: AVE
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "conv3"
-  type: "Convolution"
-  bottom: "pool2"
-  top: "conv3"
-  param {
-    lr_mult: 1
-  }
-  param {
-    lr_mult: 2
-  }
-  convolution_param {
-    num_output: 64
-    pad: 2
-    kernel_size: 5
-    stride: 1
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-    }
-  }
-}
-layer {
-  name: "relu3"
-  type: "ReLU"
-  bottom: "conv3"
-  top: "conv3"
-}
-layer {
-  name: "pool3"
-  type: "Pooling"
-  bottom: "conv3"
-  top: "pool3"
-  pooling_param {
-    pool: AVE
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "ip1"
-  type: "InnerProduct"
-  bottom: "pool3"
-  top: "ip1"
-  param {
-    lr_mult: 1
-  }
-  param {
-    lr_mult: 2
-  }
-  inner_product_param {
-    num_output: 64
-    weight_filler {
-      type: "gaussian"
-      std: 0.1
-    }
-    bias_filler {
-      type: "constant"
-    }
-  }
-}
-layer {
-  name: "ip2"
-  type: "InnerProduct"
-  bottom: "ip1"
-  top: "ip2"
-  param {
-    lr_mult: 1
-  }
-  param {
-    lr_mult: 2
-  }
-  inner_product_param {
-    num_output: 10
-    weight_filler {
-      type: "gaussian"
-      std: 0.1
-    }
-    bias_filler {
-      type: "constant"
-    }
-  }
-}
-layer {
-  name: "accuracy"
-  type: "Accuracy"
-  bottom: "ip2"
-  bottom: "label"
-  top: "accuracy"
-  include {
-    phase: TEST
-  }
-}
-layer {
-  name: "loss"
-  type: "SoftmaxWithLoss"
-  bottom: "ip2"
-  bottom: "label"
-  top: "loss"
-}
--- a/benchmark/caffe/image/solver.prototxt
+++ b/benchmark/caffe/image/solver.prototxt
-net: "alexnet.prototxt"
-base_lr: 0.01
-lr_policy: "fixed"
-display: 20
-max_iter: 200
-momentum: 0.9
-weight_decay: 0.0005
-snapshot: 10000
-snapshot_prefix: "models/caffe_alexnet_train"
-solver_mode: GPU
--- a/benchmark/figs/alexnet-4gpu.png
+++ b/benchmark/figs/alexnet-4gpu.png
--- a/benchmark/figs/alexnet-cpu-infer.png
+++ b/benchmark/figs/alexnet-cpu-infer.png
--- a/benchmark/figs/alexnet-cpu-train.png
+++ b/benchmark/figs/alexnet-cpu-train.png
--- a/benchmark/figs/googlenet-4gpu.png
+++ b/benchmark/figs/googlenet-4gpu.png
--- a/benchmark/figs/googlenet-cpu-infer.png
+++ b/benchmark/figs/googlenet-cpu-infer.png
--- a/benchmark/figs/googlenet-cpu-train.png
+++ b/benchmark/figs/googlenet-cpu-train.png
--- a/benchmark/figs/resnet-cpu-infer.png
+++ b/benchmark/figs/resnet-cpu-infer.png
--- a/benchmark/figs/resnet-cpu-train.png
+++ b/benchmark/figs/resnet-cpu-train.png
--- a/benchmark/figs/rnn_lstm_4gpus.png
+++ b/benchmark/figs/rnn_lstm_4gpus.png
--- a/benchmark/figs/rnn_lstm_cls.png
+++ b/benchmark/figs/rnn_lstm_cls.png
--- a/benchmark/figs/vgg-cpu-infer.png
+++ b/benchmark/figs/vgg-cpu-infer.png
--- a/benchmark/figs/vgg-cpu-train.png
+++ b/benchmark/figs/vgg-cpu-train.png
--- a/benchmark/fluid/Dockerfile
+++ b/benchmark/fluid/Dockerfile
-FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
-
-# Using UBUNTU_MIRROR can speed up apt-get.
-# ARG UBUNTU_MIRROR
-# RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
-
-RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv
-RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so
-
-# IMPORTANT:
-# Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
-# example: unset http_proxy && unset https_proxy && python fluid_benchmark.py ...
-
-
-RUN pip install -U pip
-RUN pip install -U kubernetes paddlepaddle
-
-RUN pip uninstall -y paddlepaddle && mkdir /workspace
-
-ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
-ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
-RUN chmod +x /usr/bin/paddle_k8s
-
-ADD *.whl /
-RUN pip install /*.whl && rm -f /*.whl 
-
-ENV LD_LIBRARY_PATH=/usr/local/lib
-ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh imagenet_reader.py /workspace/
-ADD models/ /workspace/models/
-
--- a/benchmark/fluid/README.md
+++ b/benchmark/fluid/README.md
-# Fluid Benchmark
-
-This directory contains several model configurations and tools that are used to run
-Fluid benchmarks for local and distributed training.
-
-
-## Run the Benchmark
-
-To start, run the following command to get the full help message:
-
-```bash
-python fluid_benchmark.py --help
-```
-
-Currently supported `--model` arguments include:
-
-* mnist
-* resnet
-    * you can choose to use a different dataset using `--data_set cifar10` or
-      `--data_set flowers`.
-* vgg
-* stacked_dynamic_lstm
-* machine_translation
-
-* Run the following command to start a benchmark job locally:
-    ```bash
-      python fluid_benchmark.py --model mnist --device GPU
-    ```
-    You can choose to use GPU/CPU training. With GPU training, you can specify
-    `--gpus <gpu_num>` to run multi GPU training.
-    You can also run the parameter server in async mode: specify
-    `--async_mode` to train the model asynchronously.
-* Run distributed training with parameter servers:
-    * see [run_fluid_benchmark.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/fluid/run_fluid_benchmark.sh) as an example.
-    * start parameter servers:
-        ```bash
-        PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist  --device GPU --update_method pserver
-        sleep 15
-        ```
-    * start trainers:
-        ```bash
-        PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist  --device GPU --update_method pserver
-        ```
-* Run distributed training using NCCL2
-    ```bash
-    PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3  PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2
-    ```
-
-## Prepare the RecordIO file to Achieve Better Performance
-
-Running the following command will generate RecordIO files like "mnist.recordio" under the path
-and with the batch_size you choose. You can use batch_size=1 so that a later reader can change the
-batch_size at any time using `fluid.batch`.
-
-```bash
-python -c 'from recordio_converter import *; prepare_mnist("data", 1)'
-```
-
-## Run Distributed Benchmark on Kubernetes Cluster
-
-You may need to build a Docker image before submitting a cluster job onto Kubernetes, or you will
-have to start all those processes manually on each node, which is not recommended.
-
-To build the Docker image, you need to choose a paddle "whl" package to run with. You may either
-download it from
-http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_en.html or
-build it on your own. Once you've got the "whl" package, put it under the current directory and run:
-
-```bash
-docker build -t [your docker image name]:[your docker image tag] .
-```
-
-Then push the image to a Docker registry that your Kubernetes cluster can reach.
-
-We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submit
-distributed benchmark jobs to your cluster. To generate a job yaml, just run:
-
-```bash
-python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --gpus 8 --device GPU --update_method pserver " --disttype pserver
-```
-
-The yaml files are then generated under the directory `myjob`, and you can run:
-
-```bash
-kubectl create -f myjob/
-```
-
-The job shall start.
-
-
-## Notes for Run Fluid Distributed with NCCL2 and RDMA
-
-Before running NCCL2 distributed jobs, please check whether your node has multiple network
-interfaces. If so, set the environment variable `NCCL_SOCKET_IFNAME` (e.g.
-`export NCCL_SOCKET_IFNAME=eth0`) to your actual network device.
-
-To run high-performance distributed training, you must prepare your hardware environment to be
-able to run RDMA-enabled network communication; please check out [this](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/cluster/nccl2_rdma_training.md)
-note for details.
--- a/benchmark/fluid/args.py
+++ b/benchmark/fluid/args.py
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-
-__all__ = ['parse_args', ]
-
-BENCHMARK_MODELS = [
-    "machine_translation", "resnet", "se_resnext", "vgg", "mnist",
-    "stacked_dynamic_lstm", "resnet_with_preprocess"
-]
-
-
-def parse_args():
-    parser = argparse.ArgumentParser('Fluid model benchmarks.')
-    parser.add_argument(
-        '--model',
-        type=str,
-        choices=BENCHMARK_MODELS,
-        default='resnet',
-        help='The model to run benchmark with.')
-    parser.add_argument(
-        '--batch_size', type=int, default=32, help='The minibatch size.')
-    #  args related to learning rate
-    parser.add_argument(
-        '--learning_rate', type=float, default=0.001, help='The learning rate.')
-    # TODO(wuyi): add "--use_fake_data" option back.
-    parser.add_argument(
-        '--skip_batch_num',
-        type=int,
-        default=5,
-        help='The first num of minibatch num to skip, for better performance test'
-    )
-    parser.add_argument(
-        '--iterations', type=int, default=80, help='The number of minibatches.')
-    parser.add_argument(
-        '--pass_num', type=int, default=100, help='The number of passes.')
-    parser.add_argument(
-        '--data_format',
-        type=str,
-        default='NCHW',
-        choices=['NCHW', 'NHWC'],
-        help='The data data_format, now only support NCHW.')
-    parser.add_argument(
-        '--device',
-        type=str,
-        default='GPU',
-        choices=['CPU', 'GPU'],
-        help='The device type.')
-    parser.add_argument(
-        '--gpus',
-        type=int,
-        default=1,
-        help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
-    # this option is available only for vgg and resnet.
-    parser.add_argument(
-        '--cpus',
-        type=int,
-        default=1,
-        help='If cpus > 1, will set ParallelExecutor to use multiple threads.')
-    parser.add_argument(
-        '--data_set',
-        type=str,
-        default='flowers',
-        choices=['cifar10', 'flowers', 'imagenet'],
-        help='Optional dataset for benchmark.')
-    parser.add_argument(
-        '--infer_only', action='store_true', help='If set, run forward only.')
-    parser.add_argument(
-        '--use_cprof', action='store_true', help='If set, use cProfile.')
-    parser.add_argument(
-        '--use_nvprof',
-        action='store_true',
-        help='If set, use nvprof for CUDA.')
-    parser.add_argument(
-        '--no_test',
-        action='store_true',
-        help='If set, do not test the testset during training.')
-    parser.add_argument(
-        '--memory_optimize',
-        action='store_true',
-        help='If set, optimize runtime memory before start.')
-    parser.add_argument(
-        '--use_fake_data',
-        action='store_true',
-        help='If set ommit the actual read data operators.')
-    parser.add_argument(
-        '--profile', action='store_true', help='If set, profile a few steps.')
-    parser.add_argument(
-        '--update_method',
-        type=str,
-        default='local',
-        choices=['local', 'pserver', 'nccl2'],
-        help='Choose parameter update method, can be local, pserver, nccl2.')
-    parser.add_argument(
-        '--no_split_var',
-        action='store_true',
-        default=False,
-        help='Whether split variables into blocks when update_method is pserver')
-    parser.add_argument(
-        '--async_mode',
-        action='store_true',
-        default=False,
-        help='Whether start pserver in async mode to support ASGD')
-    parser.add_argument(
-        '--use_reader_op',
-        action='store_true',
-        help='Whether to use reader op, and must specify the data path if set this to true.'
-    )
-    parser.add_argument(
-        '--data_path',
-        type=str,
-        default="",
-        help='Directory that contains all the training recordio files.')
-    parser.add_argument(
-        '--test_data_path',
-        type=str,
-        default="",
-        help='Directory that contains all the test data (NOT recordio).')
-    parser.add_argument(
-        '--use_inference_transpiler',
-        action='store_true',
-        help='If set, use inference transpiler to optimize the program.')
-    parser.add_argument(
-        '--no_random',
-        action='store_true',
-        help='If set, keep the random seed and do not shuffle the data.')
-    parser.add_argument(
-        '--reduce_strategy',
-        type=str,
-        choices=['reduce', 'all_reduce'],
-        default='all_reduce',
-        help='Specify the reduce strategy, can be reduce, all_reduce')
-    parser.add_argument(
-        '--fuse_broadcast_op',
-        action='store_true',
-        help='If set, would fuse multiple broadcast operators into one fused_broadcast operator.'
-    )
-    args = parser.parse_args()
-    return args
--- a/benchmark/fluid/check_env.sh
+++ b/benchmark/fluid/check_env.sh
-#!/bin/bash
-
-if [ "`uname -s`" != "Linux" ]; then
-  echo "Current scenario only support in Linux yet!"
-  exit 0
-fi
-
-echo "========================= Hardware Information ========================="
-sockets=`grep 'physical id' /proc/cpuinfo | sort -u | wc -l`
-cores_per_socket=`grep 'core id' /proc/cpuinfo | sort -u | wc -l`
-ht=`lscpu |grep "per core" |awk -F':' '{print $2}'|xargs`
-physical_cores=$((sockets * cores_per_socket))
-virtual_cores=`grep 'processor' /proc/cpuinfo | sort -u | wc -l`
-numa_nodes=`lscpu |grep "NUMA node(s)"|awk -F':' '{print $2}'|xargs`
-echo "CPU Name               : `cat /proc/cpuinfo |grep -i "model name" |uniq |awk -F ':' '{print $2}'|xargs`"
-echo "CPU Family             : `lscpu |grep \"CPU family\" |awk -F':' '{print $2}'|xargs`"
-echo "Socket Number          : $sockets"
-echo "Cores Per Socket       : $cores_per_socket"
-echo "Total Physical Cores   : $physical_cores"
-echo "Total Virtual Cores    : $virtual_cores"
-if [ $ht -eq 1 ]; then
-  echo "Hyper Threading        : OFF"
-  if [ $physical_cores -ne $virtual_cores ]; then
-    echo "Error: HT logical error"
-  fi
-else
-  echo "Hyper Threading        : ON"
-  if [ $physical_cores -ge $virtual_cores ]; then
-    echo "Error: HT logical error"
-  fi
-fi
-echo "NUMA Nodes             : $numa_nodes"
-if [ $numa_nodes -lt $sockets ]; then
-  echo "Warning: NUMA node is not enough for the best performance,\
- at least $sockets"
-fi
-
-echo "-------------------------- Memory Information --------------------------"
-# dmidecode support start from 2.11
-dmi_ver=`dmidecode --version|awk -F '.' '{print $1}'|xargs`
-if [ $dmi_ver -lt 2 ]; then
-  echo "Error: dmidecode unknown or version is too old"
-  exit 0
-fi
-if [ `dmidecode | grep -ic "Permission denied"` -ne 0 ]; then
-  echo "Error: need root to run dmidecode"
-  exit 0
-fi
-max_dimms=0
-num_dimms_installed=0
-for dimm_id in `dmidecode |grep Locator|sort -u | awk -F ':' '{print $2}'`; do
-  num_refered=`dmidecode |grep -wc "$dimm_id"`
-  # the actual dimm id should be referred to only once
-  if [ $num_refered -eq 1 ]; then
-    num_unknown=`dmidecode | awk '/'$dimm_id'/ {s=1; f=0};
-      /Unknown/ {f=1};
-      /Manufacturer/ {if (s==1) {print f; exit 0;}};'`
-    if [ $num_unknown -eq 0 ]; then
-      dimms_installed="$dimms_installed \n $dimm_id"
-      ((num_dimms_installed++))
-    else
-      dimms_uninstalled="$dimms_uninstalled \n $dimm_id"
-    fi
-    ((max_dimms++))
-  fi
-done
-echo "Installed DIMM number  : $num_dimms_installed"
-num_dimms_mapped=`dmidecode | grep "Memory Device Mapped" | wc -l`
-if [ $num_dimms_installed -ne $num_dimms_mapped ]; then
-  echo "Error: The installed DIMMs number does ont match the mapped memory device: $num_dimms_mapped"
-fi
-num_clock_configed=`dmidecode | grep -i "Configured Clock Speed" |grep -ic "Hz"`
-if [ $num_dimms_installed -ne $num_clock_configed ]; then
-  echo "Error: The installed DIMMs number does ont match configured clocks: $num_clock_configed"
-fi
-echo -e "Installed DIMMs Locator: $dimms_installed"
-echo -e "Not installed DIMMs    : $dimms_uninstalled"
-max_dimm_slots=`dmidecode | grep -c "Bank Locator"`
-echo "DIMMs max slots        : $max_dimm_slots"
-if [ $max_dimms -ne $max_dimm_slots ]; then
-  echo "Error: The max dimm slots do not match the max dimms: $max_dimms"
-fi
-free_ver_main=`free -V|awk -F ' ' '{print $NF}'|awk -F '.' '{print $1}'`
-free_ver_sub=`free -V|awk -F ' ' '{print $NF}'|awk -F '.' '{print $2}'`
-if [ $free_ver_main -lt 3 ] || [ $free_ver_sub -lt 3 ]; then
-  mem_sz=`free |grep -i mem |awk -F' ' '{print $2}'|xargs`
-  swap_sz=`free |grep -i swap |awk -F' ' '{print $2}'|xargs`
-  total_sz=`free -t |grep -i total |tail -n 1| awk -F' ' '{print $2}'|xargs`
-  mem_sz="`awk 'BEGIN{printf "%.1f\n",('$mem_sz'/1024/1024)}'` GB" 
-  swap_sz="`awk 'BEGIN{printf "%.1f\n",('$swap_sz'/1024/1024)}'` GB"
-  total_sz="`awk 'BEGIN{printf "%.1f\n",('$total_sz'/1024/1024)}'` GB"
-else
-  mem_sz=`free -h |grep -i mem |awk -F' ' '{print $2}'|xargs`
-  swap_sz=`free -h |grep -i swap |awk -F' ' '{print $2}'|xargs`
-  total_sz=`free -th |grep -i total |tail -n 1| awk -F' ' '{print $2}'|xargs`
-fi
-echo "Memory Size            : $mem_sz"
-echo "Swap Memory Size       : $swap_sz"
-echo "Total Memory Size      : $total_sz"
-echo "Max Memory Capacity    : `dmidecode |grep -i \"maximum capacity\"|sort -u|awk -F':' '{print $2}'|xargs`"
-# DIMMs frequency
-clock_speeds=`dmidecode | grep -i "Configured Clock Speed" | grep -i "Hz" |sort -u | awk -F':' '{print $2}'|xargs`
-echo "Configed Clock Speed   : $clock_speeds"
-num_clock_type=`dmidecode | grep -i "Configured Clock Speed" | grep -i "Hz" |sort -u | wc -l`
-if [ $num_clock_type -ne 1 ]; then
-  echo "Warning: Have more than 1 speed type, all DIMMs should have same fequency: $clock_speeds"
-fi
-
-echo "-------------------------- Turbo Information  --------------------------"
-scaling_drive=`cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_driver`
-echo "Scaling Driver         : $scaling_drive"
-if [ $scaling_drive == "intel_pstate" ] && [ -e /sys/devices/system/cpu/intel_pstate/no_turbo ]; then
-  turbo=`cat /sys/devices/system/cpu/intel_pstate/no_turbo`
-  if [ $turbo -eq 1 ]; then
-    echo "Turbo Status           : OFF"
-  else
-    echo "Turbo Status           : ON"
-  fi
-else
-  echo "Warning: Scaling driver is not intel_pstarte, maybe should enable it in BIOS"
-  echo "Turbo Status           : Unknown"
-fi
-# cpu frequency
-num_max_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq| sort -u |wc -l`
-num_min_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq| sort -u |wc -l`
-if [ $num_max_freq -ne 1 ]; then
-  echo "Error: the max_frequency of all CPU should be equal"
-fi
-if [ $num_min_freq -ne 1 ]; then
-  echo "Error: the min_frequency of all CPU should be equal"
-fi
-max_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq| uniq|xargs` # kHz
-max_freq=`awk 'BEGIN{printf "%.2f",('$max_freq' / 1000000)}'` # GHz
-min_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq| uniq|xargs` # kHz
-min_freq=`awk 'BEGIN{printf "%.2f",('$min_freq' / 1000000)}'` # GHz
-echo "CPU Max Frequency      : $max_freq GHz"
-echo "CPU Min Frequency      : $min_freq GHz"
-# cpu governor
-num_governor=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor| sort -u |wc -l`
-if [ $num_governor -ne 1 ]; then
-  echo "Error: the governor of all CPU should be the same"
-fi
-governor=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor| sort -u |uniq`
-echo "CPU Freq Governor      : $governor"
-
-
-echo "========================= Software Information ========================="
-echo "BIOS Release Date      : `dmidecode | grep "Release Date"|awk -F ':' '{print $2}'|xargs`"
-echo "OS Version             : `cat /etc/redhat-release`"
-echo "Kernel Release Version : `uname -r`"
-echo "Kernel Patch Version   : `uname -v`"
-echo "GCC Version            :`gcc --version | head -n 1|awk -F '\\\(GCC\\\)' '{print $2}'`"
-if command -v cmake >/dev/null 2>&1; then 
-  cmake_ver=`cmake --version | head -n 1 | awk -F 'version' '{print $2}'`
-else
-  cmake_ver=" Not installed"
-fi
-echo "CMake Version          :$cmake_ver"
-echo "------------------ Environment Variables Information -------------------"
-kmp_affinity=`env | grep KMP_AFFINITY`
-omp_dynamic=`env | grep OMP_DYNAMIC`
-omp_nested=`env | grep OMP_NESTED`
-omp_num_threads=`env | grep OMP_NUM_THREADS`
-mkl_num_threads=`env | grep MKL_NUM_THREADS`
-mkl_dynamic=`env | grep MKL_DYNAMIC`
-if [ ! $kmp_affinity ]; then kmp_affinity="unset"; fi
-if [ ! $omp_dynamic ]; then omp_dynamic="unset"; fi
-if [ ! $omp_nested ]; then omp_nested="unset"; fi
-if [ ! $omp_num_threads ]; then omp_num_threads="unset"; fi
-if [ ! $mkl_num_threads ]; then mkl_num_threads="unset"; fi
-if [ ! $mkl_dynamic ]; then mkl_dynamic="unset"; fi
-echo "KMP_AFFINITY           : $kmp_affinity"
-echo "OMP_DYNAMIC            : $omp_dynamic"
-echo "OMP_NESTED             : $omp_nested"
-echo "OMP_NUM_THREADS        : $omp_num_threads"
-echo "MKL_NUM_THREADS        : $mkl_num_threads"
-echo "MKL_DYNAMIC            : $mkl_dynamic"
-# Check if any MKL related libraries have been installed in LD_LIBRARY_PATH
-for path in `echo $LD_LIBRARY_PATH | awk -F ':' '{for(i=1;i<=NF;++i)print $i}'`; do
-  mkldnn_found=`find $path -name "libmkldnn.so"`
-  if [ "$mkldnn_found" ]; then
-    echo "Found MKL-DNN          : $mkldnn_found"
-  fi
-  mklml_found=`find $path -name "libmklml_intel.so"`
-  if [ "$mklml_found" ]; then
-    echo "Found MKLML            : $mklml_found"
-  fi
-  iomp_found=`find $path -name "libiomp5.so"`
-  if [ "$iomp_found" ]; then
-    echo "Found IOMP             : $iomp_found"
-  fi
-done
-
-# Dump all details for a full check
-lscpu > lscpu.dump
-dmidecode > dmidecode.dump
-
-# The expected result would be like:
-# ========================= Hardware Information =========================
-# CPU Name               : Intel(R) Xeon(R) Gold 6148M CPU @ 2.40GHz
-# CPU Family             : 6
-# Socket Number          : 2
-# Cores Per Socket       : 20
-# Total Physical Cores   : 40
-# Total Virtual Cores    : 40
-# Hyper Threading        : OFF
-# NUMA Nodes             : 2
-# -------------------------- Memory Information --------------------------
-# Installed DIMM number  : 12
-# Installed DIMMs Locator:
-#  CPU1_DIMM_A1
-#  CPU1_DIMM_B1
-#  CPU1_DIMM_C1
-#  CPU1_DIMM_D1
-#  CPU1_DIMM_E1
-#  CPU1_DIMM_F1
-#  CPU2_DIMM_A1
-#  CPU2_DIMM_B1
-#  CPU2_DIMM_C1
-#  CPU2_DIMM_D1
-#  CPU2_DIMM_E1
-#  CPU2_DIMM_F1
-# Not installed DIMMs    :
-#  CPU1_DIMM_A2
-#  CPU1_DIMM_B2
-#  CPU1_DIMM_C2
-#  CPU1_DIMM_D2
-#  CPU1_DIMM_E2
-#  CPU1_DIMM_F2
-#  CPU2_DIMM_A2
-#  CPU2_DIMM_B2
-#  CPU2_DIMM_C2
-#  CPU2_DIMM_D2
-#  CPU2_DIMM_E2
-#  CPU2_DIMM_F2
-# DIMMs max slots        : 24
-# Memory Size            : 376G
-# Swap Memory Size       : 4.0G
-# Total Memory Size      : 380G
-# Max Memory Capacity    : 2304 GB
-# Configed Clock Speed   : 2666 MHz
-# -------------------------- Turbo Information  --------------------------
-# Scaling Driver         : intel_pstate
-# Turbo Status           : ON
-# CPU Max Frequency      : 3.70 GHz
-# CPU Min Frequency      : 1.00 GHz
-# CPU Freq Governor      : performance
-# ========================= Software Information =========================
-# BIOS Release Date      : 03/10/2017
-# OS Version             : CentOS Linux release 7.3.1611 (Core)
-# Kernel Release Version : 3.10.0-514.el7.x86_64
-# Kernel Patch Version   : #1 SMP Tue Nov 22 16:42:41 UTC 2016
-# GCC Version            : 4.8.5 20150623 (Red Hat 4.8.5-11)
-# CMake Version          : 3.5.2
-# ------------------ Environment Variables Information -------------------
-# KMP_AFFINITY           : unset
-# OMP_DYNAMIC            : unset
-# OMP_NESTED             : unset
-# OMP_NUM_THREADS        : unset
-# MKL_NUM_THREADS        : unset
-# MKL_DYNAMIC            : unset
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import cProfile
-import time
-import os
-import traceback
-
-import numpy as np
-
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import paddle.fluid.profiler as profiler
-import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler
-
-from args import *
-
-
-def append_nccl2_prepare(trainer_id, startup_prog):
-    if trainer_id >= 0:
-        # append gen_nccl_id at the end of startup program
-        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
-        port = os.getenv("PADDLE_PSERVER_PORT")
-        worker_ips = os.getenv("PADDLE_TRAINER_IPS")
-        worker_endpoints = []
-        for ip in worker_ips.split(","):
-            worker_endpoints.append(':'.join([ip, port]))
-        num_trainers = len(worker_endpoints)
-        current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port
-        worker_endpoints.remove(current_endpoint)
-
-        nccl_id_var = startup_prog.global_block().create_var(
-            name="NCCLID",
-            persistable=True,
-            type=fluid.core.VarDesc.VarType.RAW)
-        startup_prog.global_block().append_op(
-            type="gen_nccl_id",
-            inputs={},
-            outputs={"NCCLID": nccl_id_var},
-            attrs={
-                "endpoint": current_endpoint,
-                "endpoint_list": worker_endpoints,
-                "trainer_id": trainer_id
-            })
-        return nccl_id_var, num_trainers, trainer_id
-    else:
-        raise Exception("must set positive PADDLE_TRAINER_ID env variables for "
-                        "nccl-based dist train.")
-
-
-def dist_transpile(trainer_id, args, train_prog, startup_prog):
-    if trainer_id < 0:
-        return None, None
-
-    # the port of all pservers, needed by both trainer and pserver
-    port = os.getenv("PADDLE_PSERVER_PORT", "6174")
-    # comma separated ips of all pservers, needed by trainer and
-    # pserver
-    pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
-    eplist = []
-    for ip in pserver_ips.split(","):
-        eplist.append(':'.join([ip, port]))
-    pserver_endpoints = ",".join(eplist)
-    # total number of workers/trainers in the job, needed by
-    # trainer and pserver
-    trainers = int(os.getenv("PADDLE_TRAINERS"))
-    # the IP of the local machine, needed by pserver only
-    current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
-    # the role, should be either PSERVER or TRAINER
-    training_role = os.getenv("PADDLE_TRAINING_ROLE")
-
-    config = fluid.DistributeTranspilerConfig()
-    config.slice_var_up = not args.no_split_var
-    config.min_block_size = 1048576
-    t = distribute_transpiler.DistributeTranspiler(config=config)
-
-    t.transpile(
-        trainer_id,
-        # NOTE: *MUST* use train_prog, for we are using with guard to
-        # generate different program for train and test.
-        program=train_prog,
-        pservers=pserver_endpoints,
-        trainers=trainers,
-        sync_mode=not args.async_mode,
-        startup_program=startup_prog)
-    if training_role == "PSERVER":
-        pserver_program = t.get_pserver_program(current_endpoint)
-        pserver_startup_program = t.get_startup_program(
-            current_endpoint, pserver_program, startup_program=startup_prog)
-        return pserver_program, pserver_startup_program
-    elif training_role == "TRAINER":
-        train_program = t.get_trainer_program()
-        return train_program, startup_prog
-    else:
-        raise ValueError(
-            'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
-        )
-
-
-def test_parallel(exe, test_args, args, test_prog, feeder):
-    acc_evaluators = []
-    for i in xrange(len(test_args[2])):
-        acc_evaluators.append(fluid.metrics.Accuracy())
-
-    to_fetch = [v.name for v in test_args[2]]
-    if args.use_reader_op:
-        test_args[4].start()
-        while True:
-            try:
-                acc_rets = exe.run(fetch_list=to_fetch)
-                for i, e in enumerate(acc_evaluators):
-                    e.update(
-                        value=np.array(acc_rets[i]), weight=args.batch_size)
-            except fluid.core.EOFException as eof:
-                test_args[4].reset()
-                break
-    else:
-        for batch_id, data in enumerate(test_args[3]()):
-            acc_rets = exe.run(feed=feeder.feed(data), fetch_list=to_fetch)
-            for i, e in enumerate(acc_evaluators):
-                e.update(value=np.array(acc_rets[i]), weight=len(data))
-
-    return [e.eval() for e in acc_evaluators]
-
-
-# NOTE: only need to benchmark using parallelexe
-def train_parallel(train_args, test_args, args, train_prog, test_prog,
-                   startup_prog, nccl_id_var, num_trainers, trainer_id):
-    over_all_start = time.time()
-    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
-    feeder = None
-    if not args.use_reader_op:
-        feed_var_list = [
-            var for var in train_prog.global_block().vars.itervalues()
-            if var.is_data
-        ]
-        feeder = fluid.DataFeeder(feed_var_list, place)
-    # generate fake:
-    if args.use_fake_data:
-        for var in feed_var_list:
-            v = startup_prog.global_block()._clone_variable(var)
-            var.persistable = True
-            v.persistable = True
-
-            real_shape = list(var.shape)
-            real_shape[0] = args.batch_size / args.gpus
-            startup_prog.global_block().append_op(
-                outputs={"Out": v},
-                type="fill_constant",
-                attrs={"shape": real_shape,
-                       "value": 1.0,
-                       "dtype": var.dtype})
-
-    if nccl_id_var and trainer_id == 0:
-        #FIXME(wuyi): wait other trainer to start listening
-        time.sleep(30)
-
-    startup_exe = fluid.Executor(place)
-    startup_exe.run(startup_prog)
-    strategy = fluid.ExecutionStrategy()
-    strategy.num_threads = args.cpus
-    strategy.allow_op_delay = False
-    build_strategy = fluid.BuildStrategy()
-    if args.reduce_strategy == "reduce":
-        build_strategy.reduce_strategy = fluid.BuildStrategy(
-        ).ReduceStrategy.Reduce
-    else:
-        build_strategy.reduce_strategy = fluid.BuildStrategy(
-        ).ReduceStrategy.AllReduce
-
-    avg_loss = train_args[0]
-
-    if args.update_method == "pserver":
-        # parameter server mode distributed training, merge
-        # gradients on local server, do not initialize
-        # ParallelExecutor with multi server all-reduce mode.
-        num_trainers = 1
-        trainer_id = 0
-
-    exe = fluid.ParallelExecutor(
-        True,
-        avg_loss.name,
-        main_program=train_prog,
-        exec_strategy=strategy,
-        build_strategy=build_strategy,
-        num_trainers=num_trainers,
-        trainer_id=trainer_id)
-
-    if not args.no_test:
-        if args.update_method == "pserver":
-            test_scope = None
-        else:
-            # NOTE: use an empty scope to avoid test exe using NCCLID
-            test_scope = fluid.Scope()
-        test_exe = fluid.ParallelExecutor(
-            True, main_program=test_prog, share_vars_from=exe)
-
-    for pass_id in range(args.pass_num):
-        num_samples = 0
-        iters = 0
-        start_time = time.time()
-        if not args.use_reader_op:
-            reader_generator = train_args[3]()  #train_reader
-        batch_id = 0
-        data = None
-        if args.use_reader_op:
-            train_args[4].start()
-        while True:
-            if not args.use_reader_op:
-                data = next(reader_generator, None)
-                if data == None:
-                    break
-            if args.profile and batch_id == 5:
-                profiler.start_profiler("All")
-                profiler.reset_profiler()
-            elif args.profile and batch_id == 10:
-                print("profiling total time: ", time.time() - start_time)
-                profiler.stop_profiler("total", "/tmp/profile_%d_pass%d" %
-                                       (trainer_id, pass_id))
-            if iters == args.iterations:
-                reader_generator.close()
-                break
-
-            if iters == args.skip_batch_num:
-                start_time = time.time()
-                num_samples = 0
-            fetch_list = [avg_loss.name]
-            acc_name_list = [v.name for v in train_args[2]]
-            fetch_list.extend(acc_name_list)
-
-            if args.use_fake_data or args.use_reader_op:
-                try:
-                    fetch_ret = exe.run(fetch_list)
-                except fluid.core.EOFException as eof:
-                    break
-                except fluid.core.EnforceNotMet as ex:
-                    traceback.print_exc()
-                    break
-            else:
-                fetch_ret = exe.run(fetch_list, feed=feeder.feed(data))
-            if args.use_reader_op:
-                num_samples += args.batch_size * args.gpus
-            else:
-                num_samples += len(data)
-
-            iters += 1
-            if batch_id % 1 == 0:
-                fetched_data = [np.mean(np.array(d)) for d in fetch_ret]
-                print("Pass %d, batch %d, loss %s, accucacys: %s" %
-                      (pass_id, batch_id, fetched_data[0], fetched_data[1:]))
-            batch_id += 1
-
-        print_train_time(start_time, time.time(), num_samples)
-        if args.use_reader_op:
-            train_args[4].reset()  # reset reader handle
-        else:
-            del reader_generator
-
-        if not args.no_test and test_args[2]:
-            test_feeder = None
-            if not args.use_reader_op:
-                test_feed_var_list = [
-                    var for var in test_prog.global_block().vars.itervalues()
-                    if var.is_data
-                ]
-                test_feeder = fluid.DataFeeder(test_feed_var_list, place)
-            test_ret = test_parallel(test_exe, test_args, args, test_prog,
-                                     test_feeder)
-            print("Pass: %d, Test Accuracy: %s\n" %
-                  (pass_id, [np.mean(np.array(v)) for v in test_ret]))
-
-    print("total train time: ", time.time() - over_all_start)
-
-
-def print_arguments(args):
-    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
-                                vars(args)['device'] == 'GPU')
-    print('----------- Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
-
-
-def print_train_time(start_time, end_time, num_samples):
-    train_elapsed = end_time - start_time
-    examples_per_sec = num_samples / train_elapsed
-    print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
-          (num_samples, train_elapsed, examples_per_sec))
-
-
-def print_paddle_envs():
-    print('----------- Configuration envs -----------')
-    for k in os.environ:
-        if "PADDLE_" in k:
-            print "ENV %s:%s" % (k, os.environ[k])
-    print('------------------------------------------------')
-
-
-def main():
-    """Parse the arguments, build the model programs and start the run.
-
-    With ``--update_method pserver`` the train program is transpiled and the
-    process acts as trainer or pserver depending on ``PADDLE_TRAINING_ROLE``,
-    then exits. Any other update method trains with ``train_parallel``.
-
-    Raises:
-        Exception: If transpiling yields no program, or if the device is CPU
-            on the non-pserver path.
-    """
-    args = parse_args()
-    print_arguments(args)
-    print_paddle_envs()
-    if args.no_random:
-        fluid.default_startup_program().random_seed = 1
-
-    # the unique trainer id, starting from 0, needed by trainer
-    # only
-    nccl_id_var, num_trainers, trainer_id = (
-        None, 1, int(os.getenv("PADDLE_TRAINER_ID", "0")))
-
-    # NOTE(review): the profiler is enabled here but this function never
-    # disables it or dumps its statistics.
-    if args.use_cprof:
-        pr = cProfile.Profile()
-        pr.enable()
-
-    # Import models.<name>; a non-empty fromlist makes __import__ return the
-    # submodule instead of the top-level package.
-    model_def = __import__("models.%s" % args.model, fromlist=["models"])
-
-    train_prog = fluid.Program()
-    test_prog = fluid.Program()
-    startup_prog = fluid.Program()
-
-    train_args = list(model_def.get_model(args, True, train_prog, startup_prog))
-    test_args = list(model_def.get_model(args, False, test_prog, startup_prog))
-
-    # Positional arguments for train_parallel, extended step by step below.
-    all_args = [train_args, test_args, args]
-
-    if args.update_method == "pserver":
-        train_prog, startup_prog = dist_transpile(trainer_id, args, train_prog,
-                                                  startup_prog)
-        if not train_prog:
-            raise Exception(
-                "Must configure correct environments to run dist train.")
-        all_args.extend([train_prog, test_prog, startup_prog])
-        # NOTE(review): a TRAINER with args.gpus <= 1 matches neither branch
-        # and reaches exit(0) without training - confirm this is intended.
-        if args.gpus > 1 and os.getenv("PADDLE_TRAINING_ROLE") == "TRAINER":
-            all_args.extend([nccl_id_var, num_trainers, trainer_id])
-            train_parallel(*all_args)
-        elif os.getenv("PADDLE_TRAINING_ROLE") == "PSERVER":
-            # start pserver with Executor
-            server_exe = fluid.Executor(fluid.CPUPlace())
-            server_exe.run(startup_prog)
-            server_exe.run(train_prog)
-        exit(0)
-
-    # for other update methods, use default programs
-    all_args.extend([train_prog, test_prog, startup_prog])
-
-    if args.update_method == "nccl2":
-        nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(
-            trainer_id, startup_prog)
-
-    if args.device == "CPU":
-        raise Exception("Only support GPU perf with parallel exe")
-    all_args.extend([nccl_id_var, num_trainers, trainer_id])
-    train_parallel(*all_args)
-
-
-# Run the benchmark only when the file is executed as a script.
-if __name__ == "__main__":
-    main()
--- a/benchmark/fluid/imagenet_reader.py
+++ b/benchmark/fluid/imagenet_reader.py
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import math
-import random
-import functools
-import numpy as np
-from threading import Thread
-import subprocess
-import time
-
-from Queue import Queue
-import paddle
-from PIL import Image, ImageEnhance
-
-# Seed the module-level RNG so shuffling and augmentation are repeatable.
-random.seed(0)
-
-# Edge length of the square crop produced by process_image.
-DATA_DIM = 224
-
-# Worker count and buffer size handed to paddle.reader.xmap_readers.
-THREAD = int(os.getenv("PREPROCESS_THREADS", "10"))
-BUF_SIZE = 5120
-
-# Dataset root and the list files used as reader defaults.
-DATA_DIR = '/mnt/ImageNet'
-TRAIN_LIST = '/mnt/ImageNet/train.txt'
-TEST_LIST = '/mnt/ImageNet/val.txt'
-
-# Per-channel normalisation constants, shaped (3, 1, 1) so they broadcast
-# over the channel-first image built in process_image.
-# NOTE(review): presumably the usual ImageNet RGB statistics - confirm.
-img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
-img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
-
-
-def resize_short(img, target_size):
-    percent = float(target_size) / min(img.size[0], img.size[1])
-    resized_width = int(round(img.size[0] * percent))
-    resized_height = int(round(img.size[1] * percent))
-    img = img.resize((resized_width, resized_height), Image.LANCZOS)
-    return img
-
-
-def crop_image(img, target_size, center):
-    width, height = img.size
-    size = target_size
-    if center == True:
-        w_start = (width - size) / 2
-        h_start = (height - size) / 2
-    else:
-        w_start = random.randint(0, width - size)
-        h_start = random.randint(0, height - size)
-    w_end = w_start + size
-    h_end = h_start + size
-    img = img.crop((w_start, h_start, w_end, h_end))
-    return img
-
-
-def random_crop(img, size, scale=[0.08, 1.0], ratio=[3. / 4., 4. / 3.]):
-    aspect_ratio = math.sqrt(random.uniform(*ratio))
-    w = 1. * aspect_ratio
-    h = 1. / aspect_ratio
-
-    bound = min((float(img.size[0]) / img.size[1]) / (w**2),
-                (float(img.size[1]) / img.size[0]) / (h**2))
-    scale_max = min(scale[1], bound)
-    scale_min = min(scale[0], bound)
-
-    target_area = img.size[0] * img.size[1] * random.uniform(scale_min,
-                                                             scale_max)
-    target_size = math.sqrt(target_area)
-    w = int(target_size * w)
-    h = int(target_size * h)
-
-    i = random.randint(0, img.size[0] - w)
-    j = random.randint(0, img.size[1] - h)
-
-    img = img.crop((i, j, i + w, j + h))
-    img = img.resize((size, size), Image.LANCZOS)
-    return img
-
-
-def rotate_image(img):
-    angle = random.randint(-10, 10)
-    img = img.rotate(angle)
-    return img
-
-
-def distort_color(img):
-    def random_brightness(img, lower=0.5, upper=1.5):
-        e = random.uniform(lower, upper)
-        return ImageEnhance.Brightness(img).enhance(e)
-
-    def random_contrast(img, lower=0.5, upper=1.5):
-        e = random.uniform(lower, upper)
-        return ImageEnhance.Contrast(img).enhance(e)
-
-    def random_color(img, lower=0.5, upper=1.5):
-        e = random.uniform(lower, upper)
-        return ImageEnhance.Color(img).enhance(e)
-
-    ops = [random_brightness, random_contrast, random_color]
-    random.shuffle(ops)
-
-    img = ops[0](img)
-    img = ops[1](img)
-    img = ops[2](img)
-
-    return img
-
-
-def process_image(sample, mode, color_jitter, rotate):
-    img_path = sample[0]
-
-    img = Image.open(img_path)
-    if mode == 'train':
-        if rotate: img = rotate_image(img)
-        img = random_crop(img, DATA_DIM)
-    else:
-        img = resize_short(img, target_size=256)
-        img = crop_image(img, target_size=DATA_DIM, center=True)
-    if mode == 'train':
-        if color_jitter:
-            img = distort_color(img)
-        if random.randint(0, 1) == 1:
-            img = img.transpose(Image.FLIP_LEFT_RIGHT)
-
-    if img.mode != 'RGB':
-        img = img.convert('RGB')
-
-    img = np.array(img).astype('float32').transpose((2, 0, 1)) / 255
-    img -= img_mean
-    img /= img_std
-
-    if mode == 'train' or mode == 'val':
-        return img, sample[1]
-    elif mode == 'test':
-        return [img]
-
-
-class XmapEndSignal():
-    pass
-
-
-def xmap_readers(mapper,
-                 reader,
-                 process_num,
-                 buffer_size,
-                 order=False,
-                 print_queue_state=True):
-    end = XmapEndSignal()
-
-    # define a worker to read samples from reader to in_queue
-    def read_worker(reader, in_queue):
-        for i in reader():
-            in_queue.put(i)
-        in_queue.put(end)
-
-    # define a worker to read samples from reader to in_queue with order flag
-    def order_read_worker(reader, in_queue, file_queue):
-        in_order = 0
-        for i in reader():
-            in_queue.put((in_order, i))
-            in_order += 1
-        in_queue.put(end)
-
-    # define a worker to handle samples from in_queue by mapper
-    # and put mapped samples into out_queue
-    def handle_worker(in_queue, out_queue, mapper):
-        sample = in_queue.get()
-        while not isinstance(sample, XmapEndSignal):
-            r = mapper(sample)
-            out_queue.put(r)
-            sample = in_queue.get()
-        in_queue.put(end)
-        out_queue.put(end)
-
-    # define a worker to handle samples from in_queue by mapper
-    # and put mapped samples into out_queue by order
-    def order_handle_worker(in_queue, out_queue, mapper, out_order):
-        ins = in_queue.get()
-        while not isinstance(ins, XmapEndSignal):
-            order, sample = ins
-            r = mapper(sample)
-            while order != out_order[0]:
-                pass
-            out_queue.put(r)
-            out_order[0] += 1
-            ins = in_queue.get()
-        in_queue.put(end)
-        out_queue.put(end)
-
-    def xreader():
-        file_queue = Queue()
-        in_queue = Queue(buffer_size)
-        out_queue = Queue(buffer_size)
-        out_order = [0]
-        # start a read worker in a thread
-        target = order_read_worker if order else read_worker
-        t = Thread(target=target, args=(reader, in_queue))
-        t.daemon = True
-        t.start()
-        # start several handle_workers
-        target = order_handle_worker if order else handle_worker
-        args = (in_queue, out_queue, mapper, out_order) if order else (
-            in_queue, out_queue, mapper)
-        workers = []
-        for i in xrange(process_num):
-            worker = Thread(target=target, args=args)
-            worker.daemon = True
-            workers.append(worker)
-        for w in workers:
-            w.start()
-
-        sample = out_queue.get()
-        start_t = time.time()
-        while not isinstance(sample, XmapEndSignal):
-            yield sample
-            sample = out_queue.get()
-            if time.time() - start_t > 3:
-                if print_queue_state:
-                    print("queue sizes: ", in_queue.qsize(), out_queue.qsize())
-                start_t = time.time()
-        finish = 1
-        while finish < process_num:
-            sample = out_queue.get()
-            if isinstance(sample, XmapEndSignal):
-                finish += 1
-            else:
-                yield sample
-
-    return xreader
-
-
-def _reader_creator(file_list,
-                    mode,
-                    shuffle=False,
-                    color_jitter=False,
-                    rotate=False,
-                    xmap=True):
-    def reader():
-        with open(file_list) as flist:
-            full_lines = [line.strip() for line in flist]
-            if shuffle:
-                random.shuffle(full_lines)
-            if mode == 'train':
-                trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
-                trainer_count = int(os.getenv("PADDLE_TRAINERS"))
-                per_node_lines = len(full_lines) / trainer_count
-                lines = full_lines[trainer_id * per_node_lines:(trainer_id + 1)
-                                   * per_node_lines]
-                print(
-                    "read images from %d, length: %d, lines length: %d, total: %d"
-                    % (trainer_id * per_node_lines, per_node_lines, len(lines),
-                       len(full_lines)))
-            else:
-                lines = full_lines
-
-            for line in lines:
-                if mode == 'train':
-                    img_path, label = line.split()
-                    img_path = img_path.replace("JPEG", "jpeg")
-                    img_path = os.path.join(DATA_DIR, "train", img_path)
-                    yield (img_path, int(label))
-                elif mode == 'val':
-                    img_path, label = line.split()
-                    img_path = img_path.replace("JPEG", "jpeg")
-                    img_path = os.path.join(DATA_DIR, "val", img_path)
-                    yield (img_path, int(label))
-                elif mode == 'test':
-                    img_path = os.path.join(DATA_DIR, line)
-                    yield [img_path]
-
-    mapper = functools.partial(
-        process_image, mode=mode, color_jitter=color_jitter, rotate=rotate)
-
-    return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE)
-
-
-def load_raw_image_uint8(sample):
-    img_arr = np.array(Image.open(sample[0])).astype('int64')
-    return img_arr, int(sample[1])
-
-
-def train_raw(file_list=TRAIN_LIST, shuffle=True):
-    def reader():
-        with open(file_list) as flist:
-            full_lines = [line.strip() for line in flist]
-            if shuffle:
-                random.shuffle(full_lines)
-
-            trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
-            trainer_count = int(os.getenv("PADDLE_TRAINERS"))
-            per_node_lines = len(full_lines) / trainer_count
-            lines = full_lines[trainer_id * per_node_lines:(trainer_id + 1) *
-                               per_node_lines]
-            print("read images from %d, length: %d, lines length: %d, total: %d"
-                  % (trainer_id * per_node_lines, per_node_lines, len(lines),
-                     len(full_lines)))
-
-            for line in lines:
-                img_path, label = line.split()
-                img_path = img_path.replace("JPEG", "jpeg")
-                img_path = os.path.join(DATA_DIR, "train", img_path)
-                yield (img_path, int(label))
-
-    return paddle.reader.xmap_readers(load_raw_image_uint8, reader, THREAD,
-                                      BUF_SIZE)
-
-
-def train(file_list=TRAIN_LIST, xmap=True):
-    return _reader_creator(
-        file_list,
-        'train',
-        shuffle=True,
-        color_jitter=False,
-        rotate=False,
-        xmap=xmap)
-
-
-def val(file_list=TEST_LIST, xmap=True):
-    return _reader_creator(file_list, 'val', shuffle=False, xmap=xmap)
-
-
-def test(file_list=TEST_LIST):
-    return _reader_creator(file_list, 'test', shuffle=False)
-
-
-if __name__ == "__main__":
-    c = 0
-    start_t = time.time()
-    for d in train()():
-        c += 1
-        if c >= 10000:
-            break
-    spent = time.time() - start_t
-    print("read 10000 speed: ", 10000 / spent, spent)
--- a/benchmark/fluid/kube_gen_job.py
+++ b/benchmark/fluid/kube_gen_job.py
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import yaml
-import copy
-import argparse
-import random
-import os
-import copy
-from kube_templates import pserver, trainer, envs
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description='Generate dist job yamls.')
-
-    parser.add_argument(
-        '--jobname', default="paddlejob", help='unique job name')
-    parser.add_argument(
-        '--cpu', default=1, type=int, help='CPU cores per trainer node')
-    parser.add_argument(
-        '--pscpu', default=1, type=int, help='CPU cores per pserver node')
-    parser.add_argument(
-        '--gpu', default=0, type=int, help='num of GPUs per node')
-    parser.add_argument(
-        '--image',
-        default="bootstrapper:5000/fluid_benchmark:gpu",
-        help='num of GPUs per node')
-    parser.add_argument(
-        '--pservers', default=1, type=int, help='num of pservers')
-    parser.add_argument(
-        '--trainers', default=1, type=int, help='num of trainers')
-    parser.add_argument('--memory', default=1, type=int, help='trainer memory')
-    parser.add_argument(
-        '--psmemory', default=1, type=int, help='pserver memory')
-    parser.add_argument(
-        '--port', default=30236, type=int, help='num of trainers')
-    parser.add_argument(
-        '--entry', default="python train.py", help='command to run')
-    parser.add_argument(
-        '--fluid', default=1, type=int, help='whether is fluid job')
-    parser.add_argument(
-        '--rdma', action='store_true', help='whether mount rdma libs')
-    parser.add_argument(
-        '--disttype',
-        default="pserver",
-        type=str,
-        choices=['pserver', 'nccl2', 'local'],
-        help='pserver or nccl2 or local')
-
-    args = parser.parse_args()
-    return args
-
-
-def gen_job():
-    ps = pserver
-    tn = trainer
-    args = parse_args()
-
-    ps_container = ps["spec"]["template"]["spec"]["containers"][0]
-    tn_container = tn["spec"]["template"]["spec"]["containers"][0]
-
-    if args.fluid == 1:
-        ps_container["command"] = \
-            ["paddle_k8s", "start_fluid"]
-        tn_container["command"] = \
-            ["paddle_k8s", "start_fluid"]
-    ps["metadata"]["name"] = args.jobname + "-pserver"
-    ps["spec"]["template"]["metadata"]["labels"][
-        "paddle-job-pserver"] = args.jobname
-    tn["metadata"]["name"] = args.jobname + "-trainer"
-    tn["spec"]["template"]["metadata"]["labels"]["paddle-job"] = args.jobname
-
-    ps_container["image"] = args.image
-    tn_container["image"] = args.image
-
-    ps_container["resources"]["requests"]["cpu"] = str(args.pscpu)
-    ps_container["resources"]["requests"]["memory"] = str(args.psmemory) + "Gi"
-    ps_container["resources"]["limits"]["cpu"] = str(args.pscpu)
-    ps_container["resources"]["limits"]["memory"] = str(args.psmemory) + "Gi"
-
-    tn_container["resources"]["requests"]["cpu"] = str(args.cpu)
-    tn_container["resources"]["requests"]["memory"] = str(args.memory) + "Gi"
-    tn_container["resources"]["limits"]["cpu"] = str(args.cpu)
-    tn_container["resources"]["limits"]["memory"] = str(args.memory) + "Gi"
-    if args.gpu > 0:
-        tn_container["resources"]["requests"][
-            "alpha.kubernetes.io/nvidia-gpu"] = str(args.gpu)
-        tn_container["resources"]["limits"][
-            "alpha.kubernetes.io/nvidia-gpu"] = str(args.gpu)
-
-    ps["spec"]["replicas"] = int(args.pservers)
-    tn["spec"]["parallelism"] = int(args.trainers)
-    tn["spec"]["completions"] = int(args.trainers)
-    ps_container["ports"][0]["name"] = "jobport-" + str(args.port)
-    ps_container["ports"][0]["containerPort"] = args.port
-    spreadport = random.randint(40000, 60000)
-    tn_container["ports"][0]["name"] = "spr-" + str(spreadport)
-    tn_container["ports"][0]["containerPort"] = spreadport
-
-    envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname})
-    envs.append({"name": "PADDLE_TRAINERS", "value": str(args.trainers)})
-    envs.append({"name": "PADDLE_PSERVERS", "value": str(args.pservers)})
-    envs.append({"name": "ENTRY", "value": args.entry})
-    envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
-    # NOTE: these directories below are cluster specific, please modify
-    # this settings before you run on your own cluster.
-    envs.append({
-        "name": "LD_LIBRARY_PATH",
-        "value":
-        "/usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind"
-    })
-
-    volumes = [{
-        "name": "nvidia-driver",
-        "hostPath": {
-            "path": "/usr/local/nvidia/lib64"
-        }
-    }]
-    volumeMounts = [{
-        "mountPath": "/usr/local/nvidia/lib64",
-        "name": "nvidia-driver"
-    }]
-
-    if args.rdma:
-        volumes.extend([{
-            "name": "ibetc",
-            "hostPath": {
-                "path": "/etc/libibverbs.d"
-            }
-        }, {
-            "name": "iblibs",
-            "hostPath": {
-                "path": "/usr/local/rdma"
-            }
-        }, {
-            "name": "valgrind",
-            "hostPath": {
-                "path": "/usr/lib64/mlnx_ofed/valgrind"
-            }
-        }])
-        volumeMounts.extend([{
-            "mountPath": "/etc/libibverbs.d",
-            "name": "ibetc"
-        }, {
-            "mountPath": "/usr/local/rdma",
-            "name": "iblibs"
-        }, {
-            "mountPath": "/usr/lib64/mlnx_ofed/valgrind",
-            "name": "valgrind"
-        }])
-        # append shm for NCCL2
-        volumes.append({"name": "dshm", "emptyDir": {"medium": "Memory"}})
-        volumeMounts.append({"mountPath": "/dev/shm", "name": "dshm"})
-
-    # add ceph volumes
-    volumes.append({
-        "name": "ceph-data",
-        "cephfs": {
-            "monitors": ["192.168.16.23:6789"],
-            "secretRef": {
-                "name": "ceph-secret"
-            },
-            "user": "admin",
-        }
-    })
-    volumeMounts.append({"mountPath": "/mnt/data", "name": "ceph-data"})
-
-    tn["spec"]["template"]["spec"]["volumes"] = volumes
-    tn_container["volumeMounts"] = volumeMounts
-
-    ps_container["env"] = copy.deepcopy(envs)
-    ps_container["env"].append({
-        "name": "PADDLE_TRAINING_ROLE",
-        "value": "PSERVER"
-    })
-    tn_container["env"] = envs
-    if args.disttype == "pserver":
-        tn_container["env"].append({
-            "name": "PADDLE_TRAINING_ROLE",
-            "value": "TRAINER"
-        })
-    elif args.disttype == "nccl2" or args.disttype == "local":
-        # NCCL2 have no training role, set to plain WORKER
-        tn_container["env"].append({
-            "name": "PADDLE_TRAINING_ROLE",
-            "value": "WORKER"
-        })
-
-    os.mkdir(args.jobname)
-    if args.disttype == "pserver":
-        with open("%s/pserver.yaml" % args.jobname, "w") as fn:
-            yaml.dump(ps, fn)
-
-    with open("%s/trainer.yaml" % args.jobname, "w") as fn:
-        yaml.dump(tn, fn)
-
-
-# Script entry point: generate the job YAML files when run directly.
-if __name__ == "__main__":
-    gen_job()
--- a/benchmark/fluid/kube_templates/__init__.py
+++ b/benchmark/fluid/kube_templates/__init__.py
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from pserver import pserver
-from trainer import trainer
-
-# Public names of this package: the two spec templates and the shared list
-# of container environment variables defined below.
-__all__ = ["pserver", "trainer", "envs"]
-
-# Base environment variables, as a list of Kubernetes-style env entries
-# (dicts with "name" plus either "value" or "valueFrom").
-envs = [
-    # envs that don't need to change
-    {
-        "name": "GLOG_v",
-        "value": "0"
-    },
-    {
-        "name": "GLOG_logtostderr",
-        "value": "1"
-    },
-    {
-        "name": "TOPOLOGY",
-        "value": ""
-    },
-    {
-        "name": "TRAINER_PACKAGE",
-        "value": "/workspace"
-    },
-    {
-        # NOTE(review): the NIC name is hard-coded; presumably cluster
-        # specific - confirm before reusing on another cluster.
-        "name": "PADDLE_INIT_NICS",
-        "value": "eth2"
-    },
-    # The entries below take their value from a pod field reference
-    # ("fieldRef") instead of a literal string.
-    {
-        "name": "NAMESPACE",
-        "valueFrom": {
-            "fieldRef": {
-                "fieldPath": "metadata.namespace"
-            }
-        }
-    },
-    {
-        "name": "POD_IP",
-        "valueFrom": {
-            "fieldRef": {
-                "fieldPath": "status.podIP"
-            }
-        }
-    },
-    {
-        # Same source field as POD_IP, exposed under a second name.
-        "name": "PADDLE_CURRENT_IP",
-        "valueFrom": {
-            "fieldRef": {
-                "fieldPath": "status.podIP"
-            }
-        }
-    }
-]
--- a/benchmark/fluid/kube_templates/pserver.py
+++ b/benchmark/fluid/kube_templates/pserver.py
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Template for the parameter-server ReplicaSet spec, kept as a plain dict.
-# The name, label, image, port, env and resource values here are
-# placeholders; presumably the job generator overwrites them before the
-# spec is dumped to YAML - verify against the caller.
-pserver = {
-    "apiVersion": "extensions/v1beta1",
-    "kind": "ReplicaSet",
-    "metadata": {
-        "name": "jobname-pserver"
-    },
-    "spec": {
-        "replicas": 1,
-        "template": {
-            "metadata": {
-                "labels": {
-                    "paddle-job-pserver": "jobname"
-                }
-            },
-            "spec": {
-                "hostNetwork": True,
-                "imagePullSecrets": [{
-                    "name": "job-registry-secret"
-                }],
-                # Single container; its fields are the ones meant to be
-                # customized per job.
-                "containers": [{
-                    "name": "pserver",
-                    "image": "",
-                    "imagePullPolicy": "Always",
-                    "ports": [{
-                        "name": "jobport-1",
-                        "containerPort": 1
-                    }],
-                    "env": [],
-                    "command": ["paddle_k8s", "start_pserver"],
-                    # Requests and limits are identical by default.
-                    "resources": {
-                        "requests": {
-                            "memory": "10Gi",
-                            "cpu": "4"
-                        },
-                        "limits": {
-                            "memory": "10Gi",
-                            "cpu": "4"
-                        }
-                    }
-                }]
-            }
-        }
-    }
-}
--- a/benchmark/fluid/kube_templates/trainer.py
+++ b/benchmark/fluid/kube_templates/trainer.py
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-trainer = {
-    "apiVersion": "batch/v1",
-    "kind": "Job",
-    "metadata": {
-        "name": "jobname-pserver"
-    },
-    "spec": {
-        "parallelism": 4,
-        "completions": 4,
-        "template": {
-            "metadata": {
-                "labels": {
-                    "paddle-job": "jobname"
-                }
-            },
-            "spec": {
-                "hostNetwork": True,
-                "imagePullSecrets": [{
-                    "name": "job-registry-secret"
-                }],
-                "restartPolicy": "Never",
-                "containers": [{
-                    "name": "trainer",
-                    "image": "",
-                    "imagePullPolicy": "Always",
-                    # to let container set rlimit
-                    "securityContext": {
-                        "privileged": True
-                        # TODO(wuyi): use below specific cap instead of privileged,
-                        # using privileged will cause all GPU device are visible
-                        # in the container.
-                        # "capabilities": {
-                        #     "add": ["SYS_RESOURCE"]
-                        # }
-                    },
-                    "ports": [{
-                        "name": "jobport-1",
-                        "containerPort": 1
-                    }],
-                    "env": [],
-                    "command": ["paddle_k8s", "start_trainer", "v2"],
-                    "resources": {
-                        "requests": {
-                            "memory": "10Gi",
-                            "cpu": "4",
-                        },
-                        "limits": {
-                            "memory": "10Gi",
-                            "cpu": "4",
-                        }
-                    }
-                }]
-            }
-        }
-    }
-}
--- a/benchmark/fluid/models/__init__.py
+++ b/benchmark/fluid/models/__init__.py
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Model modules exported by this package.
-# NOTE(review): a se_resnext module exists alongside these but is not
-# listed here - confirm whether that omission is intentional.
-__all__ = [
-    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm",
-    "resnet_with_preprocess"
-]
--- a/benchmark/fluid/models/machine_translation.py
+++ b/benchmark/fluid/models/machine_translation.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""seq2seq model for fluid."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import argparse
-import time
-import distutils.util
-
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import paddle.fluid.framework as framework
-from paddle.fluid.executor import Executor
-
-
-def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
-    def linear(inputs):
-        return fluid.layers.fc(input=inputs, size=size, bias_attr=True)
-
-    forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
-    input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
-    output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
-    cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t]))
-
-    cell_t = fluid.layers.sums(input=[
-        fluid.layers.elementwise_mul(
-            x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul(
-                x=input_gate, y=cell_tilde)
-    ])
-
-    hidden_t = fluid.layers.elementwise_mul(
-        x=output_gate, y=fluid.layers.tanh(x=cell_t))
-
-    return hidden_t, cell_t
-
-
-def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
-                   target_dict_dim, is_generating, beam_size, max_length):
-    """Construct a seq2seq network with a bi-LSTM encoder and attention decoder.
-
-    Args:
-        embedding_dim: width of the source and target word embeddings.
-        encoder_size: gate size of each encoder LSTM direction.
-        decoder_size: width of the decoder state and of the projections.
-        source_dict_dim: source vocabulary size.
-        target_dict_dim: target vocabulary size.
-        is_generating: when false, the training graph is built and returned.
-        beam_size: not used by this function.
-        max_length: not used by this function.
-
-    Returns:
-        ``(avg_cost, feeding_list)`` when ``is_generating`` is false, where
-        ``feeding_list`` names the three data layers. When ``is_generating``
-        is true no decoder is built and the function returns ``None``.
-    """
-
-    def bi_lstm_encoder(input_seq, gate_size):
-        # Linear transformation part for input gate, output gate, forget gate
-        # and cell activation vectors need be done outside of dynamic_lstm.
-        # So the output size is 4 times of gate_size.
-        input_forward_proj = fluid.layers.fc(input=input_seq,
-                                             size=gate_size * 4,
-                                             act=None,
-                                             bias_attr=False)
-        forward, _ = fluid.layers.dynamic_lstm(
-            input=input_forward_proj, size=gate_size * 4, use_peepholes=False)
-        # The reversed direction gets its own projection layer.
-        input_reversed_proj = fluid.layers.fc(input=input_seq,
-                                              size=gate_size * 4,
-                                              act=None,
-                                              bias_attr=False)
-        reversed, _ = fluid.layers.dynamic_lstm(
-            input=input_reversed_proj,
-            size=gate_size * 4,
-            is_reverse=True,
-            use_peepholes=False)
-        return forward, reversed
-
-    # ---- encoder ----
-    src_word_idx = fluid.layers.data(
-        name='source_sequence', shape=[1], dtype='int64', lod_level=1)
-
-    src_embedding = fluid.layers.embedding(
-        input=src_word_idx,
-        size=[source_dict_dim, embedding_dim],
-        dtype='float32')
-
-    src_forward, src_reversed = bi_lstm_encoder(
-        input_seq=src_embedding, gate_size=encoder_size)
-
-    # Concatenate both directions along the feature axis.
-    encoded_vector = fluid.layers.concat(
-        input=[src_forward, src_reversed], axis=1)
-
-    encoded_proj = fluid.layers.fc(input=encoded_vector,
-                                   size=decoder_size,
-                                   bias_attr=False)
-
-    # The decoder's initial hidden state is derived from the first step of
-    # the reversed encoder output.
-    backward_first = fluid.layers.sequence_pool(
-        input=src_reversed, pool_type='first')
-
-    decoder_boot = fluid.layers.fc(input=backward_first,
-                                   size=decoder_size,
-                                   bias_attr=False,
-                                   act='tanh')
-
-    def lstm_decoder_with_attention(target_embedding, encoder_vec, encoder_proj,
-                                    decoder_boot, decoder_size):
-        def simple_attention(encoder_vec, encoder_proj, decoder_state):
-            # Score each encoder position against the projected decoder
-            # state, normalize the scores per sequence, and return the
-            # weighted sum of the encoder vectors.
-            decoder_state_proj = fluid.layers.fc(input=decoder_state,
-                                                 size=decoder_size,
-                                                 bias_attr=False)
-            decoder_state_expand = fluid.layers.sequence_expand(
-                x=decoder_state_proj, y=encoder_proj)
-            concated = fluid.layers.concat(
-                input=[encoder_proj, decoder_state_expand], axis=1)
-            attention_weights = fluid.layers.fc(input=concated,
-                                                size=1,
-                                                act='tanh',
-                                                bias_attr=False)
-            attention_weights = fluid.layers.sequence_softmax(
-                input=attention_weights)
-            # ("weigths" is a misspelling of "weights" in the local name.)
-            weigths_reshape = fluid.layers.reshape(
-                x=attention_weights, shape=[-1])
-            scaled = fluid.layers.elementwise_mul(
-                x=encoder_vec, y=weigths_reshape, axis=0)
-            context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
-            return context
-
-        rnn = fluid.layers.DynamicRNN()
-
-        # Zero initial cell state, shaped like decoder_boot's batch.
-        cell_init = fluid.layers.fill_constant_batch_size_like(
-            input=decoder_boot,
-            value=0.0,
-            shape=[-1, decoder_size],
-            dtype='float32')
-        cell_init.stop_gradient = False
-
-        with rnn.block():
-            current_word = rnn.step_input(target_embedding)
-            encoder_vec = rnn.static_input(encoder_vec)
-            encoder_proj = rnn.static_input(encoder_proj)
-            hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
-            cell_mem = rnn.memory(init=cell_init)
-            context = simple_attention(encoder_vec, encoder_proj, hidden_mem)
-            decoder_inputs = fluid.layers.concat(
-                input=[context, current_word], axis=1)
-            h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, decoder_size)
-            rnn.update_memory(hidden_mem, h)
-            rnn.update_memory(cell_mem, c)
-            # Per-step softmax over the target vocabulary.
-            out = fluid.layers.fc(input=h,
-                                  size=target_dict_dim,
-                                  bias_attr=True,
-                                  act='softmax')
-            rnn.output(out)
-        return rnn()
-
-    # ---- training decoder and loss ----
-    if not is_generating:
-        trg_word_idx = fluid.layers.data(
-            name='target_sequence', shape=[1], dtype='int64', lod_level=1)
-
-        trg_embedding = fluid.layers.embedding(
-            input=trg_word_idx,
-            size=[target_dict_dim, embedding_dim],
-            dtype='float32')
-
-        prediction = lstm_decoder_with_attention(trg_embedding, encoded_vector,
-                                                 encoded_proj, decoder_boot,
-                                                 decoder_size)
-        label = fluid.layers.data(
-            name='label_sequence', shape=[1], dtype='int64', lod_level=1)
-        cost = fluid.layers.cross_entropy(input=prediction, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
-
-        # Names of the data layers created above, in feed order.
-        feeding_list = ["source_sequence", "target_sequence", "label_sequence"]
-
-        return avg_cost, feeding_list
-
-
-def lodtensor_to_ndarray(lod_tensor):
-    dims = lod_tensor.get_dims()
-    ndarray = np.zeros(shape=dims).astype('float32')
-    for i in xrange(np.product(dims)):
-        ndarray.ravel()[i] = lod_tensor.get_float_element(i)
-    return ndarray
-
-
-def get_model(args, is_train, main_prog, startup_prog):
-    if args.use_reader_op:
-        raise Exception("machine_translation do not support reader op for now.")
-    embedding_dim = 512
-    encoder_size = 512
-    decoder_size = 512
-    dict_size = 30000
-    beam_size = 3
-    max_length = 250
-
-    with fluid.program_guard(main_prog, startup_prog):
-        with fluid.unique_name.guard():
-            avg_cost, feeding_list = seq_to_seq_net(
-                embedding_dim,
-                encoder_size,
-                decoder_size,
-                dict_size,
-                dict_size,
-                False,
-                beam_size=beam_size,
-                max_length=max_length)
-    if is_train:
-        optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
-        optimizer.minimize(avg_cost)
-
-    batch_generator = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(dict_size)
-            if is_train else paddle.dataset.wmt14.test(dict_size),
-            buf_size=1000),
-        batch_size=args.batch_size * args.gpus)
-
-    return avg_cost, optimizer, [], batch_generator, None
--- a/benchmark/fluid/models/mnist.py
+++ b/benchmark/fluid/models/mnist.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import argparse
-import time
-import cProfile
-import os
-
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.profiler as profiler
-
-SEED = 1
-DTYPE = "float32"
-
-# random seed must set before configuring the network.
-# fluid.default_startup_program().random_seed = SEED
-
-
-def cnn_model(data):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
-        input=data,
-        filter_size=5,
-        num_filters=20,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
-        input=conv_pool_1,
-        filter_size=5,
-        num_filters=50,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
-
-    # TODO(dzhwinter) : refine the initializer and random seed settting
-    SIZE = 10
-    input_shape = conv_pool_2.shape
-    param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
-    scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
-
-    predict = fluid.layers.fc(
-        input=conv_pool_2,
-        size=SIZE,
-        act="softmax",
-        param_attr=fluid.param_attr.ParamAttr(
-            initializer=fluid.initializer.NormalInitializer(
-                loc=0.0, scale=scale)))
-    return predict
-
-
-def get_model(args, is_train, main_prog, startup_prog):
-    # NOTE: mnist is small, we don't implement data sharding yet.
-    opt = None
-    data_file_handle = None
-    with fluid.program_guard(main_prog, startup_prog):
-        if args.use_reader_op:
-            filelist = [
-                os.path.join(args.data_path, f)
-                for f in os.listdir(args.data_path)
-            ]
-            data_file_handle = fluid.layers.open_files(
-                filenames=filelist,
-                shapes=[[-1, 1, 28, 28], (-1, 1)],
-                lod_levels=[0, 0],
-                dtypes=["float32", "int64"],
-                thread_num=1,
-                pass_num=1)
-            data_file = fluid.layers.double_buffer(
-                fluid.layers.batch(
-                    data_file_handle, batch_size=args.batch_size))
-        with fluid.unique_name.guard():
-            if args.use_reader_op:
-                input, label = fluid.layers.read_file(data_file)
-            else:
-                images = fluid.layers.data(
-                    name='pixel', shape=[1, 28, 28], dtype='float32')
-                label = fluid.layers.data(
-                    name='label', shape=[1], dtype='int64')
-
-            predict = cnn_model(images)
-            cost = fluid.layers.cross_entropy(input=predict, label=label)
-            avg_cost = fluid.layers.mean(x=cost)
-            # Evaluator
-            batch_acc = fluid.layers.accuracy(input=predict, label=label)
-            # Optimization
-            if is_train:
-                opt = fluid.optimizer.AdamOptimizer(
-                    learning_rate=0.001, beta1=0.9, beta2=0.999)
-                opt.minimize(avg_cost)
-                if args.memory_optimize:
-                    fluid.memory_optimize(main_prog)
-
-    # Reader
-    if is_train:
-        reader = paddle.dataset.mnist.train()
-    else:
-        reader = paddle.dataset.mnist.test()
-    batched_reader = paddle.batch(
-        reader, batch_size=args.batch_size * args.gpus)
-    return avg_cost, opt, [batch_acc], batched_reader, data_file_handle
--- a/benchmark/fluid/models/resnet.py
+++ b/benchmark/fluid/models/resnet.py
--- a/benchmark/fluid/models/resnet_with_preprocess.py
+++ b/benchmark/fluid/models/resnet_with_preprocess.py
--- a/benchmark/fluid/models/se_resnext.py
+++ b/benchmark/fluid/models/se_resnext.py
--- a/benchmark/fluid/models/stacked_dynamic_lstm.py
+++ b/benchmark/fluid/models/stacked_dynamic_lstm.py
--- a/benchmark/fluid/models/vgg.py
+++ b/benchmark/fluid/models/vgg.py
--- a/benchmark/fluid/recordio_converter.py
+++ b/benchmark/fluid/recordio_converter.py
--- a/benchmark/fluid/run.sh
+++ b/benchmark/fluid/run.sh
--- a/benchmark/fluid/run_fluid_benchmark.sh
+++ b/benchmark/fluid/run_fluid_benchmark.sh
-#!/bin/bash
-
-PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device CPU --update_method pserver --iterations=10000 &
-
-sleep 15
-
-CUDA_VISIBLE_DEVICES=0,1 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 &
-
-CUDA_VISIBLE_DEVICES=2,3 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=1 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 &
--- a/benchmark/tensorflow/image/alexnet.py
+++ b/benchmark/tensorflow/image/alexnet.py
--- a/benchmark/tensorflow/image/alexnet_multi_gpu.py
+++ b/benchmark/tensorflow/image/alexnet_multi_gpu.py
--- a/benchmark/tensorflow/image/googlenet.py
+++ b/benchmark/tensorflow/image/googlenet.py
--- a/benchmark/tensorflow/image/googlenet_multi_gpu.py
+++ b/benchmark/tensorflow/image/googlenet_multi_gpu.py
--- a/benchmark/tensorflow/image/run.sh
+++ b/benchmark/tensorflow/image/run.sh
--- a/benchmark/tensorflow/image/run_multi.sh
+++ b/benchmark/tensorflow/image/run_multi.sh
--- a/benchmark/tensorflow/image/smallnet_mnist_cifar.py
+++ b/benchmark/tensorflow/image/smallnet_mnist_cifar.py
--- a/benchmark/tensorflow/machine_translation.py
+++ b/benchmark/tensorflow/machine_translation.py
--- a/benchmark/tensorflow/mnist.py
+++ b/benchmark/tensorflow/mnist.py
--- a/benchmark/tensorflow/resnet.py
+++ b/benchmark/tensorflow/resnet.py
--- a/benchmark/tensorflow/rnn/README.md
+++ b/benchmark/tensorflow/rnn/README.md
-You should also install tflearn:
-
-```bash
-pip install -r requirements.txt
-```
--- a/benchmark/tensorflow/rnn/reader.py
+++ b/benchmark/tensorflow/rnn/reader.py
--- a/benchmark/tensorflow/rnn/requirements.txt
+++ b/benchmark/tensorflow/rnn/requirements.txt
-tflearn
--- a/benchmark/tensorflow/rnn/rnn.py
+++ b/benchmark/tensorflow/rnn/rnn.py
--- a/benchmark/tensorflow/rnn/rnn_multi_gpu.py
+++ b/benchmark/tensorflow/rnn/rnn_multi_gpu.py
--- a/benchmark/tensorflow/rnn/run.sh
+++ b/benchmark/tensorflow/rnn/run.sh
--- a/benchmark/tensorflow/rnn/run_multi.sh
+++ b/benchmark/tensorflow/rnn/run_multi.sh
--- a/benchmark/tensorflow/stacked_dynamic_lstm.py
+++ b/benchmark/tensorflow/stacked_dynamic_lstm.py
--- a/benchmark/tensorflow/vgg.py
+++ b/benchmark/tensorflow/vgg.py