Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/faster_recordio

c3632b87 · yuyang18 · 15193c9e · 9503dbb1 · c3632b87 · c3632b87
48 changed file
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -4,6 +4,7 @@
 | backyes | Yan-Fei Wang |
 | baiyfbupt | Yi-Fan Bai |
 | beckett1124 | Bin Qi |
+| ChengduoZH | Cheng-Duo Zhao|
 | chengxiaohua1105 | Xiao-Hua Cheng |
 | cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
 | cxysteven | Xing-Yi Cheng |

--- a/benchmark/fluid/Dockerfile
+++ b/benchmark/fluid/Dockerfile
+FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
+# Remove the apt lists in the same layer that created them so they do not end up in the image.
+RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop \
+    && rm -rf /var/lib/apt/lists/*
+# Expose unversioned library names for the cuDNN / NCCL libraries shipped with the base image.
+RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so
+RUN pip install --no-cache-dir -U pip
+# The released paddlepaddle package is installed only to download the benchmark datasets at build
+# time; it is uninstalled again below and replaced by the local "whl" package.
+RUN pip install --no-cache-dir -U kubernetes opencv-python paddlepaddle
+# IMPORTANT:
+# Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
+# printf is used instead of echo because whether echo expands "\n" depends on the shell behind /bin/sh.
+RUN printf 'import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()\n' | python
+RUN printf 'import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()\n' | python
+RUN printf 'import paddle.v2 as paddle\npaddle.dataset.imikolov.fetch()\n' | python
+RUN pip uninstall -y paddlepaddle && mkdir /workspace
+# Remote files: the trailing slash makes Docker infer the file name from the URL and place it
+# inside the directory. These track the "develop" branch, so they cannot be pinned by checksum.
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin/
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root/
+COPY *.whl /
+RUN pip install --no-cache-dir /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s
+# Prepend instead of overwrite, so the library search path inherited from the CUDA base image is
+# kept (the base image is expected to define LD_LIBRARY_PATH).
+ENV LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH}
+COPY fluid_benchmark.py dataset.py /workspace/
+# Copying a directory copies its *contents*, so models/ needs its own destination to remain a
+# "models" directory next to fluid_benchmark.py (assumed to be loaded as a package -- confirm).
+COPY models/ /workspace/models/
--- a/benchmark/fluid/README.md
+++ b/benchmark/fluid/README.md
@@ -44,11 +44,25 @@ Currently supported `--model` argument include:
 ## Run Distributed Benchmark on Kubernetes Cluster
+You may need to build a Docker image before submitting a cluster job onto Kubernetes, or you will
+have to start all those processes manually on each node, which is not recommended.
+To build the Docker image, you need to choose a paddle "whl" package to run with. You may either
+download it from
+http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_en.html or
+build it on your own. Once you've got the "whl" package, put it under the current directory and run:
+```bash
+docker build -t [your docker image name]:[your docker image tag] .
+```
+Then push the image to a Docker registry that your Kubernetes cluster can reach.
 We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submit
 distributed benchmark jobs to your cluster. To generate a job yaml, just run:
 ```bash
-python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --parallel 1 --device GPU --update_method pserver " --disttype pserver
+python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --gpus 8 --device GPU --update_method pserver " --disttype pserver
 ```
 Then the yaml files are generated under directory `myjob`, you can run:

--- a/benchmark/fluid/run.sh
+++ b/benchmark/fluid/run.sh
@@ -37,7 +37,8 @@ nohup stdbuf -oL nvidia-smi \
      -l 1 &
 # mnist
 # mnist gpu mnist 128
-FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=mnist \
               --device=GPU \
               --batch_size=128 \
               --skip_batch_num=5 \
@@ -46,7 +47,8 @@ FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
 # vgg16
 # gpu cifar10 128
-FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=vgg16 \
               --device=GPU \
               --batch_size=128 \
               --skip_batch_num=5 \
@@ -54,7 +56,8 @@ FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
               2>&1 | tee -a vgg16_gpu_128.log
 # flowers gpu  128
-FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=vgg16 \
               --device=GPU \
               --batch_size=32 \
               --data_set=flowers \
@@ -64,40 +67,39 @@ FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
 # resnet50
 # resnet50 gpu cifar10 128
-FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=resnet50 \
               --device=GPU \
               --batch_size=128 \
               --data_set=cifar10 \
-               --model=resnet_cifar10 \
               --skip_batch_num=5 \
               --iterations=30 \
               2>&1 | tee -a resnet50_gpu_128.log
 # resnet50 gpu flowers 64
-FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=resnet50 \
               --device=GPU \
               --batch_size=64 \
               --data_set=flowers \
-               --model=resnet_imagenet \
               --skip_batch_num=5 \
               --iterations=30 \
               2>&1 | tee -a resnet50_gpu_flowers_64.log
 # lstm
 # lstm gpu imdb 32 # tensorflow only support batch=32
-FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=stacked_dynamic_lstm \
               --device=GPU \
               --batch_size=32 \
               --skip_batch_num=5 \
               --iterations=30 \
-               --hidden_dim=512 \
-               --emb_dim=512 \
-               --crop_size=1500 \
               2>&1 | tee -a lstm_gpu_32.log
 # seq2seq
 # seq2seq gpu wmb 128
-FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=machine_translation \
               --device=GPU \
               --batch_size=128 \
               --skip_batch_num=5 \

--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -1009,3 +1009,9 @@ ____
 ..  autofunction:: paddle.fluid.layers.upsampling_bilinear2d
    :noindex:
+gather
+____
+..  autofunction:: paddle.fluid.layers.gather
+    :noindex:
--- a/doc/fluid/howto/index_cn.rst
+++ b/doc/fluid/howto/index_cn.rst
@@ -4,5 +4,5 @@
 .. toctree::
  :maxdepth: 1
+  inference/index_cn.rst
  optimization/index_cn.rst
-  inference/inference_support_in_fluid.md
--- a/doc/fluid/howto/index_en.rst
+++ b/doc/fluid/howto/index_en.rst
@@ -5,4 +5,3 @@ HOW TO
  :maxdepth: 1
  optimization/index_en.rst
-  inference/inference_support_in_fluid.md
--- a/doc/fluid/howto/inference/build_and_install_lib_cn.rst
+++ b/doc/fluid/howto/inference/build_and_install_lib_cn.rst
+安装与编译C++预测库
+===========================
+直接下载安装
+-------------
+======================   ========================================
+版本说明                            C++预测库   
+======================   ========================================
+cpu_avx_mkl              `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/fluid.tgz>`_ 
+cpu_avx_openblas         `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/fluid.tgz>`_
+cpu_noavx_openblas       `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/fluid.tgz>`_
+cuda7.5_cudnn5_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
+cuda8.0_cudnn5_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
+cuda8.0_cudnn7_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/fluid.tgz>`_
+======================   ========================================
+从源码编译
+----------
+用户也可以从 PaddlePaddle 核心代码编译C++预测库，只需在编译时配置下面这些编译选项：
+=================   =========
+选项                 值   
+=================   =========
+CMAKE_BUILD_TYPE    Release
+FLUID_INSTALL_DIR   安装路径    
+WITH_FLUID_ONLY     ON（推荐）
+WITH_SWIG_PY        OFF（推荐）
+WITH_PYTHON         OFF（推荐）
+WITH_GPU            ON/OFF
+WITH_MKL            ON/OFF
+=================   =========
+建议按照推荐值设置，以避免链接不必要的库。其它可选编译选项按需进行设定。
+下面的代码片段从github拉取最新代码，配置编译选项（需要将PADDLE_ROOT替换为PaddlePaddle预测库的安装路径）：
+  .. code-block:: bash
+     pip install paddlepaddle-gpu
+     PADDLE_ROOT=/path/of/capi
+     git clone https://github.com/PaddlePaddle/Paddle.git
+     cd Paddle
+     mkdir build
+     cd build
+     cmake -DFLUID_INSTALL_DIR=$PADDLE_ROOT \
+           -DCMAKE_BUILD_TYPE=Release \
+           -DWITH_FLUID_ONLY=ON \
+           -DWITH_SWIG_PY=OFF \
+           -DWITH_PYTHON=OFF \
+           -DWITH_MKL=OFF \
+           -DWITH_GPU=OFF  \
+           ..
+     make
+     make inference_lib_dist
+成功编译后，使用C++预测库所需的依赖（包括：（1）编译出的PaddlePaddle预测库和头文件；（2）第三方链接库和头文件；（3）版本信息与编译选项信息）
+均会存放于PADDLE_ROOT目录中。目录结构如下：
+  .. code-block:: text
+     PaddleRoot/
+     ├── CMakeCache.txt
+     ├── paddle
+     │   └── fluid
+     │       ├── framework
+     │       ├── inference
+     │       ├── memory
+     │       ├── platform
+     │       ├── pybind
+     │       └── string
+     ├── third_party
+     │   ├── boost
+     │   │   └── boost
+     │   ├── eigen3
+     │   │   ├── Eigen
+     │   │   └── unsupported
+     │   └── install
+     │       ├── gflags
+     │       ├── glog
+     │       ├── mklml
+     │       ├── protobuf
+     │       ├── snappy
+     │       ├── snappystream
+     │       └── zlib
+     └── version.txt
+version.txt 中记录了该预测库的版本信息，包括Git Commit ID、使用OpenBlas或MKL数学库、CUDA/CUDNN版本号，如：
+  .. code-block:: text
+     GIT COMMIT ID: c95cd4742f02bb009e651a00b07b21c979637dc8
+     WITH_MKL: ON
+     WITH_GPU: ON
+     CUDA version: 8.0
+     CUDNN version: v5
--- a/doc/fluid/howto/inference/index_cn.rst
+++ b/doc/fluid/howto/inference/index_cn.rst
+预测库
+------------
+.. toctree::
+  :maxdepth: 1
+  build_and_install_lib_cn.rst
+  inference_support_in_fluid_cn.md
--- a/doc/fluid/howto/inference/inference_support_in_fluid.md
+++ b/doc/fluid/howto/inference/inference_support_in_fluid.md
-# Fluid Inference使用指南
+# 使用指南
 ## 目录：
 - Python Inference API
- 编译Fluid Inference库
 - Inference C++ API
 - Inference实例
 - Inference计算优化
@@ -55,62 +54,6 @@
    return [program, feed_target_names, fetch_targets]
  ```
-## 编译Fluid Inference库
-  - **不需要额外的CMake选项**
-    - 1、 配置CMake命令，更多配置请参考[源码编译PaddlePaddle](http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/build_from_source_cn.html)
-      ```bash
-      $ git clone https://github.com/PaddlePaddle/Paddle.git
-      $ cd Paddle
-      $ mkdir build
-      $ cd build
-      $ cmake -DCMAKE_INSTALL_PREFIX=your/path/to/paddle_inference_lib \
-          -DCMAKE_BUILD_TYPE=Release \
-          -DWITH_PYTHON=ON \
-          -DWITH_MKL=OFF \
-          -DWITH_GPU=OFF \
-          ..
-      ```
-    - 2、 编译PaddlePaddle
-      ```bash
-      $ make
-      ```
-    - 3、 部署。执行如下命令将PaddlePaddle Fluid Inference库部署到`your/path/to/paddle_inference_lib`目录。
-      ```bash
-      $ make inference_lib_dist
-      ```
- 目录结构
-  ```bash
-  $ cd your/path/to/paddle_inference_lib
-  $ tree
-  .
-  |-- paddle
-  |   `-- fluid
-  |       |-- framework
-  |       |-- inference
-  |       |   |-- io.h
-  |       |   `-- libpaddle_fluid.so
-  |       |-- memory
-  |       |-- platform
-  |       `-- string
-  |-- third_party
-  |   |-- eigen3
-  |   `-- install
-  |       |-- gflags
-  |       |-- glog
-  |       `-- protobuf
-  `-- ...
-  ```
-  假设`PADDLE_ROOT=your/path/to/paddle_inference_lib`。
 ## 链接Fluid Inference库
 - 示例项目([链接](https://github.com/luotao1/fluid_inference_example.git))

--- a/paddle/contrib/inference/paddle_inference_api.h
+++ b/paddle/contrib/inference/paddle_inference_api.h
@@ -40,14 +40,23 @@ struct PaddleBuf {
 struct PaddleTensor {
  std::string name;  // variable name.
  std::vector<int> shape;
+  // TODO(Superjomn) for LoD support, add a vector<vector<int>> field if needed.
  PaddleBuf data;  // blob of data.
  PaddleDType dtype;
 };
+// Identifies the inference engine a predictor is built on. It is the second
+// template argument of CreatePaddlePredictor; only kNative exists so far.
+enum class PaddleEngineKind {
+  kNative = 0,  // Use the native Fluid facility.
+  // TODO(Superjomn) support the following engines later.
+  // kAnakin,             // Use Anakin for inference.
+  // kTensorRT,           // Use TensorRT for inference.
+  // kAutoMixedAnakin,    // Automatically mix Fluid with Anakin.
+  // kAutoMixedTensorRT,  // Automatically mix Fluid with TensorRT.
+};
 /*
 * A simple Inference API for Paddle. Currently this API can be used by
 * non-sequence scenerios.
- * TODO(Superjomn) Support another API for NLP-related usages.
 */
 class PaddlePredictor {
 public:
@@ -69,15 +78,6 @@ class PaddlePredictor {
  // Destroy the Predictor.
  virtual ~PaddlePredictor() {}
-  enum class EngineKind {
-    kNative = -1,  // Use the native Fluid facility.
-    // TODO(Superjomn) support latter.
-    // kAnakin,             // Use Anakin for inference.
-    // kTensorRT,           // Use TensorRT for inference.
-    // kAutoMixedAnakin,    // Automatically mix Fluid with Anakin.
-    // kAutoMixedTensorRT,  // Automatically mix Fluid with TensorRT.
-  };
  // The common configs for all the predictors.
  struct Config {
    std::string model_dir;      // path to the model directory.
@@ -86,18 +86,24 @@ class PaddlePredictor {
 };
 struct NativeConfig : public PaddlePredictor::Config {
+  // GPU related fields.
  bool use_gpu{false};
-  int device;
+  int device{0};
-  float fraction_of_gpu_memory;
+  float fraction_of_gpu_memory{-1.f};  // Negative to notify initialization.
  std::string prog_file;
  std::string param_file;
-  bool share_variables;
 };
-// A factory to help create difference predictor.
+// A factory to help create different predictors.
-template <
+//
-    typename ConfigT,
+// FOR EXTENSION DEVELOPER:
-    PaddlePredictor::EngineKind engine = PaddlePredictor::EngineKind::kNative>
+// Different predictors are designated by config type and engine kind. Similar
+// configs can be merged, but there shouldn't be a huge config containing
+// different fields for more than one kind of predictors.
+//
+// Similarly, each engine kind should map to a unique predictor implementation.
+template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
 }  // namespace paddle
--- a/paddle/contrib/inference/paddle_inference_api_impl.cc
+++ b/paddle/contrib/inference/paddle_inference_api_impl.cc
@@ -57,8 +57,7 @@ std::string num2str(T a) {
 bool NativePaddlePredictor::Init() {
  VLOG(3) << "Predictor::init()";
-  // TODO(panyx0718): Should CPU vs GPU device be decided by id?
+  if (config_.use_gpu) {
-  if (config_.device >= 0) {
    place_ = paddle::platform::CUDAPlace(config_.device);
  } else {
    place_ = paddle::platform::CPUPlace();
@@ -85,11 +84,13 @@ bool NativePaddlePredictor::Init() {
  }
  ctx_ = executor_->Prepare(*inference_program_, 0);
-  // Create variables
+  // Create temporary variables first, so that the first batch do not need to
-  // TODO(panyx0718): Why need to test share_variables here?
+  // create variables in the runtime. This is the logics of the old inference
-  if (config_.share_variables) {
+  // API.
-    executor_->CreateVariables(*inference_program_, scope_.get(), 0);
+  // TODO(Superjomn) this should be modified when `Clone` is valid for
-  }
+  // multi-thread application.
+  executor_->CreateVariables(*inference_program_, scope_.get(), 0);
  // Get the feed_target_names and fetch_target_names
  feed_target_names_ = inference_program_->GetFeedTargetNames();
  fetch_target_names_ = inference_program_->GetFetchTargetNames();
@@ -124,7 +125,7 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
                                scope_.get(),
                                &feed_targets,
                                &fetch_targets,
-                                !config_.share_variables);
+                                false /* don't create variable each time */);
  if (!GetFetch(fetchs, output_data)) {
    LOG(ERROR) << "fail to get fetchs";
    return false;
@@ -242,11 +243,14 @@ bool NativePaddlePredictor::GetFetch(
 template <>
 std::unique_ptr<PaddlePredictor>
-CreatePaddlePredictor<NativeConfig, PaddlePredictor::EngineKind::kNative>(
+CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
    const NativeConfig &config) {
  VLOG(3) << "create NativePaddlePredictor";
  if (config.use_gpu) {
    // 1. GPU memeroy
+    PADDLE_ENFORCE(
+        config.fraction_of_gpu_memory > 0.f,
+        "fraction_of_gpu_memory in the config should be set to range (0., 1.]");
    std::vector<std::string> flags;
    if (config.fraction_of_gpu_memory >= 0.0f ||
        config.fraction_of_gpu_memory <= 0.95f) {

--- a/paddle/contrib/inference/test_paddle_inference_api_impl.cc
+++ b/paddle/contrib/inference/test_paddle_inference_api_impl.cc
@@ -47,7 +47,6 @@ NativeConfig GetConfig() {
  config.fraction_of_gpu_memory = 0.15;
  config.use_gpu = true;
  config.device = 0;
-  config.share_variables = true;
  return config;
 }

--- a/paddle/fluid/framework/block_desc.cc
+++ b/paddle/fluid/framework/block_desc.cc
@@ -200,7 +200,7 @@ BlockDesc::BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc)
    vars_[var_desc.name()].reset(new VarDesc(var_desc));
  }
  for (const proto::OpDesc &op_desc : desc_->ops()) {
-    ops_.emplace_back(new OpDesc(op_desc, prog, this));
+    ops_.emplace_back(new OpDesc(op_desc, this));
  }
 }
@@ -209,7 +209,7 @@ BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc,
    : prog_(prog), desc_(desc) {
  need_update_ = true;
  for (auto &op : other.ops_) {
-    ops_.emplace_back(new OpDesc(*op->Proto(), prog, this));
+    ops_.emplace_back(new OpDesc(*op, this));
  }
  for (auto &it : other.vars_) {
    auto *var = new VarDesc(*it.second);

--- a/paddle/fluid/framework/block_desc.h
+++ b/paddle/fluid/framework/block_desc.h
@@ -105,7 +105,7 @@ class BlockDesc {
  size_t OpSize() const { return ops_.size(); }
-  OpDesc *Op(int idx) { return ops_.at(idx).get(); }
+  OpDesc *Op(int idx) const { return ops_.at(idx).get(); }
  void Flush();

--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -11,11 +11,15 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
+#include <algorithm>
 #include <fstream>
+#include <string>
 #include <utility>
+#include <vector>
 #include "paddle/fluid/framework/details/broadcast_op_handle.h"
 #include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
 #include "paddle/fluid/framework/details/reduce_op_handle.h"
 #include "paddle/fluid/framework/details/rpc_op_handle.h"
 #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
@@ -26,9 +30,6 @@
 #include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
 #endif
-#include <string>
-#include <vector>
 DEFINE_string(ssa_graph_path, "/tmp/ssa_graph.dot",
              "the ssa graph path only print with GLOG_v=10,"
              "default /tmp/graph.dot");
@@ -148,9 +149,9 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp(
 std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
    const ProgramDesc &program) const {
-  std::unordered_map<std::string, proto::VarType::Type> var_types;
+  std::unordered_map<std::string, VarDesc *> all_vars;
  for (auto *var : program.Block(0).AllVars()) {
-    var_types[var->Name()] = var->GetType();
+    all_vars[var->Name()] = var;
  }
  auto graph = new SSAGraph();
@@ -167,12 +168,28 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
  auto send_vars = FindDistTrainSendVars(program);
  auto recv_vars = FindDistTrainRecvVars(program);
-  size_t cur_device_id = 0;
  std::vector<std::unordered_set<std::string>> var_name_on_devices;
  std::vector<std::unordered_set<std::string>> bcast_var_name_set;
  var_name_on_devices.resize(places_.size());
  bcast_var_name_set.resize(places_.size());
+  size_t cur_device_id = 0;
+  std::vector<int64_t> balance_grads(places_.size(), 0);
+  // Greedy balancing of gradients across devices: returns the index of the
+  // device that has been assigned the fewest gradient elements so far and
+  // charges the element count of `g_name` to that device.
+  auto get_appropriate_dev = [&](std::string &g_name) -> size_t {
+    // unordered_map::at throws if g_name is not a variable of block 0.
+    auto var_desc = all_vars.at(g_name);
+    PADDLE_ENFORCE_NOT_NULL(var_desc);
+    // Number of elements = product of the dimensions of the declared shape.
+    auto dim = framework::make_ddim(var_desc->GetShape());
+    int64_t numel = framework::product(dim);
+    // Reject a negative element count (presumably caused by an unknown
+    // dimension in the declared shape -- TODO confirm).
+    PADDLE_ENFORCE_GE(numel, 0);
+    // min_element returns the first minimum, so ties go to the lowest index.
+    auto smallest =
+        std::min_element(std::begin(balance_grads), std::end(balance_grads));
+    size_t dev_id =
+        static_cast<size_t>(std::distance(std::begin(balance_grads), smallest));
+    balance_grads[dev_id] += numel;
+    return dev_id;
+  };
  bool is_forwarding = true;
  for (auto *op : program.Block(0).AllOps()) {
    if (boost::get<int>(
@@ -220,13 +237,13 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
              switch (strategy_.reduce_) {
                case BuildStrategy::ReduceStrategy::kReduce:
+                  cur_device_id = get_appropriate_dev(g_name);
                  CreateReduceOp(&result, g_name, cur_device_id);
                  var_name_on_devices[cur_device_id].emplace(g_name);
                  bcast_var_name_set[cur_device_id].emplace(p_name);
-                  cur_device_id = (cur_device_id + 1) % places_.size();
                  break;
                case BuildStrategy::ReduceStrategy::kAllReduce:
-                  if (IsSparseGradient(var_types, g_name)) {
+                  if (IsSparseGradient(all_vars, g_name)) {
                    CreateReduceOp(&result, g_name, 0);
                    CreateBroadcastOp(&result, g_name, 0);
                  } else {
@@ -269,10 +286,10 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
 }
 bool MultiDevSSAGraphBuilder::IsSparseGradient(
-    const std::unordered_map<std::string, proto::VarType::Type> &var_types,
+    const std::unordered_map<std::string, VarDesc *> &all_vars,
    const std::string &og) const {
-  PADDLE_ENFORCE(var_types.count(og) != 0);
+  PADDLE_ENFORCE(all_vars.count(og) != 0);
-  if (var_types.at(og) == proto::VarType::SELECTED_ROWS) {
+  if (all_vars.at(og)->GetType() == proto::VarType::SELECTED_ROWS) {
    return true;
  }
  return false;

--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -106,7 +106,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
                         size_t src_dev_id) const;
  bool IsSparseGradient(
-      const std::unordered_map<std::string, proto::VarType::Type> &var_types,
+      const std::unordered_map<std::string, VarDesc *> &all_vars,
      const std::string &og) const;
 private:

--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -103,7 +103,7 @@ void OpDesc::CopyFrom(const OpDesc &op_desc) {
  need_update_ = true;
 }
-OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block)
+OpDesc::OpDesc(const proto::OpDesc &desc, BlockDesc *block)
    : desc_(desc), need_update_(false) {
  // restore inputs_
  int input_size = desc_.inputs_size();

--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@@ -33,13 +33,14 @@ class OpDesc {
  OpDesc(const std::string &type, const VariableNameMap &inputs,
         const VariableNameMap &outputs, const AttributeMap &attrs);
-  OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block);
+  OpDesc(const proto::OpDesc &desc, BlockDesc *block);
  explicit OpDesc(BlockDesc *block) : block_(block) {}
  OpDesc(const OpDesc &other, BlockDesc *block) {
    *this = other;
    block_ = block;
+    need_update_ = true;
  }
  void CopyFrom(const OpDesc &op_desc);

--- a/paddle/fluid/framework/program_desc.cc
+++ b/paddle/fluid/framework/program_desc.cc
@@ -51,12 +51,15 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) {
    auto *block = desc_.mutable_blocks(i);
    blocks_.emplace_back(new BlockDesc(*o.blocks_[i], block, this));
  }
-  for (auto &block : blocks_) {
+  for (size_t block_id = 0; block_id < blocks_.size(); ++block_id) {
-    for (auto *op : block->AllOps()) {
+    auto all_ops = blocks_[block_id]->AllOps();
-      for (const auto &attr : op->Proto()->attrs()) {
+    for (size_t op_id = 0; op_id < all_ops.size(); ++op_id) {
-        if (attr.type() == proto::AttrType::BLOCK) {
+      auto &op = all_ops[op_id];
-          size_t blk_idx = attr.block_idx();
+      for (const std::string &attr_name : op->AttrNames()) {
-          op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx));
+        if (op->GetAttrType(attr_name) == proto::AttrType::BLOCK) {
+          int sub_block_id =
+              o.Block(block_id).Op(op_id)->GetBlockAttr(attr_name);
+          op->SetBlockAttr(attr_name, MutableBlock(sub_block_id));
        }
      }
    }
@@ -86,6 +89,16 @@ ProgramDesc::ProgramDesc(const std::string &binary_str) {
  for (auto &block_desc : *desc_.mutable_blocks()) {
    blocks_.emplace_back(new BlockDesc(this, &block_desc));
  }
+  for (auto &block : blocks_) {
+    for (auto *op : block->AllOps()) {
+      for (const auto &attr : op->Proto()->attrs()) {
+        if (attr.type() == proto::AttrType::BLOCK) {
+          size_t blk_idx = attr.block_idx();
+          op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx));
+        }
+      }
+    }
+  }
 }
 const std::vector<std::string> ProgramDesc::GetFeedTargetNames() {

--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -8,3 +8,5 @@ nv_test(test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS
 nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
 nv_test(test_trt_mul_op SRCS test_mul_op.cc mul_op.cc
        DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
+nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
--- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
@@ -24,7 +24,7 @@ class ReluOpConverter : public OpConverter {
  void operator()(const framework::proto::OpDesc& op) override {
    // Here the two nullptr looks strange, that's because the
    // framework::OpDesc's constructor is strange.
-    framework::OpDesc op_desc(op, nullptr, nullptr);
+    framework::OpDesc op_desc(op, nullptr);
    LOG(INFO) << "convert a fluid relu op to tensorrt activation layer whose "
                 "type is Relu";
    const nvinfer1::ITensor* input_tensor =

--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
@@ -21,7 +21,8 @@ namespace tensorrt {
 class Conv2dOpConverter : public OpConverter {
 public:
  Conv2dOpConverter() {}
-  void operator()(const framework::proto::OpDesc& op) override {
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope) override {
    LOG(INFO)
        << "convert a fluid conv2d op to tensorrt conv layer without bias";
  }

--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/engine.h"
+#include "paddle/fluid/platform/place.h"
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+// Reorder the elements from istrides to ostrides, borrowed from TRT convert in
+// tensorflow.
+// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/tensorrt/convert/convert_nodes.cc#L318
+//
+// For every (h, w) inside `shape`, the element located at
+// h * istrides.h() + w * istrides.w() in `idata` is written to
+// h * ostrides.h() + w * ostrides.w() in `odata`. `idata` and `odata` must not
+// alias, otherwise elements are overwritten before they are read.
+template <typename T>
+void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides,
+              T* odata, nvinfer1::DimsHW ostrides) {
+  for (int h = 0; h < shape.h(); ++h) {
+    for (int w = 0; w < shape.w(); ++w) {
+      // The source must be addressed with the *input* strides; addressing it
+      // with the output strides degenerates into a plain copy.
+      odata[h * ostrides.h() + w * ostrides.w()] =
+          idata[h * istrides.h() + w * istrides.w()];
+    }
+  }
+}
+// Reorder the data layout from CK to KC, i.e. transpose a 2-D weight whose
+// dims are {C, K} into one whose dims are {K, C}. The result is written into
+// the buffer already referenced by `oweights`; nothing is allocated here.
+void ReorderCKtoKC(TensorRTEngine::Weight& iweights,
+                   TensorRTEngine::Weight* oweights) {
+  const int num_c = iweights.dims[0];
+  const int num_k = iweights.dims[1];
+  oweights->dims.assign({num_k, num_c});
+  const auto* src = static_cast<float const*>(iweights.get().values);
+  auto* dst = static_cast<float*>(const_cast<void*>(oweights->get().values));
+  // Walk the K x C output; the input is addressed as C x K.
+  nvinfer1::DimsHW src_strides = {1, num_k};
+  nvinfer1::DimsHW dst_strides = {num_c, 1};
+  Reorder2({num_k, num_c}, src, src_strides, dst, dst_strides);
+}
+/*
+ * FC converter convert a MUL op in Fluid to a FC layer in TRT.
+ *
+ * The op must have exactly one "X" input, one "Y" input (the weight, looked up
+ * in `scope`) and one "Out" output. The weight tensor stored in the scope is
+ * reordered in place from CK to KC layout.
+ */
+class FcOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope) override {
+    VLOG(4) << "convert a fluid fc op to tensorrt fc layer without bias";
+
+    framework::OpDesc op_desc(op, nullptr);
+    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+    PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is a weight
+    PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+
+    // Declare inputs
+    auto* X = engine_->GetITensor(op_desc.Input("X").front());
+
+    // Declare weights
+    auto* Y_v = scope.FindVar(op_desc.Input("Y").front());
+    PADDLE_ENFORCE_NOT_NULL(Y_v);
+    auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
+    // This may trigger a GPU->CPU copy, because TRT's weight can only be
+    // assigned from CPU memory, that can't be avoided.
+    auto* weight_data = Y_t->mutable_data<float>(platform::CPUPlace());
+    PADDLE_ENFORCE_EQ(Y_t->dims().size(), 2UL);  // a matrix
+    size_t n_output = Y_t->dims()[1];
+
+    // Stage a copy of the weight: it is the source of the reorder below,
+    // whose destination is the tensor's own buffer.
+    framework::LoDTensor tmp;
+    tmp.Resize(Y_t->dims());
+    // memcpy takes a length in bytes, so scale the element count by the
+    // element size.
+    memcpy(tmp.mutable_data<float>(platform::CPUPlace()), Y_t->data<float>(),
+           Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float));
+
+    TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
+                                  static_cast<void*>(weight_data),
+                                  Y_t->memory_size() / sizeof(float)};
+    TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT,
+                                      static_cast<void*>(tmp.data<float>()),
+                                      Y_t->memory_size() / sizeof(float));
+    weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]});
+    tmp_weight.dims = weight.dims;
+
+    // The data layout of TRT FC layer's weight is different from fluid's FC,
+    // need to reorder the elements.
+    ReorderCKtoKC(tmp_weight, &weight);
+
+    // Currently, the framework can only handle one fluid op -> one TRT layer,
+    // but fc fuses `mul` and `bias` (2 fluid ops), so here is a trick, just
+    // handle `mul`, leave `add` as another layer.
+    // An empty weight stands in for the bias the layer API asks for.
+    TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
+
+    auto* layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected,
+                                       *const_cast<nvinfer1::ITensor*>(X),
+                                       n_output, weight.get(), bias.get());
+
+    auto output_name = op_desc.Output("Out").front();
+    engine_->DeclareOutput(layer, 0, output_name);
+  }
+};
+REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter);
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+USE_OP(mul);
--- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
@@ -24,10 +24,11 @@ namespace tensorrt {
 class MulOpConverter : public OpConverter {
 public:
  MulOpConverter() {}
-  void operator()(const framework::proto::OpDesc& op) override {
+  void operator()(const framework::proto::OpDesc& op,
-    VLOG(4) << "convert a fluid mul op to tensorrt fc layer without bias";
+                  const framework::Scope& scope) override {
+    VLOG(4) << "convert a fluid mul op to tensorrt mul layer without bias";
-    framework::OpDesc op_desc(op, nullptr, nullptr);
+    framework::OpDesc op_desc(op, nullptr);
    // Declare inputs
    auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
    auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]);

--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -31,27 +31,42 @@ namespace tensorrt {
 class OpConverter {
 public:
  OpConverter() {}
-  virtual void operator()(const framework::proto::OpDesc& op) {}
-  void Run(const framework::proto::OpDesc& op, TensorRTEngine* engine) {
+  // Converter logic for an op.
-    std::string type = op.type();
+  virtual void operator()(const framework::proto::OpDesc& op,
-    auto* it = Registry<OpConverter>::Lookup(type);
+                          const framework::Scope& scope) {}
-    PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", type);
-    it->SetEngine(engine);
+  // Convert a single fluid operator and add the corresponding layer to TRT.
-    (*it)(op);
+  void ConvertOp(const framework::proto::OpDesc& op,
-  }
+                 const std::unordered_set<std::string>& parameters,
+                 const framework::Scope& scope, TensorRTEngine* engine) {
+    framework::OpDesc op_desc(op, nullptr);
+    OpConverter* it{nullptr};
-  // convert fluid op to tensorrt layer
+    if (op_desc.Type() == "mul") {
-  void ConvertOp(const framework::proto::OpDesc& op, TensorRTEngine* engine) {
+      PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
-    OpConverter::Run(op, engine);
+      std::string Y = op_desc.Input("Y")[0];
+      if (parameters.count(Y)) {
+        it = Registry<OpConverter>::Lookup("fc");
+      }
+    }
+    if (!it) {
+      it = Registry<OpConverter>::Lookup(op_desc.Type());
+    }
+    PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
+                            op_desc.Type());
+    it->SetEngine(engine);
+    (*it)(op, scope);
  }
  // convert fluid block to tensorrt network
  void ConvertBlock(const framework::proto::BlockDesc& block,
-                    TensorRTEngine* engine) {
+                    const std::unordered_set<std::string>& parameters,
+                    const framework::Scope& scope, TensorRTEngine* engine) {
    for (int i = 0; i < block.ops_size(); i++) {
      const auto& op = block.ops(i);
-      OpConverter::Run(op, engine);
+      ConvertOp(op, parameters, scope, engine);
    }
  }

--- a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+TEST(fc_op, test) {
+  // Listing "mul-Y" as a parameter makes OpConverter::ConvertOp pick the fc
+  // converter for the `mul` op described below.
+  framework::Scope scope;
+  std::unordered_set<std::string> parameters({"mul-Y"});
+  TRTConvertValidation checker(20, parameters, scope, 1000);
+  // Variables consumed and produced by the op.
+  checker.DeclInputVar("mul-X", nvinfer1::Dims4(8, 3, 1, 1));
+  checker.DeclParamVar("mul-Y", nvinfer1::Dims2(3, 2));
+  checker.DeclOutputVar("mul-Out", nvinfer1::Dims2(8, 2));
+  // Describe the fluid `mul` op under test.
+  framework::OpDesc mul_desc;
+  mul_desc.SetType("mul");
+  mul_desc.SetInput("X", {"mul-X"});
+  mul_desc.SetInput("Y", {"mul-Y"});
+  mul_desc.SetOutput("Out", {"mul-Out"});
+  checker.SetOp(*mul_desc.Proto());
+  // Run the fluid op and the TRT engine, then compare their outputs.
+  checker.Execute(10);
+}
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
@@ -21,7 +21,9 @@ namespace inference {
 namespace tensorrt {
 TEST(MulOpConverter, main) {
-  TRTConvertValidation validator(10, 1000);
+  framework::Scope scope;
+  std::unordered_set<std::string> parameters;
+  TRTConvertValidation validator(10, parameters, scope, 1000);
  validator.DeclInputVar("mul-X", nvinfer1::Dims2(10, 6));
  validator.DeclInputVar("mul-Y", nvinfer1::Dims2(6, 10));
  validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(10, 10));

--- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
@@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include <gtest/gtest.h>
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 namespace paddle {
 namespace inference {
@@ -27,7 +28,9 @@ TEST(OpConverter, ConvertBlock) {
  conv2d_op->SetType("conv2d");
  OpConverter converter;
-  converter.ConvertBlock(*block->Proto(), nullptr /*TensorRTEngine*/);
+  framework::Scope scope;
+  converter.ConvertBlock(*block->Proto(), {}, scope,
+                         nullptr /*TensorRTEngine*/);
 }
 }  // namespace tensorrt

--- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
@@ -61,7 +61,10 @@ class TRTConvertValidation {
 public:
  TRTConvertValidation() = delete;
-  explicit TRTConvertValidation(int batch_size, int workspace_size = 1024) {
+  TRTConvertValidation(int batch_size,
+                       const std::unordered_set<std::string>& parameters,
+                       framework::Scope& scope, int workspace_size = 1 << 10)
+      : parameters_(parameters), scope_(scope) {
    // create engine.
    engine_.reset(new TensorRTEngine(10, 1 << 10, &stream_));
    engine_->InitNetwork();
@@ -76,19 +79,22 @@ class TRTConvertValidation {
    engine_->DeclareInput(name, nvinfer1::DataType::kFLOAT, dims);
  }
+  // Declare a parameter variable in the scope.
+  void DeclParamVar(const std::string& name, const nvinfer1::Dims& dims) {
+    DeclVar(name, dims);
+  }
  void DeclOutputVar(const std::string& name, const nvinfer1::Dims& dims) {
    DeclVar(name, dims);
  }
+  // Declare a variable in a fluid Scope.
  void DeclVar(const std::string& name, const nvinfer1::Dims& dims) {
    platform::CPUPlace place;
    platform::CPUDeviceContext ctx(place);
    // Init Fluid tensor.
-    std::vector<int> dim_vec(dims.nbDims);
+    std::vector<int> dim_vec(dims.d, dims.d + dims.nbDims);
-    for (int i = 0; i < dims.nbDims; i++) {
-      dim_vec[i] = dims.d[i];
-    }
    auto* x = scope_.Var(name);
    auto* x_tensor = x->GetMutable<framework::LoDTensor>();
    x_tensor->Resize(framework::make_ddim(dim_vec));
@@ -99,20 +105,22 @@ class TRTConvertValidation {
    op_ = framework::OpRegistry::CreateOp(desc);
    OpConverter op_converter;
-    op_converter.ConvertOp(desc, engine_.get());
+    op_converter.ConvertOp(desc, parameters_, scope_, engine_.get());
    engine_->FreezeNetwork();
    // Declare outputs.
-    op_desc_.reset(new framework::OpDesc(desc, nullptr, nullptr));
+    op_desc_.reset(new framework::OpDesc(desc, nullptr));
    // Set Inputs.
    for (const auto& input : op_desc_->InputArgumentNames()) {
+      if (parameters_.count(input)) continue;
      auto* var = scope_.FindVar(input);
      PADDLE_ENFORCE(var);
      auto tensor = var->GetMutable<framework::LoDTensor>();
      engine_->SetInputFromCPU(
-          input, static_cast<void*>(tensor->data<float>()),
+          input, static_cast<void*>(tensor->data<void>()),
          sizeof(float) *
              analysis::AccuDims(tensor->dims(), tensor->dims().size()));
    }
@@ -120,18 +128,21 @@ class TRTConvertValidation {
  void Execute(int batch_size) {
    // Execute Fluid Op
-    // Execute TRT
    platform::CPUPlace place;
    platform::CPUDeviceContext ctx(place);
-    engine_->Execute(batch_size);
    op_->Run(scope_, place);
+    // Execute TRT.
+    engine_->Execute(batch_size);
+    cudaStreamSynchronize(*engine_->stream());
    ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
+    const size_t output_space_size = 200;
    for (const auto& output : op_desc_->OutputArgumentNames()) {
      std::vector<float> fluid_out;
-      std::vector<float> trt_out(200);
+      std::vector<float> trt_out(output_space_size);
-      engine_->GetOutputInCPU(output, &trt_out[0], 200 * sizeof(float));
+      engine_->GetOutputInCPU(output, &trt_out[0],
+                              output_space_size * sizeof(float));
+      cudaStreamSynchronize(*engine_->stream());
      auto* var = scope_.FindVar(output);
      auto tensor = var->GetMutable<framework::LoDTensor>();
@@ -139,7 +150,7 @@ class TRTConvertValidation {
      // Compare two output
      ASSERT_FALSE(fluid_out.empty());
      for (size_t i = 0; i < fluid_out.size(); i++) {
-        EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 0.001);
+        EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 1e-6);
      }
    }
  }
@@ -149,9 +160,10 @@ class TRTConvertValidation {
 private:
  std::unique_ptr<TensorRTEngine> engine_;
  cudaStream_t stream_;
-  framework::Scope scope_;
  std::unique_ptr<framework::OperatorBase> op_;
  std::unique_ptr<framework::OpDesc> op_desc_;
+  const std::unordered_set<std::string>& parameters_;
+  framework::Scope& scope_;
 };
 }  // namespace tensorrt

--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -106,6 +106,7 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset,
                    name);
  auto* output = layer->getOutput(offset);
+  SetITensor(name, output);
  PADDLE_ENFORCE(output != nullptr);
  output->setName(name.c_str());
  infer_network_->markOutput(*output);

--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -37,13 +37,15 @@ class TensorRTEngine : public EngineBase {
  // Weight is model parameter.
  class Weight {
   public:
-    Weight(nvinfer1::DataType dtype, void* value, int num_elem) {
+    Weight(nvinfer1::DataType dtype, void* value, size_t num_elem) {
      w_.type = dtype;
      w_.values = value;
      w_.count = num_elem;
    }
    const nvinfer1::Weights& get() { return w_; }
+    std::vector<int64_t> dims;
   private:
    nvinfer1::Weights w_;
  };

--- a/paddle/fluid/operators/bilinear_interp_op.cc
+++ b/paddle/fluid/operators/bilinear_interp_op.cc
@@ -34,9 +34,22 @@ class BilinearInterpOp : public framework::OperatorWithKernel {
    int out_w = ctx->Attrs().Get<int>("out_w");
    PADDLE_ENFORCE_EQ(dim_x.size(), 4, "X's dimension must be 4");
+    if (ctx->HasInput("OutSize")) {
+      auto out_size_dim = ctx->GetInputDim("OutSize");
+      PADDLE_ENFORCE_EQ(out_size_dim.size(), 1,
+                        "OutSize's dimension size must be 1");
+      PADDLE_ENFORCE_EQ(out_size_dim[0], 2, "OutSize's dim[0] must be 2");
+    }
    std::vector<int64_t> dim_out({dim_x[0], dim_x[1], out_h, out_w});
    ctx->SetOutputDim("Out", framework::make_ddim(dim_out));
  }
+ protected:
+  // Chooses the kernel from the data type of input "X" and the place of the
+  // execution context.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace());
+  }
 };
 class BilinearInterpOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -45,6 +58,10 @@ class BilinearInterpOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X",
             "(Tensor) The input tensor of bilinear interpolation, "
             "This is a 4-D tensor with shape of (N x C x h x w)");
+    AddInput("OutSize",
+             "(Tensor) This is a 1-D tensor with two number. "
+             "The first number is height and the second number is width.")
+        .AsDispensable();
    AddOutput("Out",
              "(Tensor) The dimension of output is (N x C x out_h x out_w]");
@@ -78,6 +95,12 @@ class BilinearInterpOpGrad : public framework::OperatorWithKernel {
      ctx->SetOutputDim(framework::GradVarName("X"), dim_x);
    }
  }
+  // Chooses the gradient kernel from the data type of input "X" and the place
+  // of the execution context.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace());
+  }
 };
 }  // namespace operators

--- a/paddle/fluid/operators/bilinear_interp_op.cu
+++ b/paddle/fluid/operators/bilinear_interp_op.cu
@@ -102,10 +102,21 @@ class BilinearInterpOpCUDAKernel : public framework::OpKernel<T> {
    auto* input_t = ctx.Input<Tensor>("X");      // float tensor
    auto* output_t = ctx.Output<Tensor>("Out");  // float tensor
    auto* input = input_t->data<T>();
-    auto* output = output_t->mutable_data<T>(ctx.GetPlace());
    int out_h = ctx.Attr<int>("out_h");
    int out_w = ctx.Attr<int>("out_w");
+    auto out_dims = output_t->dims();
+    auto out_size_t = ctx.Input<Tensor>("OutSize");
+    if (out_size_t != nullptr) {
+      Tensor sizes;
+      framework::TensorCopy(*out_size_t, platform::CPUPlace(), &sizes);
+      auto size_data = sizes.data<int>();
+      out_h = size_data[0];
+      out_w = size_data[1];
+    }
+    auto* output = output_t->mutable_data<T>(
+        {out_dims[0], out_dims[1], out_h, out_w}, ctx.GetPlace());
    int batch_size = input_t->dims()[0];
    int channels = input_t->dims()[1];
    int in_h = input_t->dims()[2];
@@ -139,8 +150,8 @@ class BilinearInterpGradOpCUDAKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* d_input_t = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto* d_output_t = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* d_input = d_input_t->mutable_data<T>(ctx.GetPlace());
    auto* d_output = d_output_t->data<T>();
+    auto* d_input = d_input_t->mutable_data<T>(ctx.GetPlace());
    auto& device_ctx =
        ctx.template device_context<platform::CUDADeviceContext>();
@@ -149,6 +160,16 @@ class BilinearInterpGradOpCUDAKernel : public framework::OpKernel<T> {
    int out_h = ctx.Attr<int>("out_h");
    int out_w = ctx.Attr<int>("out_w");
+    auto out_size_t = ctx.Input<Tensor>("OutSize");
+    if (out_size_t != nullptr) {
+      Tensor sizes;
+      framework::TensorCopy(*out_size_t, platform::CPUPlace(), &sizes);
+      auto size_data = sizes.data<int>();
+      out_h = size_data[0];
+      out_w = size_data[1];
+    }
    int batch_size = d_input_t->dims()[0];
    int channels = d_input_t->dims()[1];
    int in_h = d_input_t->dims()[2];

--- a/paddle/fluid/operators/bilinear_interp_op.h
+++ b/paddle/fluid/operators/bilinear_interp_op.h
@@ -24,11 +24,18 @@ class BilinearInterpKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input_t = ctx.Input<Tensor>("X");      // float tensor
    auto* output_t = ctx.Output<Tensor>("Out");  // float tensor
+    auto out_dims = output_t->dims();
    auto* input = input_t->data<T>();
-    auto* output = output_t->mutable_data<T>(ctx.GetPlace());
    int out_h = ctx.Attr<int>("out_h");
    int out_w = ctx.Attr<int>("out_w");
+    auto out_size_t = ctx.Input<Tensor>("OutSize");
+    if (out_size_t != nullptr) {
+      auto out_size_data = out_size_t->data<int>();
+      out_h = out_size_data[0];
+      out_w = out_size_data[1];
+    }
+    auto* output = output_t->mutable_data<T>(
+        {out_dims[0], out_dims[1], out_h, out_w}, ctx.GetPlace());
    int batch_size = input_t->dims()[0];
    int channels = input_t->dims()[1];
    int in_h = input_t->dims()[2];
@@ -83,9 +90,8 @@ class BilinearInterpGradKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* d_input_t = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto* d_output_t = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* d_input = d_input_t->mutable_data<T>(ctx.GetPlace());
    auto* d_output = d_output_t->data<T>();
+    auto* d_input = d_input_t->mutable_data<T>(ctx.GetPlace());
    auto& device_ctx =
        ctx.template device_context<platform::CPUDeviceContext>();
    math::SetConstant<platform::CPUDeviceContext, T> zero;
@@ -93,6 +99,14 @@ class BilinearInterpGradKernel : public framework::OpKernel<T> {
    int out_h = ctx.Attr<int>("out_h");
    int out_w = ctx.Attr<int>("out_w");
+    auto out_size_t = ctx.Input<Tensor>("OutSize");
+    if (out_size_t != nullptr) {
+      auto out_size_data = out_size_t->data<int>();
+      out_h = out_size_data[0];
+      out_w = out_size_data[1];
+    }
    int batch_size = d_input_t->dims()[0];
    int channels = d_input_t->dims()[1];
    int in_h = d_input_t->dims()[2];

--- a/paddle/fluid/operators/gather_op.cc
+++ b/paddle/fluid/operators/gather_op.cc
@@ -33,7 +33,6 @@ class GatherOp : public framework::OperatorWithKernel {
    auto index_dims = ctx->GetInputDim("Index");
    PADDLE_ENFORCE(index_dims.size() == 1);
    int batch_size = ctx->GetInputDim("Index")[0];
-    PADDLE_ENFORCE_GE(batch_size, 0, "Batch size must be >0");
    framework::DDim output_dims(ctx->GetInputDim("X"));
    output_dims[0] = batch_size;
    ctx->SetOutputDim("Out", output_dims);

--- a/paddle/fluid/operators/shape_op.cc
+++ b/paddle/fluid/operators/shape_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/shape_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+// Operator definition for `shape`. InferShape declares "Out" as a 1-D tensor
+// with one entry per dimension of "Input".
+class ShapeOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    // The messages name the op as it is registered (`shape`).
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input (Input) of shape op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output (Out) of shape op should not be null.");
+    // The output length equals the rank of the input.
+    auto in_dim = ctx->GetInputDim("Input");
+    ctx->SetOutputDim("Out", {in_dim.size()});
+  }
+};
+// Declares the proto of the shape operator: one input "Input", one output
+// "Out", and the operator comment.
+class ShapeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Input", "(Tensor), The input tensor.");
+    AddOutput("Out", "(Tensor), The shape of input tensor.");
+    AddComment(R"DOC(
+Shape Operator. 
+Get the shape of input tensor.
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+// Register the `shape` operator with its maker; EmptyGradOpMaker is passed as
+// the gradient maker (presumably meaning no gradient op is generated --
+// TODO confirm).
+REGISTER_OPERATOR(shape, ops::ShapeOp, ops::ShapeOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+// CPU kernels are instantiated for int, int64_t, float and double.
+REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel<int>, ops::ShapeKernel<int64_t>,
+                       ops::ShapeKernel<float>, ops::ShapeKernel<double>);
--- a/paddle/fluid/operators/shape_op.cu
+++ b/paddle/fluid/operators/shape_op.cu
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/shape_op.h"
+// Register ShapeKernel as the CUDA kernel of `shape` for int, int64_t, float
+// and double.
+REGISTER_OP_CUDA_KERNEL(shape, paddle::operators::ShapeKernel<int>,
+                        paddle::operators::ShapeKernel<int64_t>,
+                        paddle::operators::ShapeKernel<float>,
+                        paddle::operators::ShapeKernel<double>);
--- a/paddle/fluid/operators/shape_op.h
+++ b/paddle/fluid/operators/shape_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <algorithm>
+#include "paddle/fluid/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+// Kernel of the `shape` op: writes each dimension of "Input" into "Out" as
+// int64_t values. The output buffer is always requested on CPUPlace.
+template <typename T>
+class ShapeKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* input = ctx.Input<Tensor>("Input");
+    auto* output = ctx.Output<Tensor>("Out");
+    int64_t* shape_buf = output->mutable_data<int64_t>(platform::CPUPlace());
+    const auto dims = input->dims();
+    const int rank = dims.size();
+    // One output element per input dimension.
+    for (int axis = 0; axis != rank; ++axis) {
+      shape_buf[axis] = dims[axis];
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
@@ -31,8 +31,9 @@ void paddle::operators::TensorRTEngineKernel<DeviceContext, T>::Prepare(
  auto max_workspace = context.Attr<int>("max_workspace");
  engine_.reset(new inference::tensorrt::TensorRTEngine(
      max_batch_, max_workspace, nullptr));
+  // TODO(Superjomn) parameters should be passed in after being analyzed from outside.
  inference::Singleton<inference::tensorrt::OpConverter>::Global().ConvertBlock(
-      block, engine_.get());
+      block, {}, context.scope(), engine_.get());
  engine_->FreezeNetwork();
 }

--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -127,6 +127,7 @@ double Event::CpuElapsedMs(const Event& e) const {
 double Event::CudaElapsedMs(const Event& e) const {
 #ifdef PADDLE_WITH_CUDA
+  if (!has_cuda_) return 0.0;
  PADDLE_ENFORCE(e.has_cuda() && has_cuda());
  PADDLE_ENFORCE(e.device() == device());
  PADDLE_ENFORCE(cudaEventSynchronize(event_));

--- a/python/paddle/batch.py
+++ b/python/paddle/batch.py
@@ -15,7 +15,7 @@
 __all__ = ['batch']
-def batch(reader, batch_size):
+def batch(reader, batch_size, drop_last=False):
    """
    Create a batched reader.
@@ -23,6 +23,8 @@ def batch(reader, batch_size):
    :type reader: callable
    :param batch_size: size of each mini-batch
    :type batch_size: int
+    :param drop_last: drop the last batch, if the size of last batch is not equal to batch_size.
+    :type drop_last: bool
    :return: the batched reader.
    :rtype: callable
    """
@@ -35,7 +37,7 @@ def batch(reader, batch_size):
            if len(b) == batch_size:
                yield b
                b = []
-        if b:
+        if drop_last == False and len(b) != 0:
            yield b
    return batch_reader
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -82,6 +82,7 @@ __all__ = [
    'roi_pool',
    'dice_loss',
    'upsampling_bilinear2d',
+    'gather',
    'random_crop',
 ]
@@ -3889,7 +3890,6 @@ def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
 def dice_loss(input, label, epsilon=0.00001):
    """
-    **Dice loss Layer**
    Dice loss for comparing the similarity of two batch of data,
    usually is used for binary image segmentation i.e. labels are binary.
    The dice loss can be defined as below equation:
@@ -3944,7 +3944,7 @@ def upsampling_bilinear2d(input, out_shape=None, scale=None, name=None):
        input (Variable): The input tensor of bilinear interpolation,
                          This is a 4-D tensor of the shape
                          (num_batches, channels, in_h, in_w).
-        out_shape(list|tuple|None): Output shape of bilinear interpolation
+        out_shape(list|tuple|Variable|None): Output shape of bilinear interpolation
                                    layer, the shape is (out_h, out_w).
                                    Default: None
        scale(int|None): The multiplier for the input height or width.
@@ -3971,13 +3971,20 @@ def upsampling_bilinear2d(input, out_shape=None, scale=None, name=None):
    def _is_list_or_turple_(data):
        return (isinstance(data, list) or isinstance(data, tuple))
+    out_h = 0
+    out_w = 0
+    inputs = {"X": input}
    if out_shape is not None:
-        if not (_is_list_or_turple_(out_shape) and len(out_shape) == 2):
+        if not (_is_list_or_turple_(out_shape) and len(out_shape) == 2) and (
+                out_shape is not Variable):
            raise ValueError('out_shape should be a list or tuple ',
                             'with length 2, (out_h, out_w).')
-        out_shape = list(map(int, out_shape))
+        if _is_list_or_turple_(out_shape):
-        out_h = out_shape[0]
+            out_shape = list(map(int, out_shape))
-        out_w = out_shape[1]
+            out_h = out_shape[0]
+            out_w = out_shape[1]
+        else:
+            inputs['OutSize'] = out_shape
    else:
        out_h = int(input.shape[2] * scale)
        out_w = int(input.shape[3] * scale)
@@ -3985,13 +3992,62 @@ def upsampling_bilinear2d(input, out_shape=None, scale=None, name=None):
    out = helper.create_tmp_variable(dtype)
    helper.append_op(
        type="bilinear_interp",
-        inputs={"X": input},
+        inputs=inputs,
        outputs={"Out": out},
        attrs={"out_h": out_h,
               "out_w": out_w})
    return out
+def gather(input, index):
+    """
+    Gather entries of `input` along its outer-most dimension.
+    The entries selected by `index` are concatenated to form the output.
+    .. math::
+        Out = X[Index]
+    .. code-block:: text
+        Given:
+        X = [[1, 2],
+             [3, 4],
+             [5, 6]]
+        Index = [1, 2]
+        Then:
+        Out = [[3, 4],
+               [5, 6]]
+    Args:
+        input (Variable): The source input with rank>=1.
+        index (Variable): The index input with rank=1.
+    Returns:
+        output (Variable): The output is a tensor with the same rank as input.
+    Examples:
+        .. code-block:: python
+            output = fluid.layers.gather(x, index)
+    """
+    # locals() must be captured before any other local is bound.
+    helper = LayerHelper('gather', **locals())
+    out = helper.create_tmp_variable(helper.input_dtype())
+    op_inputs = {"X": input, "Index": index}
+    helper.append_op(type="gather", inputs=op_inputs, outputs={"Out": out})
+    return out
 def random_crop(input, shape, seed=1):
    helper = LayerHelper("random_crop", **locals())
    dtype = helper.input_dtype()

--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -71,6 +71,7 @@ __all__ = [
    'cumsum',
    'scatter',
    'sum',
+    'shape',
 ] + __activations__
 for _OP in set(__all__):

--- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
@@ -17,7 +17,10 @@ import numpy as np
 from op_test import OpTest
-def bilinear_interp_np(input, out_h, out_w):
+def bilinear_interp_np(input, out_h, out_w, out_size):
+    if out_size is not None:
+        out_h = out_size[0]
+        out_w = out_size[1]
    batch_size, channel, in_h, in_w = input.shape
    if out_h > 1:
        ratio_h = (in_h - 1.0) / (out_h - 1.0)
@@ -49,12 +52,15 @@ def bilinear_interp_np(input, out_h, out_w):
 class TestBilinearInterpOp(OpTest):
    def setUp(self):
+        self.out_size = None
        self.init_test_case()
        self.op_type = "bilinear_interp"
        input_np = np.random.random(self.input_shape).astype("float32")
-        output_np = bilinear_interp_np(input_np, self.out_h, self.out_w)
+        output_np = bilinear_interp_np(input_np, self.out_h, self.out_w,
+                                       self.out_size)
        self.inputs = {'X': input_np}
+        if self.out_size is not None:
+            self.inputs['OutSize'] = self.out_size
        self.attrs = {'out_h': self.out_h, 'out_w': self.out_w}
        self.outputs = {'Out': output_np}
@@ -68,6 +74,7 @@ class TestBilinearInterpOp(OpTest):
        self.input_shape = [2, 3, 4, 4]
        self.out_h = 2
        self.out_w = 2
+        self.out_size = np.array([3, 3]).astype("int32")
 class TestCase1(TestBilinearInterpOp):
@@ -91,5 +98,29 @@ class TestCase3(TestBilinearInterpOp):
        self.out_w = 128
+class TestCase4(TestBilinearInterpOp):
+    # Supplies an OutSize input of (2, 2) alongside out_h/out_w attrs of (1, 1).
+    def init_test_case(self):
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 1
+        self.out_w = 1
+        self.out_size = np.array([2, 2]).astype("int32")
+class TestCase5(TestBilinearInterpOp):
+    # Supplies an OutSize input of (11, 11) alongside out_h/out_w attrs of
+    # (12, 12).
+    def init_test_case(self):
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 12
+        self.out_w = 12
+        self.out_size = np.array([11, 11]).astype("int32")
+class TestCase6(TestBilinearInterpOp):
+    # Supplies an OutSize input of (65, 129) alongside out_h/out_w attrs of
+    # (64, 128).
+    def init_test_case(self):
+        self.input_shape = [1, 1, 128, 64]
+        self.out_h = 64
+        self.out_w = 128
+        self.out_size = np.array([65, 129]).astype("int32")
 if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_gather_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gather_op.py
@@ -20,8 +20,9 @@ from op_test import OpTest
 class TestGatherOp(OpTest):
    def setUp(self):
        self.op_type = "gather"
-        xnp = np.random.random((10, 20)).astype("float32")
+        self.config()
-        self.inputs = {'X': xnp, 'Index': np.array([1, 3, 5]).astype("int32")}
+        xnp = np.random.random(self.x_shape).astype("float32")
+        self.inputs = {'X': xnp, 'Index': np.array(self.index).astype("int32")}
        self.outputs = {'Out': self.inputs["X"][self.inputs["Index"]]}
    def test_check_output(self):
@@ -30,6 +31,16 @@ class TestGatherOp(OpTest):
    def test_check_grad(self):
        self.check_grad(['X'], 'Out')
+    def config(self):
+        self.x_shape = (10, 20)
+        self.index = [1, 3, 5]
+class TestCase1(TestGatherOp):
+    """Gather test on a 1-D input of length 10 with indices [1, 3, 5]."""
+    def config(self):
+        # A one-element tuple needs the trailing comma; `(10)` is just the
+        # int 10. Using a real tuple keeps x_shape the same type as in the
+        # base class, and the array generated from it is unchanged.
+        self.x_shape = (10, )
+        self.index = [1, 3, 5]
 if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_shape_op.py
+++ b/python/paddle/fluid/tests/unittests/test_shape_op.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+from op_test import OpTest
+class TestShapeOp(OpTest):
+    """Checks the `shape` op: a zero array of `self.shape` goes in, `self.shape` is expected out."""
+    def setUp(self):
+        self.op_type = "shape"
+        # config() chooses the shape under test. It must not be overwritten
+        # afterwards, otherwise every subclass silently re-tests [2, 3].
+        self.config()
+        input = np.zeros(self.shape)
+        self.inputs = {'Input': input}
+        self.outputs = {'Out': np.array(self.shape)}
+    def config(self):
+        """Set `self.shape`; subclasses override this to test other shapes."""
+        self.shape = [2, 3]
+    def test_check_output(self):
+        self.check_output()
+class case1(TestShapeOp):
+    """Shape op on a 1-D input."""
+    def config(self):
+        self.shape = [2]
+class case2(TestShapeOp):
+    """Shape op on a 3-D input."""
+    def config(self):
+        self.shape = [1, 2, 3]
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/minibatch.py
+++ b/python/paddle/v2/minibatch.py
@@ -15,7 +15,7 @@
 __all__ = ['batch']
-def batch(reader, batch_size):
+def batch(reader, batch_size, drop_last=False):
    """
    Create a batched reader.
@@ -23,6 +23,8 @@ def batch(reader, batch_size):
    :type reader: callable
    :param batch_size: size of each mini-batch
    :type batch_size: int
+    :param drop_last: drop the last batch, if the size of last batch is not equal to batch_size.
+    :type drop_last: bool
    :return: the batched reader.
    :rtype: callable
    """
@@ -35,7 +37,7 @@ def batch(reader, batch_size):
            if len(b) == batch_size:
                yield b
                b = []
-        if b:
+        if drop_last == False and len(b) != 0:
            yield b
    return batch_reader