Commit c7e38680 authored by Qiao Longfei

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add-communicator

@@ -276,9 +276,3 @@ add_subdirectory(paddle)
if(WITH_PYTHON)
  add_subdirectory(python)
endif()
-if(WITH_DOC)
-  find_package(Sphinx REQUIRED)
-  find_python_module(recommonmark REQUIRED)
-  add_subdirectory(doc)
-endif()
@@ -11,12 +11,10 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub
# ENV variables
ARG WITH_GPU
ARG WITH_AVX
-ARG WITH_DOC
ENV WOBOQ OFF
ENV WITH_GPU=${WITH_GPU:-ON}
ENV WITH_AVX=${WITH_AVX:-ON}
-ENV WITH_DOC=${WITH_DOC:-OFF}
ENV HOME /root
# Add bash enhancements
...
# - This module looks for Sphinx
# Find the Sphinx documentation generator
#
# This modules defines
# SPHINX_EXECUTABLE
# SPHINX_FOUND
find_program(SPHINX_EXECUTABLE
NAMES sphinx-build
PATHS
/usr/bin
/usr/local/bin
/opt/local/bin
DOC "Sphinx documentation generator"
)
if( NOT SPHINX_EXECUTABLE )
set(_Python_VERSIONS
2.7 2.6 2.5 2.4 2.3 2.2 2.1 2.0 1.6 1.5
)
foreach( _version ${_Python_VERSIONS} )
set( _sphinx_NAMES sphinx-build-${_version} )
find_program( SPHINX_EXECUTABLE
NAMES ${_sphinx_NAMES}
PATHS
/usr/bin
/usr/local/bin
    /opt/local/bin
DOC "Sphinx documentation generator"
)
endforeach()
endif()
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Sphinx DEFAULT_MSG
SPHINX_EXECUTABLE
)
option( SPHINX_HTML_OUTPUT "Build a single HTML with the whole content." ON )
option( SPHINX_DIRHTML_OUTPUT "Build HTML pages, but with a single directory per document." OFF )
option( SPHINX_HTMLHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in htmlhelp." OFF )
option( SPHINX_QTHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in qthelp." OFF )
option( SPHINX_DEVHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in devhelp." OFF )
option( SPHINX_EPUB_OUTPUT "Build HTML pages with additional information for building a documentation collection in epub." OFF )
option( SPHINX_LATEX_OUTPUT "Build LaTeX sources that can be compiled to a PDF document using pdflatex." OFF )
option( SPHINX_MAN_OUTPUT "Build manual pages in groff format for UNIX systems." OFF )
option( SPHINX_TEXT_OUTPUT "Build plain text files." OFF )
mark_as_advanced(
SPHINX_EXECUTABLE
SPHINX_HTML_OUTPUT
SPHINX_DIRHTML_OUTPUT
SPHINX_HTMLHELP_OUTPUT
SPHINX_QTHELP_OUTPUT
SPHINX_DEVHELP_OUTPUT
SPHINX_EPUB_OUTPUT
SPHINX_LATEX_OUTPUT
SPHINX_MAN_OUTPUT
SPHINX_TEXT_OUTPUT
)
function( Sphinx_add_target target_name builder conf cache source destination )
add_custom_target( ${target_name} ALL
COMMAND ${SPHINX_EXECUTABLE} -b ${builder}
-d ${cache}
-c ${conf}
${source}
${destination}
COMMENT "Generating sphinx documentation: ${builder}"
COMMAND cd ${destination} && ln -sf ./index_*.html index.html
)
set_property(
DIRECTORY APPEND PROPERTY
ADDITIONAL_MAKE_CLEAN_FILES
${destination}
)
endfunction()
# Target dependencies can be optionally listed at the end.
function( Sphinx_add_targets target_base_name conf source base_destination )
set( _dependencies )
foreach( arg IN LISTS ARGN )
set( _dependencies ${_dependencies} ${arg} )
endforeach()
if( ${SPHINX_HTML_OUTPUT} )
Sphinx_add_target( ${target_base_name}_html html ${conf} ${source} ${base_destination}/html )
add_dependencies( ${target_base_name}_html ${_dependencies} )
endif()
if( ${SPHINX_DIRHTML_OUTPUT} )
Sphinx_add_target( ${target_base_name}_dirhtml dirhtml ${conf} ${source} ${base_destination}/dirhtml )
add_dependencies( ${target_base_name}_dirhtml ${_dependencies} )
endif()
if( ${SPHINX_QTHELP_OUTPUT} )
Sphinx_add_target( ${target_base_name}_qthelp qthelp ${conf} ${source} ${base_destination}/qthelp )
add_dependencies( ${target_base_name}_qthelp ${_dependencies} )
endif()
if( ${SPHINX_DEVHELP_OUTPUT} )
Sphinx_add_target( ${target_base_name}_devhelp devhelp ${conf} ${source} ${base_destination}/devhelp )
add_dependencies( ${target_base_name}_devhelp ${_dependencies} )
endif()
if( ${SPHINX_EPUB_OUTPUT} )
Sphinx_add_target( ${target_base_name}_epub epub ${conf} ${source} ${base_destination}/epub )
add_dependencies( ${target_base_name}_epub ${_dependencies} )
endif()
if( ${SPHINX_LATEX_OUTPUT} )
Sphinx_add_target( ${target_base_name}_latex latex ${conf} ${source} ${base_destination}/latex )
add_dependencies( ${target_base_name}_latex ${_dependencies} )
endif()
if( ${SPHINX_MAN_OUTPUT} )
Sphinx_add_target( ${target_base_name}_man man ${conf} ${source} ${base_destination}/man )
add_dependencies( ${target_base_name}_man ${_dependencies} )
endif()
if( ${SPHINX_TEXT_OUTPUT} )
Sphinx_add_target( ${target_base_name}_text text ${conf} ${source} ${base_destination}/text )
add_dependencies( ${target_base_name}_text ${_dependencies} )
endif()
if( ${BUILD_TESTING} )
sphinx_add_target( ${target_base_name}_linkcheck linkcheck ${conf} ${source} ${base_destination}/linkcheck )
add_dependencies( ${target_base_name}_linkcheck ${_dependencies} )
endif()
endfunction()
@@ -388,6 +388,7 @@ function(cc_test TARGET_NAME)
    endif()
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
+   set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
    # No unit test should exceed 10 minutes.
    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
@@ -460,6 +461,7 @@ function(nv_test TARGET_NAME)
    endif()
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
+   set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
  endif()
endfunction(nv_test)
@@ -708,9 +710,10 @@ function(py_test TARGET_NAME)
  set(oneValueArgs "")
  set(multiValueArgs SRCS DEPS ARGS ENVS)
  cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
  add_test(NAME ${TARGET_NAME}
           COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
-                  FLAGS_cpu_deterministic=true
+                  FLAGS_cpu_deterministic=true FLAGS_limit_of_tmp_allocation=4294967296 # 4G
                   PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
           ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
           WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
...
@@ -122,7 +122,7 @@ paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None,
paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False))
paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False))
-paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None))
+paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name'], varargs=None, keywords=None, defaults=(0, True, None))
paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
@@ -213,6 +213,7 @@ paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act
paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
+paddle.fluid.layers.shuffle_channel ArgSpec(args=['x', 'group', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.py_func ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.teacher_student_sigmoid_loss ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0))
@@ -359,6 +360,7 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b
paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None))
paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.contrib.reader.ctr_reader.ctr_reader ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_index', 'sparse_slot_index', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.build_compressor ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None))
paddle.fluid.contrib.CompressPass.__init__ ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None))
paddle.fluid.contrib.CompressPass.add_strategy ArgSpec(args=['self', 'strategy'], varargs=None, keywords=None, defaults=None)
...
#windows treat symbolic file as a real file, which is different with unix
#We create a hidden file and compile it instead of origin source file.
function(windows_symbolic TARGET)
@@ -129,12 +128,6 @@ cc_test(version_test SRCS version_test.cc DEPS version)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
-if(WITH_NGRAPH)
-  cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
-  cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
-    shape_inference data_transform lod_tensor profiler)
-endif(WITH_NGRAPH)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
@@ -171,13 +164,12 @@ if(WITH_DISTRIBUTE)
  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
  set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
else()
-  if(WITH_NGRAPH)
-    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper)
-  else(WITH_NGRAPH)
-    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
-  endif(WITH_NGRAPH)
+  if (WITH_NGRAPH)
+    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper ngraph_engine)
+  else ()
+    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
+  endif()
  cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif()
@@ -214,3 +206,24 @@ endif (NOT WIN32)
cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack)
cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog)
+# Get the current working branch
+execute_process(
+  COMMAND git rev-parse --abbrev-ref HEAD
+  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+  OUTPUT_VARIABLE PADDLE_BRANCH
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+# Get the latest abbreviated commit hash of the working branch
+execute_process(
+  COMMAND git log -1 --format=%h
+  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+  OUTPUT_VARIABLE PADDLE_COMMIT
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+message(STATUS "commit: ${PADDLE_COMMIT}")
+message(STATUS "branch: ${PADDLE_BRANCH}")
+configure_file(commit.h.in commit.h)
#pragma once
#include <string>
namespace paddle {
namespace framework {
static std::string paddle_commit() {
return "@PADDLE_COMMIT@";
}
static std::string paddle_compile_branch() {
return "@PADDLE_BRANCH@";
}
static std::string paddle_version() {
return "@PADDLE_VERSION@";
}
} // namespace framework
} // namespace paddle
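For reference, `configure_file(commit.h.in commit.h)` fills in the `@PADDLE_COMMIT@`, `@PADDLE_BRANCH@`, and `@PADDLE_VERSION@` placeholders above with the values captured by the `execute_process` calls. A minimal standalone sketch of how the generated header behaves (the literal values below are placeholders, not real build output):

    #include <iostream>
    #include <string>

    namespace paddle {
    namespace framework {
    // Hand-expanded stand-in for the configured commit.h; a real build
    // substitutes these strings from @PADDLE_COMMIT@ and @PADDLE_BRANCH@.
    static std::string paddle_commit() { return "c7e3868"; }
    static std::string paddle_compile_branch() { return "develop"; }
    }  // namespace framework
    }  // namespace paddle

    int main() {
      std::cout << "built from " << paddle::framework::paddle_compile_branch()
                << " @ " << paddle::framework::paddle_commit() << std::endl;
      return 0;
    }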
@@ -91,7 +91,7 @@ struct BuildStrategy {
  int num_trainers_{1};
  int trainer_id_{0};
  std::vector<std::string> trainers_endpoints_;
-  bool remove_unnecessary_lock_{false};
+  bool remove_unnecessary_lock_{true};
  // NOTE:
  // Before you add new options, think if it's a general strategy that works
...
@@ -25,6 +25,9 @@ struct ExecutionStrategy {
  size_t num_threads_{0};
  bool use_cuda_{true};
  bool allow_op_delay_{false};
+  // If we set this to 1, all variables will be deleted when a batch
+  // finishes, which will cost 15%+ in performance.
+  // Please be aware of this parameter.
  size_t num_iteration_per_drop_scope_{1};
  ExecutorType type_{kDefault};
  bool dry_run_{false};
...
@@ -27,7 +27,7 @@ limitations under the License. */
#include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_NGRAPH
-#include "paddle/fluid/framework/ngraph_operator.h"
+#include "paddle/fluid/operators/ngraph/ngraph_engine.h"
#endif
DECLARE_bool(benchmark);
@@ -133,24 +133,6 @@ static void DeleteUnusedTensors(
  }
}
-static void EnableFusedOp(ExecutorPrepareContext* ctx) {
-#ifdef PADDLE_WITH_NGRAPH
-  VLOG(3) << "use_ngraph=True";
-  auto intervals = NgraphOperator::NgraphOpIntervals(&ctx->ops_);
-  for (auto& interval : intervals) {
-    auto* ng_op = new NgraphOperator(ctx->prog_, ctx->block_id_, interval.at(0),
-                                     interval.at(1));
-    *interval[0] = std::unique_ptr<OperatorBase>(ng_op);
-  }
-  for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) {
-    ctx->ops_.erase(it->at(0) + 1, it->at(1));
-  }
-#else
-  LOG(WARNING)
-      << "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option";
-#endif
-}
Executor::Executor(const platform::Place& place) : place_(place) {}
void Executor::Close() {
@@ -204,6 +186,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
                   bool create_local_scope, bool create_vars) {
  platform::RecordBlock b(block_id);
  if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc);
+#ifdef PADDLE_WITH_NGRAPH
+  if (FLAGS_use_ngraph) operators::NgraphEngine::EnableNgraph(pdesc);
+#endif
  auto ctx = Prepare(pdesc, block_id);
  RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars);
}
@@ -379,7 +364,6 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
  for (auto& op_desc : block.AllOps()) {
    ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
  }
-  if (FLAGS_use_ngraph) EnableFusedOp(ctx.get());
  return ctx;
}
...
@@ -14,6 +14,7 @@
#include "paddle/fluid/framework/ir/graph_traits.h"
+#include <set>
#include <vector>
namespace paddle {
@@ -79,7 +80,7 @@ NodesTSIterator::NodesTSIterator(const std::vector<Node *> &source) {
  }
  std::unordered_set<Node *> visited;
-  std::unordered_set<Node *> to_visit{source.begin(), source.end()};
+  std::set<Node *> to_visit{source.begin(), source.end()};
  std::vector<Node *> inlink_visited;
  while (!to_visit.empty()) {
...
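The `graph_traits.cc` change above swaps `std::unordered_set` for `std::set` in the topological-sort iterator, so nodes are visited in a deterministic, key-ordered sequence rather than one that depends on hash buckets. A standalone illustration of the difference (not Paddle code):

    #include <iostream>
    #include <set>
    #include <unordered_set>

    int main() {
      // unordered_set iteration order depends on hashing and bucket layout,
      // so it can differ across standard-library implementations.
      std::unordered_set<int> unordered{3, 1, 2};
      for (int v : unordered) std::cout << v << ' ';  // implementation-defined
      std::cout << '\n';

      // std::set always iterates in sorted key order: prints 1 2 3.
      std::set<int> ordered{3, 1, 2};
      for (int v : ordered) std::cout << v << ' ';
      std::cout << '\n';
      return 0;
    }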
@@ -54,13 +54,14 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) {
std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
  if (!platform::is_cpu_place(t.place())) {
-    LoDTensor tt;
-    framework::TensorCopy(t, platform::CPUPlace(), &tt);
+    LoDTensor cpu_tensor;
+    cpu_tensor.set_lod(t.lod());
+    framework::TensorCopy(t, platform::CPUPlace(), &cpu_tensor);
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto &dev_ctx = *pool.Get(t.place());
    dev_ctx.Wait();
-    os << tt;
+    os << cpu_tensor;
    return os;
  }
...
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/variant.h"
#include "ngraph/type/element_type.hpp"
namespace paddle {
namespace framework {
class NgraphOperator : public OperatorBase {
public:
static std::vector<
std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
NgraphOpIntervals(
std::vector<std::unique_ptr<paddle::framework::OperatorBase>>* ops);
explicit NgraphOperator(
const ProgramDesc& prog, size_t block_id,
std::vector<std::unique_ptr<OperatorBase>>::iterator start,
std::vector<std::unique_ptr<OperatorBase>>::iterator end,
const std::string& type = "fused_op", const VariableNameMap& inputs = {},
const VariableNameMap& outputs = {}, const AttributeMap& attrs = {});
void RunImpl(const Scope& scope, const platform::Place& place) const final;
private:
const ProgramDesc pdesc_;
size_t block_;
std::vector<std::shared_ptr<OperatorBase>> fused_ops_;
std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
std::unordered_set<std::string> persistables_;
std::unordered_set<std::string> fetches_;
std::unordered_set<std::string> post_op_inputs_;
bool is_full_ = false;
void Process();
};
} // namespace framework
} // namespace paddle
@@ -19,8 +19,6 @@ limitations under the License. */
#include <sstream>
#include <string>
#include <vector>
-#include "gflags/gflags.h"
-#include "glog/logging.h"
#include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
@@ -1075,7 +1073,9 @@ Scope* OperatorWithKernel::PrepareData(
proto::VarType::Type OperatorWithKernel::IndicateDataType(
    const ExecutionContext& ctx) const {
-  int data_type = -1;
+  proto::VarType::Type default_data_type =
+      static_cast<proto::VarType::Type>(-1);
+  proto::VarType::Type data_type = default_data_type;
  for (auto& input : this->inputs_) {
    const std::vector<const Variable*> vars = ctx.MultiInputVar(input.first);
    for (size_t i = 0; i < vars.size(); ++i) {
@@ -1092,18 +1092,19 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
      if (t != nullptr) {
        PADDLE_ENFORCE(t->IsInitialized(), "Input %s(%lu)is not initialized",
                       input.first, i);
-        int tmp = static_cast<int>(t->type());
+        proto::VarType::Type tmp = t->type();
        PADDLE_ENFORCE(
-            tmp == data_type || data_type == -1,
+            tmp == data_type || data_type == default_data_type,
            "DataType of Paddle Op %s must be the same. Get (%d) != (%d)",
-            Type(), data_type, tmp);
+            Type(), DataTypeToString(data_type), DataTypeToString(tmp));
        data_type = tmp;
      }
    }
  }
-  PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input");
-  return static_cast<proto::VarType::Type>(data_type);
+  PADDLE_ENFORCE(data_type != default_data_type,
+                 "DataType should be indicated by input");
+  return data_type;
}
OpKernelType OperatorWithKernel::GetExpectedKernelType(
...
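The `IndicateDataType` change above replaces a bare `int` sentinel with a typed one, but the shape of the algorithm is unchanged: start from an out-of-range sentinel, require every initialized input to agree on one type, and fail if nothing set a type. A standalone sketch of that pattern (illustrative enum, not Paddle's proto types):

    #include <cassert>
    #include <vector>

    enum class VarType { FP32 = 0, FP64 = 1, INT64 = 2 };

    VarType IndicateCommonType(const std::vector<VarType>& input_types) {
      // Typed sentinel meaning "no type seen yet", like default_data_type above.
      const VarType kUnset = static_cast<VarType>(-1);
      VarType data_type = kUnset;
      for (VarType t : input_types) {
        // All inputs must agree on a single data type.
        assert(data_type == kUnset || data_type == t);
        data_type = t;
      }
      assert(data_type != kUnset);  // at least one input must indicate the type
      return data_type;
    }

    int main() {
      assert(IndicateCommonType({VarType::FP32, VarType::FP32}) == VarType::FP32);
      return 0;
    }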
@@ -25,7 +25,8 @@ inline const T* Tensor::data() const {
  check_memory_size();
  bool valid =
      std::is_same<T, void>::value || type_ == DataTypeTrait<T>::DataType;
-  PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %d", type_);
+  PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %d",
+                 DataTypeToString(type_));
  return reinterpret_cast<const T*>(
      reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
...
if(WITH_PYTHON)
-  cc_library(layer SRCS layer.cc DEPS proto_desc operator)
-  cc_library(tracer SRCS tracer.cc DEPS proto_desc)
+  cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas)
+  cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context)
  cc_library(engine SRCS engine.cc)
endif()
@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/imperative/layer.h"
#include <deque>
#include <limits>
#include <map>
@@ -22,6 +23,9 @@
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/string/printf.h"
namespace paddle {
@@ -34,22 +38,66 @@ std::map<int, py::object> py_funcs_;
using framework::Variable;
-void AddTo(Variable* src, Variable* dst) {
-  framework::LoDTensor* dst_tensor = dst->GetMutable<framework::LoDTensor>();
-  framework::LoDTensor* src_tensor = src->GetMutable<framework::LoDTensor>();
+namespace detail {
+template <typename T>
+class TensorAddToFunctor : public boost::static_visitor<> {
+ public:
+  TensorAddToFunctor(int64_t numel, const T* x, T* y)
+      : numel_(numel), x_(x), y_(y) {}
+  void operator()(const platform::CPUPlace& place) {
+    platform::CPUDeviceContext* ctx = dynamic_cast<platform::CPUDeviceContext*>(
+        platform::DeviceContextPool::Instance().Get(place));
+    auto blas = operators::math::GetBlas<platform::CPUDeviceContext, T>(*ctx);
+    blas.AXPY(numel_, 1., x_, y_);
+  }
+#ifdef PADDLE_WITH_CUDA
+  void operator()(const platform::CUDAPlace& place) {
+    platform::CUDADeviceContext* ctx =
+        dynamic_cast<platform::CUDADeviceContext*>(
+            platform::DeviceContextPool::Instance().Get(place));
+    auto blas = operators::math::GetBlas<platform::CUDADeviceContext, T>(*ctx);
+    blas.AXPY(numel_, 1., x_, y_);
+  }
+#else
+  void operator()(const platform::CUDAPlace& place) {
+    PADDLE_THROW("Do NOT support gradient merge in place %s", place);
+  }
+#endif
+  // there is NO blas in CUDAPinnedPlace
+  void operator()(const platform::CUDAPinnedPlace& place) {
+    PADDLE_THROW("Do NOT support gradient merge in place %s", place);
+  }
+ private:
+  int64_t numel_;
+  const T* x_;
+  T* y_;
+};
+}  // namespace detail
+void AddTo(Variable* src, Variable* dst, platform::Place place) {
+  framework::Tensor* dst_tensor = dst->GetMutable<framework::LoDTensor>();
+  framework::Tensor* src_tensor = src->GetMutable<framework::LoDTensor>();
  // FIXME(minqiyang): loss_grad op will pass a zero grad of label
  // ugly fix for it
  if (src_tensor->numel() == 0) {
    return;
  }
  PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(),
                 "dst_numel %lld vs. src_numel %lld", dst_tensor->numel(),
                 src_tensor->numel());
-  float* dst_data = dst_tensor->mutable_data<float>(platform::CPUPlace());
-  const float* src_data = src_tensor->data<float>();
-  for (int64_t i = 0; i < src_tensor->numel(); ++i) {
-    dst_data[i] += src_data[i];
-  }
+  detail::TensorAddToFunctor<float> func(
+      src_tensor->numel(), src_tensor->data<float>(),
+      dst_tensor->mutable_data<float>(place));
+  boost::apply_visitor(func, place);
}
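The `TensorAddToFunctor` introduced above is a `boost::static_visitor`: `boost::apply_visitor` inspects the runtime `Place` variant and dispatches to the matching `operator()`, so one call site covers both the CPU and CUDA AXPY paths. A standalone sketch of that pattern (illustrative types; assumes Boost.Variant is available):

    #include <iostream>
    #include <boost/variant.hpp>

    // Stand-ins for platform::CPUPlace / platform::CUDAPlace.
    struct CPUPlace {};
    struct CUDAPlace { int device_id = 0; };
    using Place = boost::variant<CPUPlace, CUDAPlace>;

    // One overload per place type; apply_visitor picks the right one at runtime.
    struct AddVisitor : public boost::static_visitor<> {
      void operator()(const CPUPlace&) const { std::cout << "CPU AXPY path\n"; }
      void operator()(const CUDAPlace& p) const {
        std::cout << "CUDA AXPY path on device " << p.device_id << '\n';
      }
    };

    int main() {
      Place place = CUDAPlace{0};
      AddVisitor visitor;
      boost::apply_visitor(visitor, place);  // prints the CUDA branch
      return 0;
    }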
class Autograd {
@@ -120,66 +168,104 @@ class Autograd {
  }
};
+std::unique_ptr<VarBase> VarBase::NewVarBase(const platform::Place& dst_place,
+                                             const bool blocking) const {
+  PADDLE_ENFORCE(var_->IsInitialized(),
+                 "Variable must be initialized when getting numpy tensor");
+  std::unique_ptr<VarBase> new_var(new VarBase());
+  framework::LoDTensor* tensor =
+      new_var->var_->GetMutable<framework::LoDTensor>();
+  tensor->Resize(var_->Get<framework::LoDTensor>().dims());
+  tensor->set_lod(var_->Get<framework::LoDTensor>().lod());
+  if (blocking) {
+    platform::DeviceContext* dev_ctx =
+        platform::DeviceContextPool::Instance().Get(dst_place);
+    framework::TensorCopySync(var_->Get<framework::LoDTensor>(), dst_place,
+                              tensor);
+    dev_ctx->Wait();
+  } else {
+    framework::TensorCopy(var_->Get<framework::LoDTensor>(), dst_place, tensor);
+  }
+  if (platform::is_gpu_place(dst_place)) {
+    VLOG(3) << "copy tensor " << var_desc_->Name() << " from gpu";
+  }
+  return new_var;
+}
framework::LoDTensor& VarBase::GradValue() {
  VLOG(3) << "get var grad " << var_desc_->Name();
  return *(grads_->var_->GetMutable<framework::LoDTensor>());
}
std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
-  if (!grad_op_desc_ && backward_id_ <= 0) {
+  if (grad_op_descs_.empty() && backward_id_ <= 0) {
    LOG(WARNING) << "op with no grad: " << op_desc_->Type();
    return {};
  }
-  std::map<std::string, std::vector<framework::Variable*>> grad_outputs;
+  std::vector<framework::VariableValueMap> grad_outputs;
  if (backward_id_ > 0) {
    VLOG(3) << "py_layer_grad";
-    grad_outputs[framework::GradVarName(PyLayer::kFwdOut)] = PyLayer::ApplyGrad(
-        backward_id_,
-        grad_input_vars_[framework::GradVarName(PyLayer::kFwdInp)]);
+    grad_outputs.resize(1);
+    grad_outputs[0][framework::GradVarName(PyLayer::kFwdOut)] =
+        PyLayer::ApplyGrad(
+            backward_id_,
+            grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)]);
  } else {
-    VLOG(3) << "op grad " << grad_op_desc_->Type();
-    for (auto it : grad_output_vars_) {
-      auto& outputs = grad_outputs[it.first];
-      for (size_t i = 0; i < it.second.size(); ++i) {
-        // Allocate a new variable
-        Variable* tmp_var = new framework::Variable();
-        tmp_var->GetMutable<framework::LoDTensor>();
-        outputs.push_back(tmp_var);
-      }
-    }
-    framework::RuntimeContext ctx(grad_input_vars_, grad_outputs);
-    // No need to do compile time infer shape here.
-    // grad_op_desc_->InferShape(*block_);
-    grad_op_desc_->InferVarType(block_);
-    std::unique_ptr<framework::OperatorBase> opbase =
-        framework::OpRegistry::CreateOp(*grad_op_desc_);
-    framework::OperatorWithKernel* op_kernel =
-        dynamic_cast<framework::OperatorWithKernel*>(opbase.get());
-    PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
-    framework::Scope scope;
-    platform::CPUPlace place;
-    PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place);
-    p.op.RuntimeInferShape(scope, place, ctx);
-    p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
+    grad_outputs.resize(grad_op_descs_.size());
+    for (size_t k = 0; k < grad_op_descs_.size(); ++k) {
+      framework::OpDesc* grad_op_desc = grad_op_descs_[k];
+      VLOG(3) << "op grad " << grad_op_desc->Type();
+      for (auto it : grad_output_vars_[k]) {
+        auto& outputs = grad_outputs[k][it.first];
+        for (size_t i = 0; i < it.second.size(); ++i) {
+          // Allocate a new variable
+          Variable* tmp_var = new framework::Variable();
+          tmp_var->GetMutable<framework::LoDTensor>();
+          outputs.push_back(tmp_var);
+        }
+      }
+      framework::RuntimeContext ctx(grad_input_vars_[k], grad_outputs[k]);
+      // No need to do compile time infer shape here.
+      // grad_op_desc_->InferShape(*block_);
+      grad_op_desc->InferVarType(block_);
+      std::unique_ptr<framework::OperatorBase> opbase =
+          framework::OpRegistry::CreateOp(*grad_op_desc);
+      framework::OperatorWithKernel* op_kernel =
+          dynamic_cast<framework::OperatorWithKernel*>(opbase.get());
+      PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
+      framework::Scope scope;
+      PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_);
+      p.op.RuntimeInferShape(scope, place_, ctx);
+      p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
+    }
  }
-  for (auto it : grad_output_vars_) {
-    auto& outputs = grad_outputs[it.first];
-    auto& origin_outputs = it.second;
-    PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size());
-    for (size_t i = 0; i < outputs.size(); ++i) {
-      framework::Variable* grad = outputs[i];
-      framework::Variable* orig_grad = origin_outputs[i];
-      AddTo(grad, orig_grad);
-      delete grad;
+  for (size_t k = 0; k < grad_output_vars_.size(); ++k) {
+    for (auto it : grad_output_vars_[k]) {
+      auto& outputs = grad_outputs[k][it.first];
+      auto& origin_outputs = it.second;
+      PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size());
+      for (size_t i = 0; i < outputs.size(); ++i) {
+        framework::Variable* grad = outputs[i];
+        framework::Variable* orig_grad = origin_outputs[i];
+        AddTo(grad, orig_grad, place_);
+        delete grad;
+      }
    }
  }
  return input_vars_;
}
@@ -188,8 +274,10 @@ void VarBase::RunBackward() {
  VLOG(3) << "start backward";
  auto grads_t = grads_->var_->GetMutable<framework::LoDTensor>();
-  float* data = grads_t->mutable_data<float>(platform::CPUPlace());
-  std::fill(data, data + grads_t->numel(), 1.0);
+  operators::math::set_constant(
+      *(platform::DeviceContextPool::Instance().Get(
+          var_->GetMutable<framework::LoDTensor>()->place())),
+      grads_t, 1.0);
  PADDLE_ENFORCE(
      grads_ ==
...
@@ -21,17 +21,21 @@
#include <map>     // NOLINT
#include <string>  // NOLINT
#include <vector>  // NOLINT
+#include <memory>  // NOLINT
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/imperative/type_defs.h"
namespace paddle {
namespace imperative {
+class VarBase;
namespace py = ::pybind11;
class PreparedOp {
@@ -81,6 +85,8 @@ class PreparedOp {
    return PreparedOp(op, ctx, kernel_iter->second, dev_ctx);
  }
+  inline platform::DeviceContext* GetDeviceContext() const { return dev_ctx; }
  const framework::OperatorBase& op;
  const framework::RuntimeContext& ctx;
  framework::OperatorWithKernel::OpKernelFunc func;
@@ -148,6 +154,9 @@ class VarBase {
  framework::LoDTensor& GradValue();
+  std::unique_ptr<VarBase> NewVarBase(const platform::Place& dst_place,
+                                      const bool blocking) const;
  inline std::string GradName() const {
    PADDLE_ENFORCE(
        var_desc_,
@@ -175,11 +184,13 @@ class OpBase {
  OpBase()
      : op_desc_(nullptr),
        forward_id_(-1),
-        grad_op_desc_(nullptr),
-        backward_id_(-1) {}
+        backward_id_(-1),
+        place_(platform::CPUPlace()) {}
  virtual ~OpBase() {
-    if (grad_op_desc_) delete grad_op_desc_;
+    for (framework::OpDesc* desc : grad_op_descs_) {
+      delete desc;
+    }
  }
  std::map<std::string, std::vector<VarBase*>> ApplyGrad();
@@ -188,18 +199,25 @@ class OpBase {
  // For pure python PyLayer, use `forward_id_`, otherwise, use op_desc_.
  framework::OpDesc* op_desc_;
  int forward_id_;
-  // When has backward, one of `grad_op_desc_` or `backward_id_` is set,
+  // When has backward, one of `grad_op_descs_` or `backward_id_` is set,
  // not both.
-  framework::OpDesc* grad_op_desc_;
+  // Note: each fwd op corresponds to a vector of bwd ops.
+  std::vector<framework::OpDesc*> grad_op_descs_;
  int backward_id_;
+  platform::Place place_;
  VarBasePtrMap input_vars_;
  VarBasePtrMap output_vars_;
  OpBasePtrMap pre_ops_;
  std::map<std::string, std::vector<int>> pre_ops_out_idx_;
-  framework::VariableValueMap grad_input_vars_;
-  framework::VariableValueMap grad_output_vars_;
+  // Inputs to a vector of bwd ops.
+  std::vector<framework::VariableValueMap> grad_input_vars_;
+  // Outputs to a vector of bwd ops.
+  std::vector<framework::VariableValueMap> grad_output_vars_;
  framework::BlockDesc* block_;
};
...
@@ -14,33 +14,60 @@
#include "paddle/fluid/imperative/tracer.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace imperative {
void CreateGradOp(const framework::OpDesc& op_desc,
                  const std::unordered_set<std::string>& no_grad_set,
                  const std::vector<framework::BlockDesc*>& grad_sub_block,
-                  framework::OpDesc** grad_op_desc,
+                  std::vector<framework::OpDesc*>* grad_op_descs,
                  std::unordered_map<std::string, std::string>* grad_to_var) {
-  std::vector<std::unique_ptr<framework::OpDesc>> grad_op_descs =
+  PADDLE_ENFORCE(grad_op_descs->empty());
+  std::vector<std::unique_ptr<framework::OpDesc>> descs =
      framework::OpInfoMap::Instance()
          .Get(op_desc.Type())
          .GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block);
-  PADDLE_ENFORCE(grad_op_descs.size() == 1, "Only support 1 grad op now.");
-  // TODO(panyx0718): Leak?
-  *grad_op_desc = grad_op_descs[0].release();
+  for (auto& desc : descs) {
+    grad_op_descs->emplace_back(desc.release());
+  }
}
-void InitVar(framework::Variable* var, framework::Variable* grad_var) {
+void InitVar(framework::Variable* var, framework::Variable* grad_var,
+             platform::DeviceContext* dev_ctx) {
+  PADDLE_ENFORCE_NOT_NULL(dev_ctx,
+                          "Could not get valid device from forward op");
  auto& var_t = var->Get<framework::LoDTensor>();
-  float* data =
-      grad_var->GetMutable<framework::LoDTensor>()->mutable_data<float>(
-          var_t.dims(), platform::CPUPlace());
-  std::fill(data, data + var_t.numel(), 0.0);
+  grad_var->GetMutable<framework::LoDTensor>()->mutable_data<float>(
+      var_t.dims(), dev_ctx->GetPlace());
+  operators::math::set_constant(
+      *dev_ctx, grad_var->GetMutable<framework::LoDTensor>(), 0.0);
+}
+
+platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) {
+  platform::Place result = place;
+  for (auto it : inputs) {
+    for (VarBase* var : it.second) {
+      platform::Place tmp_place =
+          var->var_->Get<framework::LoDTensor>().place();
+      if (!platform::is_same_place(tmp_place, result)) {
+        PADDLE_THROW(
+            "Input variables should be kept in the same place: %s, but got "
+            "place %s for input %s instead",
+            result, tmp_place, it.first);
+      }
+    }
+  }
+  return result;
}
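The new `GetExpectedPlace` helper above enforces that every input variable already lives on the requested device before the op runs. A standalone sketch of that invariant check (simplified types, not the Paddle API):

    #include <cassert>
    #include <map>
    #include <string>
    #include <vector>

    // Simplified stand-in for platform::Place.
    using Place = std::string;

    // Every input must already live on `place`; mixed-device inputs are a
    // logic error, mirroring the PADDLE_THROW branch above.
    Place GetExpectedPlace(const Place& place,
                           const std::map<std::string, std::vector<Place>>& inputs) {
      for (const auto& kv : inputs) {
        for (const Place& input_place : kv.second) {
          assert(input_place == place && "inputs must share one place");
        }
      }
      return place;
    }

    int main() {
      std::map<std::string, std::vector<Place>> inputs{{"X", {"CPUPlace"}}};
      assert(GetExpectedPlace("CPUPlace", inputs) == "CPUPlace");
      return 0;
    }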
void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
                   const VarBasePtrMap& outputs, framework::BlockDesc* block,
+                  const platform::Place expected_place,
                   const bool stop_gradient) {
  std::map<std::string, VarBase*> vars;
@@ -105,51 +132,59 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
  PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
  framework::Scope scope;
-  platform::CPUPlace place;
-  PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place);
-  p.op.RuntimeInferShape(scope, place, ctx);
-  p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
+  op->place_ = GetExpectedPlace(expected_place, inputs);
+  PreparedOp prepared_op = PreparedOp::Prepare(ctx, *op_kernel, op->place_);
+  prepared_op.op.RuntimeInferShape(scope, op->place_, ctx);
+  prepared_op.func(framework::ExecutionContext(
+      prepared_op.op, scope, *prepared_op.dev_ctx, prepared_op.ctx));
  if (!stop_gradient) {
-    framework::OpDesc* grad_op_desc;
-    // TODO(panyx): Is this leaked?
    std::unique_ptr<std::unordered_map<std::string, std::string>> grad_to_var(
        new std::unordered_map<std::string, std::string>());
-    CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var.get());
-    op->grad_op_desc_ = grad_op_desc;
-    for (auto it : grad_op_desc->Inputs()) {
-      auto& grad_in_vars = op->grad_input_vars_[it.first];
-      for (const std::string& grad_invar : it.second) {
-        block->FindRecursiveOrCreateVar(grad_invar);
-        auto var_it = grad_to_var->find(grad_invar);
-        if (var_it == grad_to_var->end()) {
-          auto fwd_var_it = vars.find(grad_invar);
-          PADDLE_ENFORCE(fwd_var_it != vars.end());
-          // Forward inputs or outputs.
-          grad_in_vars.push_back(fwd_var_it->second->var_);
-        } else {
-          VarBase* var = vars[var_it->second];
-          if (!var->grads_->var_->IsInitialized()) {
-            InitVar(var->var_, var->grads_->var_);
-          }
-          // Douts.
-          grad_in_vars.push_back(var->grads_->var_);
-        }
-      }
-    }
-    for (auto it : grad_op_desc->Outputs()) {
-      auto& grad_out_vars = op->grad_output_vars_[it.first];
-      for (const std::string& grad_outvar : it.second) {
-        block->FindRecursiveOrCreateVar(grad_outvar);
-        auto var_it = grad_to_var->find(grad_outvar);
-        PADDLE_ENFORCE(var_it != grad_to_var->end());
-        VarBase* var = vars[var_it->second];
-        if (!var->grads_->var_->IsInitialized()) {
-          InitVar(var->var_, var->grads_->var_);
-        }
-        grad_out_vars.push_back(var->grads_->var_);
-      }
-    }
+    CreateGradOp(*op_desc, {}, {block}, &op->grad_op_descs_, grad_to_var.get());
+    op->grad_input_vars_.resize(op->grad_op_descs_.size());
+    op->grad_output_vars_.resize(op->grad_op_descs_.size());
+    for (size_t i = 0; i < op->grad_op_descs_.size(); ++i) {
+      framework::OpDesc* grad_op_desc = op->grad_op_descs_[i];
+      for (auto it : grad_op_desc->Inputs()) {
+        auto& grad_in_vars = op->grad_input_vars_[i][it.first];
+        for (const std::string& grad_invar : it.second) {
+          block->FindRecursiveOrCreateVar(grad_invar);
+          auto var_it = grad_to_var->find(grad_invar);
+          if (var_it == grad_to_var->end()) {
+            auto fwd_var_it = vars.find(grad_invar);
+            PADDLE_ENFORCE(fwd_var_it != vars.end());
+            // Forward inputs or outputs.
+            grad_in_vars.push_back(fwd_var_it->second->var_);
+          } else {
+            VarBase* var = vars[var_it->second];
+            if (!var->grads_->var_->IsInitialized()) {
+              InitVar(var->var_, var->grads_->var_,
+                      prepared_op.GetDeviceContext());
+            }
+            // Douts.
+            grad_in_vars.push_back(var->grads_->var_);
+          }
+        }
+      }
+      for (auto it : grad_op_desc->Outputs()) {
+        auto& grad_out_vars = op->grad_output_vars_[i][it.first];
+        for (const std::string& grad_outvar : it.second) {
+          block->FindRecursiveOrCreateVar(grad_outvar);
+          auto var_it = grad_to_var->find(grad_outvar);
+          PADDLE_ENFORCE(var_it != grad_to_var->end(),
+                         "Could not find the grad op output var; should this "
+                         "operator %s's stop gradient be True?",
+                         op_desc->Type());
+          VarBase* var = vars[var_it->second];
+          if (!var->grads_->var_->IsInitialized()) {
+            InitVar(var->var_, var->grads_->var_,
+                    prepared_op.GetDeviceContext());
+          }
+          grad_out_vars.push_back(var->grads_->var_);
+        }
+      }
+    }
  }
}
@@ -178,10 +213,12 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
    out->TrackPreOp(op, PyLayer::kFwdOut, i, stop_gradient);
  }
  if (!stop_gradient) {
+    op->grad_input_vars_.resize(1);
+    op->grad_output_vars_.resize(1);
    auto& grad_input_vars =
-        op->grad_input_vars_[framework::GradVarName(PyLayer::kFwdInp)];
+        op->grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)];
    auto& grad_output_vars =
-        op->grad_output_vars_[framework::GradVarName(PyLayer::kFwdOut)];
+        op->grad_output_vars_[0][framework::GradVarName(PyLayer::kFwdOut)];
    for (const VarBase* inp : inputs) {
      grad_input_vars.push_back(inp->var_);
@@ -189,16 +226,23 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
    for (VarBase* out : outputs) {
      grad_input_vars.push_back(out->var_);
    }
+    platform::CPUPlace place;
    for (VarBase* out : outputs) {
      grad_input_vars.push_back(out->grads_->var_);
      if (!grad_input_vars.back()->IsInitialized()) {
-        InitVar(out->var_, grad_input_vars.back());
+        // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now
+        InitVar(out->var_, grad_input_vars.back(),
+                platform::DeviceContextPool::Instance().Get(place));
      }
    }
    for (const VarBase* inp : inputs) {
      grad_output_vars.push_back(inp->grads_->var_);
      if (!grad_output_vars.back()->IsInitialized()) {
-        InitVar(inp->var_, grad_output_vars.back());
+        // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now
+        InitVar(inp->var_, grad_output_vars.back(),
+                platform::DeviceContextPool::Instance().Get(place));
      }
    }
  }
...
@@ -22,6 +22,7 @@
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/imperative/engine.h"
#include "paddle/fluid/imperative/layer.h"
+#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace imperative {
@@ -34,21 +35,25 @@ void CreateGradOp(const framework::OpDesc& op_desc,
void InitVar(framework::Variable* var, framework::Variable* grad_var);
+platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs);
class Tracer {
 public:
  explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {}
  virtual ~Tracer() {}
-  void Trace(OpBase* op,
-             const std::map<std::string, std::vector<VarBase*>>& inputs,
-             const std::map<std::string, std::vector<VarBase*>>& outputs,
-             framework::BlockDesc* block, const bool stop_gradient = false);
+  void Trace(OpBase* op, const VarBasePtrMap& inputs,
+             const VarBasePtrMap& outputs, framework::BlockDesc* block,
+             const platform::Place expected_place,
+             const bool stop_gradient = false);
  std::vector<VarBase*> PyTrace(OpBase* op, const std::vector<VarBase*>& inputs,
                                bool stop_gradient = false);
 private:
+  platform::Place GetPlace(const VarBasePtrMap& inputs);
  framework::BlockDesc* root_block_;
};
...
@@ -28,6 +28,7 @@
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/platform/variant.h"
namespace paddle {
@@ -130,10 +131,14 @@ struct Argument {
  DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int);
  DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int);
  DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
+  DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode,
+                      contrib::AnalysisConfig::Precision);
  // Memory optimized related.
  DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
-  DECL_ARGUMENT_FIELD(memory_optim_force_update, MemoryOptimForceUpdate, bool);
+  DECL_ARGUMENT_FIELD(static_memory_optim, StaticMemoryOptim, bool);
+  DECL_ARGUMENT_FIELD(static_memory_optim_force_update,
+                      StaticMemoryOptimForceUpdate, bool);
  // Indicate which kind of sort algorithm is used for operators, the memory
  // optimization relays on the sort algorithm.
  DECL_ARGUMENT_FIELD(memory_optim_sort_kind, MemoryOptimSortKind, int);
...
...@@ -36,6 +36,14 @@ void SetAttr<int>(framework::proto::OpDesc *op, const std::string &name,
  attr->set_i(data);
}
template <>
+void SetAttr<bool>(framework::proto::OpDesc *op, const std::string &name,
+                   const bool &data) {
+  auto *attr = op->add_attrs();
+  attr->set_name(name);
+  attr->set_type(paddle::framework::proto::AttrType::BOOLEAN);
+  attr->set_b(data);
+}
+template <>
void SetAttr<int64_t>(framework::proto::OpDesc *op, const std::string &name,
                      const int64_t &data) {
  auto *attr = op->add_attrs();
...
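The SetAttr&lt;bool&gt; specialization added above mirrors the existing int/int64_t overloads: one fully specialized function template per attribute type, each writing the matching proto field. A minimal standalone sketch of the same pattern, using invented ToyAttr/ToyOpDesc stand-ins rather than Paddle's protobuf-generated classes:

    // Standalone sketch of the typed-attribute-setter pattern above.
    // ToyAttr/ToyOpDesc are hypothetical stand-ins for the protobuf-generated
    // paddle::framework::proto types; only the specialization shape matters.
    #include <cassert>
    #include <string>
    #include <vector>

    struct ToyAttr {
      std::string name;
      bool b = false;
      int i = 0;
    };
    struct ToyOpDesc {
      std::vector<ToyAttr> attrs;
      ToyAttr* add_attrs() {
        attrs.emplace_back();
        return &attrs.back();
      }
    };

    template <typename T>
    void SetAttr(ToyOpDesc* op, const std::string& name, const T& data);

    template <>
    void SetAttr<bool>(ToyOpDesc* op, const std::string& name, const bool& data) {
      auto* attr = op->add_attrs();
      attr->name = name;
      attr->b = data;  // the proto version also records AttrType::BOOLEAN
    }

    int main() {
      ToyOpDesc op;
      SetAttr<bool>(&op, "enable_int8", true);
      assert(op.attrs.back().b);
    }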
...@@ -17,6 +17,7 @@ limitations under the License. */
#include <sys/stat.h>
#include <cstdio>
#include <fstream>
+#include <set>
#include <string>
#include <typeindex>
#include <unordered_map>
...@@ -29,9 +30,14 @@ limitations under the License. */
#include "paddle/fluid/platform/port.h"
#ifdef _WIN32
+#include <direct.h>
+#include <io.h>
#define GCC_ATTRIBUTE(attr__) ;
+#define MKDIR(path) _mkdir(path)
#else
+#include <unistd.h>
#define GCC_ATTRIBUTE(attr__) __attribute__((attr__));
+#define MKDIR(path) mkdir(path, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH)
#endif
#define __SHOULD_USE_RESULT__ GCC_ATTRIBUTE(warn_unused_result)
...@@ -163,6 +169,54 @@ static bool PathExists(const std::string &path) {
  return false;
}
+static std::string GetDirRoot(const std::string &path) {
+  char sep = '/';
+#ifdef _WIN32
+  sep = '\\';
+#endif
+  size_t i = path.rfind(sep, path.length());
+  if (i != std::string::npos) {
+    return (path.substr(0, i));
+  }
+  return path;
+}
+static std::string GetOrCreateModelOptCacheDir(const std::string &model_root) {
+  std::string opt_cache_dir = model_root + "/_opt_cache/";
+  if (!PathExists(opt_cache_dir)) {
+    PADDLE_ENFORCE(MKDIR(opt_cache_dir.c_str()) != -1,
+                   "Cannot create optimize cache directory: %s; make sure you "
+                   "have permission to write",
+                   opt_cache_dir);
+  }
+  return opt_cache_dir;
+}
+static std::string GetTrtCalibPath(const std::string &model_root,
+                                   const std::string &engine_key) {
+  return model_root + "/trt_calib_" + engine_key;
+}
+// If there is no calib table data file in model_opt_cache_dir, return "".
+static std::string GetTrtCalibTableData(const std::string &model_opt_cache_dir,
+                                        const std::string &engine_key,
+                                        bool enable_int8) {
+  std::string trt_calib_table_path =
+      GetTrtCalibPath(model_opt_cache_dir, engine_key);
+  if (enable_int8 && FileExists(trt_calib_table_path)) {
+    VLOG(3) << "Calibration table file: " << trt_calib_table_path
+            << " is found here";
+    std::ifstream infile(trt_calib_table_path, std::ios::in);
+    std::stringstream buffer;
+    buffer << infile.rdbuf();
+    std::string calibration_data(buffer.str());
+    return calibration_data;
+  }
+  return "";
+}
}  // namespace analysis
}  // namespace inference
}  // namespace paddle
...
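GetOrCreateModelOptCacheDir and GetTrtCalibPath above only compose strings; a standalone sketch of the resulting layout (the paths and engine key are hypothetical, and the MKDIR/PADDLE_ENFORCE directory creation is elided). Note that because the cache dir already ends with '/', the path composed exactly as above carries a harmless double slash on POSIX:

    #include <iostream>
    #include <string>

    // Same string logic as GetDirRoot above ('\\' handling on _WIN32 omitted).
    static std::string DirRoot(const std::string& path) {
      size_t i = path.rfind('/');
      return i != std::string::npos ? path.substr(0, i) : path;
    }

    int main() {
      const std::string prog_file = "/models/mobilenet/__model__";  // hypothetical
      const std::string model_root = DirRoot(prog_file);  // "/models/mobilenet"
      const std::string cache_dir = model_root + "/_opt_cache/";
      const std::string calib = cache_dir + "/trt_calib_" + "1439700129";  // fake key
      std::cout << calib << "\n";  // benign "//" when composed as above
    }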
...@@ -67,6 +67,20 @@ void IRPassManager::CreatePasses(Argument *argument,
      pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
      pass->Set("min_subgraph_size",
                new int(argument->tensorrt_min_subgraph_size()));
+     pass->Set("program",
+               new framework::ProgramDesc *(&argument->main_program()));
+     bool enable_int8 = argument->tensorrt_precision_mode() ==
+                        contrib::AnalysisConfig::Precision::kInt8;
+     pass->Set("enable_int8", new bool(enable_int8));
+     std::string model_opt_cache_dir =
+         argument->Has("model_dir")
+             ? argument->model_dir()
+             : GetDirRoot(argument->model_program_path());
+     pass->Set(
+         "model_opt_cache_dir",
+         new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir)));
    }
    // graph_ = pass->Apply(std::move(graph_));
...@@ -91,11 +105,14 @@ std::unique_ptr<Graph> IRPassManager::Apply(std::unique_ptr<Graph> graph) {
}
framework::proto::ProgramDesc IRPassManager::AcquireProgram(
-    std::unique_ptr<Graph> *graph, const ProgramDesc &program) const {
+    std::unique_ptr<Graph> *graph, ProgramDesc *program) const {
  auto pass =
      framework::ir::PassRegistry::Instance().Get("graph_to_program_pass");
-  ProgramDesc desc(program);
+  // Directly using ProgramDesc desc(argument->main_program()) may cause an
+  // incomplete copy of the information.
+  ProgramDesc desc;
+  desc.CopyFrom(*program->Proto());
  pass->SetNotOwned("program", &desc);
  auto *the_graph = graph->release();
  *graph = pass->Apply(std::unique_ptr<Graph>(the_graph));
...
...@@ -29,6 +29,7 @@
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/argument.h"
+#include "paddle/fluid/inference/analysis/helper.h"
namespace paddle {
namespace inference {
...@@ -42,8 +43,8 @@ class IRPassManager final {
  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph);
-  framework::proto::ProgramDesc AcquireProgram(
-      std::unique_ptr<Graph> *graph, const ProgramDesc &program) const;
+  framework::proto::ProgramDesc AcquireProgram(std::unique_ptr<Graph> *graph,
+                                               ProgramDesc *program) const;
  framework::ir::Graph &graph() const { return *graph_; }
...
...@@ -13,6 +13,7 @@
// limitations under the License.
#include <algorithm>
+#include <set>
#include <string>
#include <vector>
...@@ -67,12 +68,33 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
  return graph;
}
+std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
+                              const std::set<std::string> &engine_outputs) {
+  std::string engine_hash_key = "";
+  for (auto name : engine_inputs) {
+    engine_hash_key += name;
+  }
+  for (auto name : engine_outputs) {
+    engine_hash_key += name;
+  }
+  auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key));
+  return engine_key;
+}
void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
                                            Graph *graph) const {
  auto *op_desc = node->Op();
  auto &subgraph = *Agent(node).subgraph();
  PADDLE_ENFORCE(!subgraph.empty());
+  framework::ProgramDesc *program_desc =
+      Get<framework::ProgramDesc *>("program");
+  // Add new block for TensorRTEngineOP
+  const framework::BlockDesc &main_block =
+      program_desc->Block(framework::kRootBlockIndex);
+  // const framework::BlockDesc& main_block = program_desc->Block(0);
+  framework::BlockDesc *new_block = program_desc->AppendBlock(main_block);
  // A fake block desc.
  framework::proto::BlockDesc block_proto;
  framework::BlockDesc block_desc(nullptr, &block_proto);
...@@ -82,13 +104,18 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
                    subgraph.size());
  for (auto *node : subgraph) {
+    auto *new_block_op = new_block->AppendOp();
    auto *op = block_desc.AppendOp();
+    *new_block_op->Proto() = *node->Op()->Proto();
    *op->Proto() = *node->Op()->Proto();
  }
-  // collect inputs
-  std::unordered_set<std::string> input_names;
-  std::unordered_set<std::string> input_names_with_id;
+  // Then, we will use the input_names_with_id and output_names_with_id to
+  // generate the engine key.
+  // So, we use set instead of unordered_set here to ensure that the engine key
+  // is unique.
+  std::set<std::string> input_names;
+  std::set<std::string> input_names_with_id;
  for (auto *x : node->inputs) {
    input_names.insert(x->Name());
    input_names_with_id.insert(x->Name() + std::to_string(x->id()));
...@@ -96,8 +123,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
  op_desc->SetInput(
      "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
-  std::unordered_set<std::string> output_names;
-  std::unordered_set<std::string> output_names_with_id;
+  std::set<std::string> output_names;
+  std::set<std::string> output_names_with_id;
  for (auto *x : node->outputs) {
    output_names.insert(x->Name());
    output_names_with_id.insert(x->Name() + std::to_string(x->id()));
...@@ -182,7 +209,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
  // to Tensor.
  std::vector<std::string> output_mapping;
  for (auto name : output_names) {
-    // LOG(INFO) << name << " " << output_name_map.size();
    PADDLE_ENFORCE(output_name_map.count(name) != 0);
    output_mapping.push_back(output_name_map[name]);
  }
...@@ -193,16 +219,29 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
      *vars->Add() = *node->Var()->Proto();
    }
  }
  PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
                 "the block has no var-desc");
  PADDLE_ENFORCE(!output_mapping.empty());
-  // Set attrs
+  op_desc->SetBlockAttr("sub_block", new_block);
  SetAttr(op_desc->Proto(), "subgraph",
          block_desc.Proto()->SerializeAsString());
+  // Set attrs
  SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size"));
  SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size"));
  SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes()));
  SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
+  auto enable_int8 = Get<bool>("enable_int8");
+  auto engine_key =
+      GenerateEngineKey(input_names_with_id, output_names_with_id);
+  std::string calibration_data = GetTrtCalibTableData(
+      Get<std::string>("model_opt_cache_dir"), engine_key, enable_int8);
+  SetAttr(op_desc->Proto(), "calibration_data", calibration_data);
+  SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
+  SetAttr(op_desc->Proto(), "engine_key", engine_key);
}
std::vector<std::string> ExtractParameters(
...
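GenerateEngineKey above derives a stable identifier for a TensorRT subgraph: because std::set iterates in sorted order, the concatenation (and hence the std::hash value) does not depend on the order in which node names were collected, so the same subgraph always maps to the same calibration file. A standalone check of that property:

    // Standalone check that the set-based key is order-insensitive: inserting
    // the same names in different orders yields the same engine key.
    #include <cassert>
    #include <functional>
    #include <set>
    #include <string>

    std::string GenerateEngineKey(const std::set<std::string>& ins,
                                  const std::set<std::string>& outs) {
      std::string key;
      for (const auto& n : ins) key += n;
      for (const auto& n : outs) key += n;
      return std::to_string(std::hash<std::string>()(key));
    }

    int main() {
      std::set<std::string> a{"x0", "x1"}, b;
      b.insert("x1");
      b.insert("x0");  // different insertion order, same set ordering
      assert(GenerateEngineKey(a, {"y0"}) == GenerateEngineKey(b, {"y0"}));
    }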
cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager)
cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager)
-cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass)
+cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass zero_copy_tensor)
cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager)
cc_library(ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass)
...
...@@ -31,7 +31,11 @@ void IrGraphToProgramPass::RunImpl(Argument *argument) {
  }
  std::unique_ptr<Graph> graph(argument->main_graph_ptr());
-  framework::ProgramDesc desc(argument->main_program());
+  // Directly using ProgramDesc desc(argument->main_program()) may cause an
+  // incomplete copy of the information.
+  framework::ProgramDesc desc;
+  desc.CopyFrom(*argument->main_program().Proto());
  pass->SetNotOwned("program", &desc);
  auto thegraph = pass->Apply(std::move(graph));
  thegraph.release();  // the argument still owns the graph.
...
...@@ -444,6 +444,26 @@ std::vector<std::map<std::string, std::vector<int>>> DeseralizeBatchVarShapes(
  return batch_shapes;
}
+// Replace the -1 in shape to a real number to fake the shape.
+std::vector<std::map<std::string, std::vector<int>>> FakeBatchVarShapes(
+    const framework::ProgramDesc& program) {
+  std::vector<std::map<std::string, std::vector<int>>> res;
+  res.emplace_back();
+  auto& record = res.front();
+  const int fake_batch_size = 3;
+  for (auto* var : program.Block(0).AllVars()) {
+    if (var->GetType() ==
+        framework::proto::VarType::Type::VarType_Type_LOD_TENSOR) {
+      auto shape = var->GetShape();
+      for (auto& v : shape) {
+        if (v < 0) v = fake_batch_size;
+      }
+      record[var->Name()].assign(shape.begin(), shape.end());
+    }
+  }
+  return res;
+}
// Calculate the average dim of each tensor from the batch shape cache.
std::unordered_map<std::string, size_t> GetBatchAverageSize(
    const std::vector<std::map<std::string, std::vector<int>>>& batches) {
...@@ -478,6 +498,7 @@ std::vector<std::unordered_set<std::string>> AnalysisBatchShapesByBatchSize(
  std::unordered_map<std::string, std::stringstream> var_batchsize_hashes;
  for (auto& batch : batches) {
    for (auto& ele : batch) {
+      PADDLE_ENFORCE(!ele.second.empty());
      int batch_size = ele.second.front();
      // TODO(Superjomn) might consume large memory here, use combine hash.
      var_batchsize_hashes[ele.first] << batch_size;
...@@ -538,9 +559,21 @@ std::vector<std::unordered_set<std::string>> AnalysisBatchShapesBySimilarSize(
std::string MemoryOptimizePass::repr() const { return "memory optimize pass"; }
+std::pair<size_t, size_t> GetRange(
+    const std::unordered_map<std::string, size_t>& ave_size) {
+  auto res = std::make_pair(std::numeric_limits<size_t>::max(),
+                            std::numeric_limits<size_t>::min());
+  for (auto& item : ave_size) {
+    res.first = std::min(item.second, res.first);
+    res.second = std::max(item.second, res.second);
+  }
+  return res;
+}
void MemoryOptimizePass::RunImpl(Argument* argument) {
  // When force update, should not optimize memory.
-  if (!argument->enable_memory_optim() || argument->memory_optim_force_update())
+  if (!argument->enable_memory_optim() ||
+      argument->static_memory_optim_force_update())
    return;
  graph_ = argument->main_graph_ptr();
...@@ -549,21 +582,38 @@ void MemoryOptimizePass::RunImpl(Argument* argument) {
      argument->model_program_path_valid() ? argument->model_program_path()
                                           : "");
  VLOG(3) << "Load memory cache from " << path;
-  if (inference::IsFileExists(path)) {
-    VLOG(4) << "Performing memory optimize";
-    auto batches = DeseralizeBatchVarShapes(path);
-    auto var_batch_ave_size = GetBatchAverageSize(batches);
+  std::vector<std::map<std::string, std::vector<int>>> batches;
+  if (argument->static_memory_optim() && inference::IsFileExists(path)) {
+    string::PrettyLogInfo("--- Performing static memory optimize");
+    batches = DeseralizeBatchVarShapes(path);
+  } else {
+    string::PrettyLogInfo("--- Performing dynamic memory optimize");
+    batches = FakeBatchVarShapes(argument->main_program());
+  }
+  auto var_batch_ave_size = GetBatchAverageSize(batches);
+  // Get min and max memory size.
+  const auto range = GetRange(var_batch_ave_size);
+  const int cluster_size = std::max(
+      static_cast<int>((range.second - range.first) / 100 /*cluster num*/),
+      1024);
+  const int cluster_size1 = std::max(
+      static_cast<int>((range.second - range.first) / 1000 /*cluster num*/),
+      1024);
  std::unordered_map<std::string, Node*> tensor_nodes;
  space_table_t space_table;
  CollectVarMemorySize(var_batch_ave_size, &tensor_nodes, &space_table);
  std::unordered_map<std::string, std::string> reuse_table;
  double max_saving_ratio = 0.;
  std::vector<std::function<MemoryAllocation()>> strategies;
  for (int sort_kind = 0; sort_kind < 2; sort_kind++) {
+    if (argument->static_memory_optim()) {
+      // This strategy only makes sense in static memory optimize.
      strategies.emplace_back([&, sort_kind] {
        auto clustered_vars_by_batch_size =
            AnalysisBatchShapesByBatchSize(batches);
...@@ -572,71 +622,67 @@ void MemoryOptimizePass::RunImpl(Argument* argument) {
                      space_table, &reuse_table, sort_kind, &allocation);
        return allocation;
      });
+    }
    strategies.emplace_back([&, sort_kind] {
-      auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
-          space_table, batches, 1024);  // interval 1kb
+      auto clustered_vars_by_ave_size =
+          AnalysisBatchShapesBySimilarSize(space_table, batches, cluster_size);
      MemoryAllocation allocation;
-      MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
-                    space_table, &reuse_table, sort_kind, &allocation);
+      MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table,
+                    &reuse_table, sort_kind, &allocation);
      return allocation;
    });
-    strategies.emplace_back([&, sort_kind] {
-      auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
-          space_table, batches, 1024 * 1024);  // interval 1MB
-      MemoryAllocation allocation;
-      MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
-                    space_table, &reuse_table, sort_kind, &allocation);
-      return allocation;
-    });
-    strategies.emplace_back([&, sort_kind] {
-      auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
-          space_table, batches,
-          std::numeric_limits<int>::max());  // no intervals
-      MemoryAllocation allocation;
-      MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
-                    space_table, &reuse_table, sort_kind, &allocation);
-      return allocation;
-    });
-  }
+    strategies.emplace_back([&, sort_kind] {
+      auto clustered_vars_by_ave_size =
+          AnalysisBatchShapesBySimilarSize(space_table, batches, cluster_size1);
+      MemoryAllocation allocation;
+      MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table,
+                    &reuse_table, sort_kind, &allocation);
+      return allocation;
+    });
+    strategies.emplace_back([&, sort_kind] {
+      auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
+          space_table, batches,
+          std::numeric_limits<int>::max());  // no intervals
+      MemoryAllocation allocation;
+      MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table,
+                    &reuse_table, sort_kind, &allocation);
+      return allocation;
+    });
+  }
  std::function<MemoryAllocation()>* best_strategy{nullptr};
  // Try all strategies to get the best result.
  for (auto& strategy : strategies) {
    auto allocation = strategy();
    string::PrettyLogDetail("--- get strategy saving %f memory for workspace",
                            allocation.GetSavingRatio());
    if (allocation.GetSavingRatio() > max_saving_ratio) {
      max_saving_ratio = allocation.GetSavingRatio();
      best_strategy = &strategy;
    }
  }
  if (!best_strategy) {
-    LOG(ERROR)
-        << "This model makes poor memory optimize, skip memory optimize";
+    LOG(ERROR) << "This model makes poor memory optimize, skip memory optimize";
    return;
  }
  auto memory_allocation = (*best_strategy)();
-  string::PrettyLogH2(
+  string::PrettyLogInfo(
      "--- Saved %.2f%s memory for workspace(temporary variables)",
      memory_allocation.GetSavingRatio() * 100, "%");
-  string::PrettyLogDetail("--- Allocated %d MB",
-                          memory_allocation.allocated / 1024. / 1024.);
-  string::PrettyLogDetail("--- Saved %d MB",
-                          memory_allocation.saved / 1024. / 1024.);
  argument->main_graph().Set(framework::ir::kGraphToProgramVarsToRemove,
                             new std::unordered_set<std::string>);
  auto& vars2remove =
      argument->main_graph().Get<std::unordered_set<std::string>>(
          framework::ir::kGraphToProgramVarsToRemove);
  PerformReusePlan(reuse_table, memory_allocation.sort_kind, &vars2remove);
  argument->SetMemoryOptimSortKind(memory_allocation.sort_kind);
}
float MemoryOptimizePass::MemoryAllocation::GetSavingRatio() const {
...
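The rewritten RunImpl above builds a list of candidate reuse plans (several clustering granularities times two sort kinds) and keeps the one with the best saving ratio, skipping optimization entirely when nothing helps. A condensed standalone sketch of that selection loop, with a toy allocation type and fabricated ratios:

    // Condensed sketch of the strategy-selection loop above: run every
    // candidate, keep the one with the best saving ratio, bail out if none
    // helps. ToyAllocation and the ratios are fabricated for illustration.
    #include <cstdio>
    #include <functional>
    #include <vector>

    struct ToyAllocation {
      double saving_ratio;
    };

    int main() {
      std::vector<std::function<ToyAllocation()>> strategies = {
          [] { return ToyAllocation{0.12}; },  // e.g. cluster by batch size
          [] { return ToyAllocation{0.31}; },  // e.g. cluster by similar size
          [] { return ToyAllocation{0.27}; },  // e.g. no intervals
      };
      double max_saving_ratio = 0.;
      std::function<ToyAllocation()>* best_strategy = nullptr;
      for (auto& strategy : strategies) {
        auto allocation = strategy();
        if (allocation.saving_ratio > max_saving_ratio) {
          max_saving_ratio = allocation.saving_ratio;
          best_strategy = &strategy;
        }
      }
      if (!best_strategy) return 0;  // poor model for memory optimize: skip
      std::printf("saved %.2f%% of workspace memory\n",
                  (*best_strategy)().saving_ratio * 100);
    }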
...@@ -15,7 +15,7 @@
#pragma once
#include "paddle/fluid/inference/analysis/analysis_pass.h"
-#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
+#include "paddle/fluid/platform/port.h"
namespace paddle {
namespace inference {
...
...@@ -95,12 +95,14 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
  CP_MEMBER(memory_pool_init_size_mb_);
  CP_MEMBER(enable_memory_optim_);
-  CP_MEMBER(memory_optim_force_update_);
+  CP_MEMBER(static_memory_optim_);
+  CP_MEMBER(static_memory_optim_force_update_);
  // TensorRT related.
  CP_MEMBER(use_tensorrt_);
  CP_MEMBER(tensorrt_workspace_size_);
  CP_MEMBER(tensorrt_max_batchsize_);
  CP_MEMBER(tensorrt_min_subgraph_size_);
+  CP_MEMBER(tensorrt_precision_mode_);
  // MKLDNN related.
  CP_MEMBER(use_mkldnn_);
  CP_MEMBER(mkldnn_enabled_op_types_);
...@@ -140,9 +142,9 @@ void contrib::AnalysisConfig::EnableMKLDNN() {
  Update();
}
-void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
-                                                   int max_batch_size,
-                                                   int min_subgraph_size) {
+void contrib::AnalysisConfig::EnableTensorRtEngine(
+    int workspace_size, int max_batch_size, int min_subgraph_size,
+    contrib::AnalysisConfig::Precision precision_mode) {
#ifdef PADDLE_WITH_CUDA
  if (!use_gpu()) {
    LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first";
...@@ -153,6 +155,7 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
  tensorrt_workspace_size_ = workspace_size;
  tensorrt_max_batchsize_ = max_batch_size;
  tensorrt_min_subgraph_size_ = min_subgraph_size;
+  tensorrt_precision_mode_ = precision_mode;
  Update();
#else
...@@ -238,7 +241,8 @@ std::string contrib::AnalysisConfig::SerializeInfoCache() {
  ss << tensorrt_min_subgraph_size_;
  ss << enable_memory_optim_;
-  ss << memory_optim_force_update_;
+  ss << static_memory_optim_;
+  ss << static_memory_optim_force_update_;
  ss << use_mkldnn_;
  for (auto &item : mkldnn_enabled_op_types_) ss << item;
...@@ -278,9 +282,11 @@ float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
#endif
}
-void contrib::AnalysisConfig::EnableMemoryOptim(bool force_update_cache) {
+void contrib::AnalysisConfig::EnableMemoryOptim(
+    bool static_optim, bool force_update_static_cache) {
  enable_memory_optim_ = true;
-  memory_optim_force_update_ = force_update_cache;
+  static_memory_optim_ = static_optim;
+  static_memory_optim_force_update_ = force_update_static_cache;
  Update();
}
...@@ -300,4 +306,16 @@ void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer,
  Update();
}
+NativeConfig contrib::AnalysisConfig::ToNativeConfig() const {
+  NativeConfig config;
+  config.model_dir = model_dir_;
+  config.prog_file = prog_file_;
+  config.param_file = params_file_;
+  config.use_gpu = use_gpu_;
+  config.device = device_id_;
+  config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool();
+  config.specify_input_name = specify_input_name_;
+  return config;
+}
}  // namespace paddle
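ToNativeConfig() moved out of line but its behavior is unchanged. A hedged usage sketch (the model path is hypothetical, and the include assumes the public inference API header used elsewhere in this change):

    // Hedged usage sketch: configure an AnalysisConfig, then derive a
    // NativeConfig from it via the new out-of-line ToNativeConfig().
    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    int main() {
      paddle::contrib::AnalysisConfig config("/path/to/model_dir");  // hypothetical
      config.EnableMemoryOptim();  // defaults to dynamic memory optimize now
      paddle::NativeConfig native = config.ToNativeConfig();
      auto predictor = paddle::CreatePaddlePredictor<paddle::NativeConfig>(native);
      return predictor != nullptr ? 0 : 1;
    }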
...@@ -15,6 +15,7 @@
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include <glog/logging.h>
#include <algorithm>
+#include <fstream>
#include <memory>
#include <string>
#include <vector>
...@@ -25,6 +26,7 @@
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_type_traits.h"
+#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
...@@ -37,6 +39,8 @@
#if PADDLE_WITH_TENSORRT
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
#endif
DECLARE_bool(profile);
...@@ -44,6 +48,12 @@ DECLARE_bool(profile);
namespace paddle {
using contrib::AnalysisConfig;
+using inference::Singleton;
+#if PADDLE_WITH_TENSORRT
+using inference::tensorrt::TRTInt8Calibrator;
+using inference::tensorrt::TRTCalibratorEngine;
+using inference::tensorrt::TRTCalibratorEngineManager;
+#endif
namespace {
bool IsPersistable(const framework::VarDesc *var) {
...@@ -113,6 +123,15 @@ bool AnalysisPredictor::PrepareProgram(
  if (!program) {
    if (!LoadProgramDesc()) return false;
+    // If not cloned, the parameters should be loaded.
+    // If config_.ir_optim() is True, parameters are loaded in
+    // OptimizeInferenceProgram(), but other persistable variables
+    // (like RAW type vars) are not created in the scope.
+    // If config_.ir_optim() is False, parameters are loaded in
+    // LoadParameters(), and the other persistable variables still need to be
+    // created.
+    // So in both cases, create the persistable variables first.
+    executor_->CreateVariables(*inference_program_, 0, true, sub_scope_);
    // Optimize the program, and load parameters and modify them in the
    // scope_.
    // This will change the scope_ address.
...@@ -120,15 +139,6 @@ bool AnalysisPredictor::PrepareProgram(
    status_ir_optim_enabled_ = true;
    OptimizeInferenceProgram();
  } else {
-    // If the parent_scope is passed, we assert that the persistable variables
-    // are already created, so just create the no persistable variables.
-    // If not cloned, the parameters should be loaded
-    // OptimizeInferenceProgram.
-    // So in both cases, just the local variables are needed to load, not the
-    // parematers.
-    executor_->CreateVariables(*inference_program_, 0, true, sub_scope_);
    // Load parameters
    LOG(INFO) << "load parameters ";
    LoadParameters();
...@@ -298,15 +308,15 @@ void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch,
bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
                                 framework::Scope *scope) {
  VLOG(3) << "Predictor::get_fetch";
-  outputs->resize(fetchs_.size());
-  for (size_t i = 0; i < fetchs_.size(); ++i) {
-    int idx = boost::get<int>(fetchs_[i]->GetAttr("col"));
+  outputs->resize(fetches_.size());
+  for (size_t i = 0; i < fetches_.size(); ++i) {
+    int idx = boost::get<int>(fetches_[i]->GetAttr("col"));
    PADDLE_ENFORCE((size_t)idx == i);
    framework::LoDTensor &fetch =
        framework::GetFetchVariable(*scope, "fetch", idx);
    auto type = fetch.type();
    auto output = &(outputs->at(i));
-    output->name = fetchs_[idx]->Input("X")[0];
+    output->name = fetches_[idx]->Input("X")[0];
    if (type == framework::proto::VarType::FP32) {
      GetFetchOne<float>(fetch, output);
      output->dtype = PaddleDType::FLOAT32;
...@@ -327,7 +337,9 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
  argument_.SetUseGPU(config_.use_gpu());
  argument_.SetGPUDeviceId(config_.gpu_device_id());
  argument_.SetEnableMemoryOptim(config_.enable_memory_optim());
-  argument_.SetMemoryOptimForceUpdate(config_.memory_optim_force_update_);
+  argument_.SetStaticMemoryOptim(config_.static_memory_optim_);
+  argument_.SetStaticMemoryOptimForceUpdate(
+      config_.static_memory_optim_force_update_);
  argument_.SetModelFromMemory(config_.model_from_memory_);
  // Analyze inference_program
  if (!config_.model_dir().empty()) {
...@@ -337,6 +349,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
                       !config_.params_file().empty(),
                   "Either model_dir or (param_file, prog_file) should be set.");
    PADDLE_ENFORCE(!config_.prog_file().empty());
+    std::string dir = inference::analysis::GetDirRoot(config_.prog_file());
    argument_.SetModelProgramPath(config_.prog_file());
    argument_.SetModelParamsPath(config_.params_file());
  }
...@@ -347,6 +361,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
    argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_);
    argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
    argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_);
+    argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_);
  }
  if (config_.use_mkldnn_) {
...@@ -361,7 +376,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
  }
  argument_.SetIrAnalysisPasses(passes);
  argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses());
-  argument_.SetScopeNotOwned(const_cast<framework::Scope *>(scope_.get()));
+  argument_.SetScopeNotOwned(scope_.get());
  Analyzer().Run(&argument_);
  PADDLE_ENFORCE(argument_.scope_valid());
...@@ -422,10 +437,10 @@ void AnalysisPredictor::PrepareFeedFetch() {
      feed_names_[op->Output("Out")[0]] = idx;
    } else if (op->Type() == "fetch") {
      int idx = boost::get<int>(op->GetAttr("col"));
-      if (fetchs_.size() <= static_cast<size_t>(idx)) {
-        fetchs_.resize(idx + 1);
+      if (fetches_.size() <= static_cast<size_t>(idx)) {
+        fetches_.resize(idx + 1);
      }
-      fetchs_[idx] = op;
+      fetches_[idx] = op;
    }
  }
}
...@@ -567,7 +582,67 @@ bool AnalysisPredictor::LoadParameters() {
  return true;
}
+#if PADDLE_WITH_TENSORRT
+bool AnalysisPredictor::SaveTrtCalibToDisk() {
+  PADDLE_ENFORCE(config_.tensorrt_engine_enabled(),
+                 "This func can be invoked only in trt mode");
+  auto &block = inference_program_->Block(0);
+  for (auto &op_desc : block.AllOps()) {
+    if (op_desc->Type() == "tensorrt_engine") {
+      std::string engine_name =
+          boost::get<std::string>(op_desc->GetAttr("engine_key"));
+      if (!Singleton<TRTCalibratorEngineManager>::Global().Has(engine_name)) {
+        LOG(ERROR) << "You should run the predictor(with trt) on the real data "
+                      "to generate calibration info";
+        return false;
+      }
+      TRTCalibratorEngine *calib_engine =
+          Singleton<TRTCalibratorEngineManager>::Global().Get(engine_name);
+      LOG(INFO) << "Wait for calib threads done.";
+      calib_engine->calib_->waitAndSetDone();
+      LOG(INFO) << "Generating TRT Calibration table data, this may cost a lot "
+                   "of time...";
+      calib_engine->thr_->join();
+      std::string calibration_table_data =
+          calib_engine->calib_->getCalibrationTableAsString();
+      if (calibration_table_data.empty()) {
+        LOG(ERROR) << "the calibration table is empty.";
+        return false;
+      }
+      std::string model_opt_cache_dir =
+          argument_.Has("model_dir")
+              ? argument_.model_dir()
+              : inference::analysis::GetDirRoot(argument_.model_program_path());
+      std::string calibration_table_data_path =
+          inference::analysis::GetTrtCalibPath(
+              inference::analysis::GetOrCreateModelOptCacheDir(
+                  model_opt_cache_dir),
+              engine_name);
+      std::ofstream ofile(calibration_table_data_path, std::ios::out);
+      LOG(INFO) << "Write Paddle-TRT INT8 calibration table data to file "
+                << calibration_table_data_path;
+      ofile << calibration_table_data;
+      ofile.close();
+    }
+  }
+  // Free all calibrator resources.
+  Singleton<TRTCalibratorEngineManager>::Global().DeleteALL();
+  return true;
+}
+#endif
AnalysisPredictor::~AnalysisPredictor() {
+#if PADDLE_WITH_TENSORRT
+  if (config_.tensorrt_engine_enabled() &&
+      config_.tensorrt_precision_mode_ == AnalysisConfig::Precision::kInt8 &&
+      Singleton<TRTCalibratorEngineManager>::Global().Has()) {
+    SaveTrtCalibToDisk();
+  }
+#endif
  if (FLAGS_profile) {
    platform::DisableProfiler(platform::EventSortingKey::kTotal,
                              "./profile.log");
...@@ -638,12 +713,12 @@ bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() {
  // check if the cache exists
  if (!config_.enable_memory_optim()) {
    need = false;
-  } else if (config_.enable_memory_optim() &&
+  } else if (config_.static_memory_optim_ &&
             !inference::IsFileExists(inference::analysis::GetMemoryCachePath(
                 config_.model_dir(), config_.prog_file()))) {
    need = true;
-  } else if (config_.enable_memory_optim() &&
-             config_.memory_optim_force_update_) {
+  } else if (config_.static_memory_optim_ &&
+             config_.static_memory_optim_force_update_) {
    need = true;
  }
...@@ -651,6 +726,10 @@ bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() {
  return need;
}
+std::string AnalysisPredictor::GetSeriazlizedProgram() const {
+  return inference_program_->Proto()->SerializeAsString();
+}
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<contrib::AnalysisConfig>(
    const contrib::AnalysisConfig &config) {
...
...@@ -75,6 +75,8 @@ class AnalysisPredictor : public PaddlePredictor {
  void SetMkldnnThreadID(int tid);
+  std::string GetSeriazlizedProgram() const override;
 protected:
  // For memory optimization.
  bool need_collect_var_shapes_for_memory_optim();
...@@ -97,6 +99,21 @@ class AnalysisPredictor : public PaddlePredictor {
  void GetFetchOne(const framework::LoDTensor &fetchs,
                   PaddleTensor *output_data);
+#if PADDLE_WITH_TENSORRT
+  // When we use the Paddle-TRT INT8 engine, we first need to generate
+  // calibration table data. The calibration table contains the value range of
+  // each op's inputs and outputs; the whole process can be divided into
+  // several steps:
+  //
+  // 1. Build a 32-bit engine, run it on the calibration set, and record a
+  //    histogram of the distribution of activation values for each tensor.
+  // 2. Build a calibration table from the histograms.
+  //
+  // After step 2, the calibration table is stored on disk.
+  bool SaveTrtCalibToDisk();
+#endif
  // Some more detailed tests are made friends of the predictor, so that
  // all the details can be tested.
#if PADDLE_WITH_TESTING
...@@ -115,7 +132,7 @@ class AnalysisPredictor : public PaddlePredictor {
  std::shared_ptr<framework::ProgramDesc> inference_program_;
  std::vector<framework::OpDesc *> feeds_;
  std::map<std::string, size_t> feed_names_;
-  std::vector<framework::OpDesc *> fetchs_;
+  std::vector<framework::OpDesc *> fetches_;
  // Memory buffer for feed inputs. The temporary LoDTensor will cause serious
  // concurrency problems, wrong results and memory leak, so cache them.
  std::vector<framework::LoDTensor> feed_tensors_;
...
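Putting the pieces of this change together, the INT8 workflow described in the analysis_predictor.h comment above looks roughly like the following from user code. This is a hedged sketch: the model path is hypothetical, EnableUseGpu is the pre-existing GPU switch assumed here, and a real calibration run must feed representative batches rather than the empty input borrowed from the demo test:

    // Hedged sketch of the Paddle-TRT INT8 calibration flow described above.
    #include <vector>
    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    int main() {
      paddle::contrib::AnalysisConfig config("/path/to/model_dir");  // hypothetical
      config.EnableUseGpu(100 /*MB pool*/, 0 /*gpu id*/);  // assumed GPU switch
      config.EnableTensorRtEngine(
          1 << 20 /*workspace*/, 1 /*max batch*/, 3 /*min subgraph*/,
          paddle::contrib::AnalysisConfig::Precision::kInt8);
      auto predictor =
          paddle::CreatePaddlePredictor<paddle::contrib::AnalysisConfig>(config);
      std::vector<paddle::PaddleTensor> outputs;
      predictor->Run({}, &outputs);  // run on calibration data to fill histograms
      // When the predictor is destroyed, SaveTrtCalibToDisk() writes
      // _opt_cache/trt_calib_<engine_key> next to the model.
      return 0;
    }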
...@@ -215,6 +215,8 @@ TEST(AnalysisPredictor, memory_optim) {
  {
    // The first predictor helps to cache the memory optimize strategy.
    auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
+    LOG(INFO) << "serialized program: " << predictor->GetSeriazlizedProgram();
+    ASSERT_FALSE(predictor->GetSeriazlizedProgram().empty());
    // Run several times to check the parameters are not reused by mistake.
    for (int i = 0; i < 5; i++) {
...
...@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+#include <sstream>
+#include "paddle/fluid/framework/commit.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
...@@ -97,4 +99,12 @@ void PaddleBuf::Free() {
  }
}
+std::string get_version() {
+  std::stringstream ss;
+  ss << "version: " << framework::paddle_version() << "\n";
+  ss << "commit: " << framework::paddle_commit() << "\n";
+  ss << "branch: " << framework::paddle_compile_branch() << "\n";
+  return ss.str();
+}
}  // namespace paddle
...@@ -61,4 +61,10 @@ TEST(paddle_inference_api, demo) {
  predictor->Run({}, &outputs);
}
+TEST(paddle_inference_api, get_version) {
+  LOG(INFO) << "paddle version:\n" << get_version();
+  auto version = get_version();
+  ASSERT_FALSE(version.empty());
+}
}  // namespace paddle
...@@ -42,6 +42,10 @@ struct AnalysisConfig {
  explicit AnalysisConfig(const std::string& model_dir);
  explicit AnalysisConfig(const std::string& prog_file,
                          const std::string& params_file);
+  enum class Precision {
+    kFloat32 = 0,
+    kInt8,
+  };
  /** Set model with a directory.
   */
...@@ -135,7 +139,8 @@ struct AnalysisConfig {
   * subgraph is less than this, it will not transfer to TensorRT engine.
   */
  void EnableTensorRtEngine(int workspace_size = 1 << 20,
-                           int max_batch_size = 1, int min_subgraph_size = 3);
+                           int max_batch_size = 1, int min_subgraph_size = 3,
+                           Precision precision = Precision::kFloat32);
  /** A boolean state telling whether the TensorRT engine is used.
   */
  bool tensorrt_engine_enabled() const { return use_tensorrt_; }
...@@ -162,17 +167,7 @@ struct AnalysisConfig {
  /** Transform the AnalysisConfig to NativeConfig.
   */
-  NativeConfig ToNativeConfig() const {
-    NativeConfig config;
-    config.model_dir = model_dir_;
-    config.prog_file = prog_file_;
-    config.param_file = params_file_;
-    config.use_gpu = use_gpu_;
-    config.device = device_id_;
-    config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool();
-    config.specify_input_name = specify_input_name_;
-    return config;
-  }
+  NativeConfig ToNativeConfig() const;
  /** Specify the operator type list to use MKLDNN acceleration.
   * @param op_list the operator type list.
   */
...@@ -195,7 +190,8 @@ struct AnalysisConfig {
  /** Turn on memory optimize
   * NOTE still in development, will be released later.
   */
-  void EnableMemoryOptim(bool force_update_cache = false);
+  void EnableMemoryOptim(bool static_optim = false,
+                         bool force_update_static_cache = false);
  /** Tell whether the memory optimization is activated. */
  bool enable_memory_optim() const;
...@@ -238,10 +234,12 @@ struct AnalysisConfig {
  // We set this variable to control the minimum number of nodes in the
  // subgraph, 3 as default value.
  int tensorrt_min_subgraph_size_{3};
+  Precision tensorrt_precision_mode_;
  // memory reuse related.
  bool enable_memory_optim_{false};
-  bool memory_optim_force_update_{false};
+  bool static_memory_optim_{false};
+  bool static_memory_optim_force_update_{false};
  bool use_mkldnn_{false};
  std::unordered_set<std::string> mkldnn_enabled_op_types_;
...
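EnableMemoryOptim() now distinguishes static and dynamic optimization via its two parameters. A hedged sketch of the three resulting modes:

    // Hedged sketch of the EnableMemoryOptim() variants exposed above.
    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    void ConfigureMemoryOptim(paddle::contrib::AnalysisConfig* config) {
      // Dynamic memory optimize: fake a batch size for -1 dims, no shape cache.
      config->EnableMemoryOptim();
      // Static memory optimize: cluster on recorded shapes from a first run.
      // config->EnableMemoryOptim(true /*static_optim*/);
      // Static, and force the recorded shape cache to be rebuilt.
      // config->EnableMemoryOptim(true, true /*force_update_static_cache*/);
    }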
...@@ -215,6 +215,14 @@ class PaddlePredictor {
   */
  virtual ~PaddlePredictor() = default;
+  /** \brief Get the serialized model program that executes in inference phase.
+   * Its data type is ProgramDesc, which is a protobuf message.
+   */
+  virtual std::string GetSeriazlizedProgram() const {
+    assert(false);  // Force raise error.
+    return "NotImplemented";
+  }
  /** The common configs for all the predictors.
   */
  struct Config {
...@@ -288,4 +296,6 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
int PaddleDtypeSize(PaddleDType dtype);
+std::string get_version();
}  // namespace paddle
...@@ -154,13 +154,16 @@ class GpuPassStrategy : public PassStrategy {
 public:
  GpuPassStrategy() : PassStrategy({}) {
    passes_.assign({
      "infer_clean_graph_pass",                    //
      "conv_affine_channel_fuse_pass",             //
      "conv_eltwiseadd_affine_channel_fuse_pass",  //
      "conv_bn_fuse_pass",                         //
-      "conv_elementwise_add_act_fuse_pass",   //
-      "conv_elementwise_add2_act_fuse_pass",  //
-      "conv_elementwise_add_fuse_pass",       //
+#if CUDNN_VERSION >= 7100  // To run conv_fusion, the version of cudnn must be
+                           // guaranteed at least v7
+      "conv_elementwise_add_act_fuse_pass",   //
+      "conv_elementwise_add2_act_fuse_pass",  //
+      "conv_elementwise_add_fuse_pass",       //
+#endif
    });
    for (int i = 6; i >= 3; i--) {
...
-nv_library(tensorrt_engine SRCS engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context)
+nv_library(tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context)
nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto)
nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
...
...@@ -69,6 +69,13 @@ void TensorRTEngine::FreezeNetwork() {
  // build engine.
  infer_builder_->setMaxBatchSize(max_batch_);
  infer_builder_->setMaxWorkspaceSize(max_workspace_);
+  if (enable_int8_) {
+    infer_builder_->setInt8Mode(true);
+    PADDLE_ENFORCE(
+        calibrator_ != nullptr,
+        "The precision mode is 'INT8', the calibrator should not be nullptr");
+    infer_builder_->setInt8Calibrator(calibrator_);
+  }
  infer_engine_.reset(infer_builder_->buildCudaEngine(*infer_network_));
  PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!");
...
...@@ -23,12 +23,14 @@ limitations under the License. */
#include "paddle/fluid/inference/engine.h"
#include "paddle/fluid/inference/tensorrt/helper.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
#include "paddle/fluid/inference/utils/singleton.h"
namespace paddle {
namespace inference {
namespace tensorrt {
+class TRTInt8Calibrator;
/*
 * TensorRT Engine.
 *
...@@ -55,13 +57,16 @@ class TensorRTEngine : public EngineBase {
  };
  TensorRTEngine(int max_batch, int max_workspace, cudaStream_t stream,
-                int device = 0,
+                int device = 0, bool enable_int8 = false,
+                TRTInt8Calibrator* calibrator = nullptr,
                 nvinfer1::ILogger& logger = NaiveLogger::Global())
      : max_batch_(max_batch),
        max_workspace_(max_workspace),
        stream_(stream),
-        logger_(logger),
-        device_(device) {}
+        device_(device),
+        enable_int8_(enable_int8),
+        calibrator_(calibrator),
+        logger_(logger) {}
  virtual ~TensorRTEngine();
...@@ -139,8 +144,8 @@ class TensorRTEngine : public EngineBase {
  // In the normal case, paddle-trt has a bug when running googlenet.
  // When there are more than two convolutions of 1 * 1 with the same input, the
  // paddle-tensorrt will do the merging optimization, which fuses those convs
-  // into
-  // one conv, and then trigger bug. So, We should use strategy to avoid this
+  // into one conv, and then triggers the bug. So we should use a strategy to
+  // avoid this
  // optimization for the time being. This bug will be fixed in the future.
  std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
      itensor_quote_num;
...@@ -153,9 +158,14 @@ class TensorRTEngine : public EngineBase {
  // the max memory size the engine uses
  int max_workspace_;
+  cudaStream_t stream_;
+  // The specific GPU id that the TensorRTEngine is bound to.
+  int device_;
+  bool enable_int8_;
+  TRTInt8Calibrator* calibrator_;
  // batch size of the current data, will be updated each execution.
  int batch_size_{-1};
-  cudaStream_t stream_;
  nvinfer1::ILogger& logger_;
...@@ -165,8 +175,6 @@ class TensorRTEngine : public EngineBase {
  std::unordered_map<std::string /*name*/, nvinfer1::ITensor* /*ITensor*/>
      itensor_map_;
-  // The specific GPU id that the TensorRTEngine bounded to.
-  int device_;
  std::vector<std::unique_ptr<plugin::PluginTensorRT>> owned_plugin_;
  // TensorRT related internal members
...
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
#include "glog/logging.h"
namespace paddle {
namespace inference {
namespace tensorrt {
// Set the batch size before constructing the thread that executes the engine.
int TRTInt8Calibrator::getBatchSize() const { return batch_size_; }
TRTInt8Calibrator::TRTInt8Calibrator(
const std::unordered_map<std::string, size_t>& buffers, int batch_size,
std::string engine_name, const platform::Place place)
: batch_size_(batch_size), engine_name_(engine_name) {
int i = 0;
VLOG(4) << "Init a new calibrator: " << engine_name_;
for (const auto it : buffers) {
framework::Tensor temp_tensor;
std::string input_name = it.first;
int data_size = it.second;
int num_ele = data_size / sizeof(int16_t);
framework::DDim data_shape = framework::make_ddim({num_ele});
temp_tensor.Resize(data_shape);
data_tensors_.push_back(temp_tensor);
data_buffers_[input_name] = std::pair<void*, size_t>(
static_cast<void*>(temp_tensor.mutable_data<int16_t>(place)), num_ele);
i += 1;
}
}
TRTInt8Calibrator::TRTInt8Calibrator(const std::string& calib_data)
: batch_size_(0),
calib_running_(false),
data_is_set_(false),
done_(true),
calibration_table_(calib_data) {}
void TRTInt8Calibrator::waitAndSetDone() {
std::unique_lock<std::mutex> lk(mut_);
while ((calib_running_ || data_is_set_) && !done_) cond_.wait(lk);
if (!done_) {
done_ = true;
cond_.notify_all();
}
}
// There might be more than one input for the trt subgraph,
// so we use a map to store the input information.
bool TRTInt8Calibrator::setBatch(
const std::unordered_map<std::string, void*>& data) {
VLOG(3) << "set batch: " << engine_name_;
std::unique_lock<std::mutex> lk(mut_);
// There is one producer and one consumer. The producer sets the batch data
// and the consumer gets it. The size of the data pool is one, so the
// producer has to wait for the consumer to finish processing before it can
// set new data.
while ((calib_running_ || data_is_set_) && (!done_)) cond_.wait(lk);
// done_ is set to true by waitAndSetDone when all calibration data
// have been processed.
if (done_) return false;
// Sets the batch.
for (const auto& it : data) {
auto dataptr = data_buffers_.find(it.first);
if (dataptr == data_buffers_.end()) {
LOG(FATAL) << "FATAL " << engine_name_ << " input name '" << it.first
<< "' does not match with the buffer names";
}
const auto& d = dataptr->second;
PADDLE_ENFORCE(
cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice),
"Fail to cudaMemcpy %s for %s", engine_name_, it.first);
}
data_is_set_ = true;
cond_.notify_all();
return true;
}
bool TRTInt8Calibrator::getBatch(void** bindings, const char** names,
int num_bindings) {
VLOG(4) << "get batch: " << engine_name_;
std::unique_lock<std::mutex> lk(mut_);
// The consumer has just finished processing a batch of data.
// The producer can set the data again.
calib_running_ = false;
cond_.notify_all();
// As long as there is data in the pool, the consumer can get it.
while (!data_is_set_ && !done_) cond_.wait(lk);
if (done_) return false;
// Gets the batch
for (int i = 0; i < num_bindings; i++) {
auto it = data_buffers_.find(names[i]);
if (it == data_buffers_.end()) {
LOG(FATAL) << "Calibration engine asked for unknown tensor name '"
<< names[i] << "' at position " << i;
}
bindings[i] = it->second.first;
}
data_is_set_ = false;
calib_running_ = true;
VLOG(4) << "get batch done: " << engine_name_;
return true;
}
void TRTInt8Calibrator::setDone() {
std::unique_lock<std::mutex> lk(mut_);
done_ = true;
cond_.notify_all();
}
const void* TRTInt8Calibrator::readCalibrationCache(size_t& length) {
if (calibration_table_.empty()) return nullptr;
length = calibration_table_.size();
return calibration_table_.data();
}
void TRTInt8Calibrator::writeCalibrationCache(const void* ptr,
std::size_t length) {
calibration_table_ = std::string((const char*)ptr, length);
VLOG(4) << "Got calibration data for " << engine_name_ << " " << ptr
<< " length=" << length;
}
TRTInt8Calibrator::~TRTInt8Calibrator() {
VLOG(4) << "Destroying calibrator for " << engine_name_;
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
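setBatch() and getBatch() above implement a single-slot producer/consumer handshake: data_is_set_ is the full/empty flag of the slot, calib_running_ marks the consumer as busy, and done_ drains both sides at shutdown. A self-contained sketch of the same handshake (the SingleSlot name and int payload are mine, chosen for brevity):

#include <condition_variable>
#include <mutex>

// Single-slot exchange: the producer waits until the slot is empty, the
// consumer waits until it is full; set_done() releases both sides.
class SingleSlot {
 public:
  bool put(int value) {  // cf. TRTInt8Calibrator::setBatch
    std::unique_lock<std::mutex> lk(mu_);
    cv_.wait(lk, [&] { return !full_ || done_; });
    if (done_) return false;
    slot_ = value;
    full_ = true;
    cv_.notify_all();
    return true;
  }
  bool take(int* value) {  // cf. TRTInt8Calibrator::getBatch
    std::unique_lock<std::mutex> lk(mu_);
    cv_.wait(lk, [&] { return full_ || done_; });
    if (done_) return false;
    *value = slot_;
    full_ = false;
    cv_.notify_all();
    return true;
  }
  void set_done() {  // cf. setDone / waitAndSetDone
    std::lock_guard<std::mutex> lk(mu_);
    done_ = true;
    cv_.notify_all();
  }
 private:
  std::mutex mu_;
  std::condition_variable cv_;
  int slot_ = 0;
  bool full_ = false;
  bool done_ = false;
};

The Paddle version keeps the extra calib_running_ flag because TensorRT is still reading the buffers after getBatch() returns; the producer must also wait out a consumer that is still running, not just a full slot.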
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <atomic>
#include <condition_variable>
#include <memory>
#include <mutex>
#include <string>
#include <thread>
#include <unordered_map>
#include <utility>
#include <vector>
#include <NvInfer.h>
#include <cuda_runtime_api.h>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace inference {
namespace tensorrt {
class TensorRTEngine;
struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator {
public:
TRTInt8Calibrator(const std::unordered_map<std::string, size_t>& buffers,
int batch_size, std::string engine_name,
const platform::Place place);
explicit TRTInt8Calibrator(const std::string& calibration_data);
~TRTInt8Calibrator();
int getBatchSize() const override;
bool getBatch(void* bindings[], const char* names[],
int num_bindings) override;
bool setBatch(const std::unordered_map<std::string, void*>& data);
void setDone();
void waitAndSetDone();
const void* readCalibrationCache(std::size_t& length) override;
void writeCalibrationCache(const void* ptr, std::size_t length) override;
const std::string& getCalibrationTableAsString() {
return calibration_table_;
}
private:
const int batch_size_;
bool calib_running_{true};
bool data_is_set_{false};
bool done_{false};
std::mutex mut_;
std::condition_variable cond_;
std::unordered_map<std::string, std::pair<void*, size_t>> data_buffers_;
std::vector<framework::Tensor> data_tensors_;
std::string engine_name_;
std::string calibration_table_;
};
class TRTCalibratorEngine {
public:
TRTCalibratorEngine() {}
std::unique_ptr<TRTInt8Calibrator> calib_;
std::unique_ptr<std::thread> thr_;
std::unique_ptr<TensorRTEngine> engine_;
};
/*
* Manager to control the TensorRT Int8 calibrator creation and deletion.
*/
class TRTCalibratorEngineManager {
public:
bool Has() const { return res_.size() > 0; }
bool Has(const std::string& name) const {
if (res_.count(name) == 0) return false;
return res_.at(name).get() != nullptr;
}
// Get Int8Calibrator via name
TRTCalibratorEngine* Get(const std::string& name) const {
return res_.at(name).get();
}
// Look up or create a calibrator.
TRTCalibratorEngine* LookupOrCreate(const std::string& engine_name) {
if (res_.count(engine_name) == 0) {
auto* p = new TRTCalibratorEngine;
res_[engine_name].reset(p);
}
return res_.at(engine_name).get();
}
// Create an Int8Calibrator
TRTCalibratorEngine* Create(const std::string& engine_name) {
auto* p = new TRTCalibratorEngine;
res_[engine_name].reset(p);
return p;
}
void DeleteALL() {
for (auto& item : res_) {
item.second.reset(nullptr);
}
}
private:
std::unordered_map<std::string, std::unique_ptr<TRTCalibratorEngine>> res_;
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
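TRTCalibratorEngineManager above is a name-keyed lookup-or-create registry over std::unique_ptr. A generic sketch of that pattern (the Registry template is illustrative, not Paddle code); it also shows why Get() on an unknown name throws, via std::unordered_map::at, while LookupOrCreate() always succeeds:

#include <memory>
#include <string>
#include <unordered_map>

// Name-keyed registry; LookupOrCreate constructs missing entries on demand.
template <typename T>
class Registry {
 public:
  T* LookupOrCreate(const std::string& name) {
    auto& slot = res_[name];       // operator[] inserts an empty unique_ptr
    if (!slot) slot.reset(new T);  // create on first use
    return slot.get();
  }
  T* Get(const std::string& name) const { return res_.at(name).get(); }
  bool Has(const std::string& name) const {
    auto it = res_.find(name);
    return it != res_.end() && it->second != nullptr;
  }
 private:
  std::unordered_map<std::string, std::unique_ptr<T>> res_;
};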
@@ -54,6 +54,7 @@ else()
   message(WARNING "These tests has been disabled in OSX or WITH_MKL=OFF before being fixed: \n test_analyzer_seq_pool1")
 endif()
 # RNN2
 set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2")
 download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz")
@@ -115,6 +116,10 @@ if (NOT EXISTS ${MOBILENET_INSTALL_DIR})
 endif()
 inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc SERIAL)
+# googlenet
+inference_analysis_api_test_with_fake_data(test_analyzer_googlenet
+        "${INFERENCE_DEMO_INSTALL_DIR}/googlenet" analyzer_resnet50_tester.cc "googlenet.tar.gz" SERIAL)
 # resnet50
 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
         "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz" SERIAL)
......
@@ -253,7 +253,7 @@ void compare(bool use_mkldnn = false) {
 }
 // Compare result of NativeConfig and AnalysisConfig with memory optimization.
-TEST(Analyzer_dam, compare_with_memory_optim) {
+TEST(Analyzer_dam, compare_with_static_memory_optim) {
   // The small dam will core in CI, but works locally.
   if (FLAGS_max_turn_num == 9) {
     contrib::AnalysisConfig cfg, cfg1;
@@ -263,7 +263,7 @@ TEST(Analyzer_dam, compare_with_memory_optim) {
     SetInput(&input_slots_all);
     // Run the first time to force to update memory cache
     SetConfig(&cfg);
-    cfg.EnableMemoryOptim(true);
+    cfg.EnableMemoryOptim(true, true /*force update*/);
     CompareNativeAndAnalysis(
         reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
@@ -271,7 +271,7 @@ TEST(Analyzer_dam, compare_with_memory_optim) {
     // Run second time to use the memory cache and perform memory optimization.
     SetConfig(&cfg1);
-    cfg1.EnableMemoryOptim();
+    cfg1.EnableMemoryOptim(true, false /*do not force update*/);
     CompareNativeAndAnalysis(
         reinterpret_cast<const PaddlePredictor::Config *>(&cfg1),
@@ -279,6 +279,24 @@ TEST(Analyzer_dam, compare_with_memory_optim) {
   }
 }
+TEST(Analyzer_dam, compare_with_dynamic_memory_optim) {
+  // The small dam will core in CI, but works locally.
+  if (FLAGS_max_turn_num == 9) {
+    contrib::AnalysisConfig cfg, cfg1;
+    DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+    std::vector<std::vector<PaddleTensor>> input_slots_all;
+    SetInput(&input_slots_all);
+    // Run the first time to force to update memory cache
+    SetConfig(&cfg);
+    cfg.EnableMemoryOptim();
+    CompareNativeAndAnalysis(
+        reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+        input_slots_all);
+  }
+}
 TEST(Analyzer_dam, compare) { compare(); }
 #ifdef PADDLE_WITH_MKLDNN
......
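The test rename above tracks a widened EnableMemoryOptim signature. Judging by the inline comments, the first flag selects static memory optimization and the second forces the cached memory plan to be rebuilt, while the zero-argument form covers the dynamic path. A small sketch of the three call shapes (the helper and the flag meanings are my own reading of this diff, not the declared API):

#include "paddle/fluid/inference/api/paddle_inference_api.h"

// Sketch: the three memory-optimization modes the DAM tests exercise.
void ConfigureMemoryOptim(paddle::contrib::AnalysisConfig* cfg, int mode) {
  if (mode == 0) {
    cfg->EnableMemoryOptim(true, true);   // static optim, force cache update
  } else if (mode == 1) {
    cfg->EnableMemoryOptim(true, false);  // static optim, reuse cached plan
  } else {
    cfg->EnableMemoryOptim();             // defaults: dynamic optimization
  }
}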
@@ -14,6 +14,7 @@
 #include "paddle/fluid/memory/allocation/legacy_allocator.h"
 #include <string>
+#include <utility>
 #include <vector>
 #include "glog/logging.h"
 #include "paddle/fluid/memory/detail/buddy_allocator.h"
@@ -37,7 +38,7 @@ template <typename Place>
 void *Alloc(const Place &place, size_t size);
 template <typename Place>
-void Free(const Place &place, void *p);
+void Free(const Place &place, void *p, size_t size);
 template <typename Place>
 size_t Used(const Place &place);
@@ -52,6 +53,11 @@ size_t memory_usage(const platform::Place &p);
 using BuddyAllocator = detail::BuddyAllocator;
+std::unordered_map</*device id*/ int,
+                   std::pair</*current memory usage*/ uint64_t,
+                             /*peak memory usage*/ uint64_t>>
+    gpu_mem_info;
 BuddyAllocator *GetCPUBuddyAllocator() {
   // We tried thread_local for the inference::RNN1 model, but it does not
   // help much for multi-thread tests.
@@ -98,7 +104,8 @@ void *Alloc<platform::CPUPlace>(const platform::CPUPlace &place, size_t size) {
 }
 template <>
-void Free<platform::CPUPlace>(const platform::CPUPlace &place, void *p) {
+void Free<platform::CPUPlace>(const platform::CPUPlace &place, void *p,
+                              size_t size) {
   VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
   GetCPUBuddyAllocator()->Free(p);
 }
@@ -177,9 +184,16 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
     LOG(WARNING) << "GPU memory used: "
                  << string::HumanReadableSize(Used<platform::CUDAPlace>(place));
     platform::SetDeviceId(cur_dev);
-  }
-  if (FLAGS_init_allocated_mem) {
-    cudaMemset(ptr, 0xEF, size);
+  } else {
+    gpu_mem_info[place.device].first += size;
+    if (gpu_mem_info[place.device].first > gpu_mem_info[place.device].second) {
+      gpu_mem_info[place.device].second = gpu_mem_info[place.device].first;
+      VLOG(3) << "device: " << place.device << " peak memory usage : "
+              << (gpu_mem_info[place.device].second >> 20) << " MiB";
+    }
+    if (FLAGS_init_allocated_mem) {
+      cudaMemset(ptr, 0xEF, size);
+    }
   }
   return ptr;
 #else
@@ -188,9 +202,11 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
 }
 template <>
-void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p) {
+void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
+                               size_t size) {
 #ifdef PADDLE_WITH_CUDA
   GetGPUBuddyAllocator(place.device)->Free(p);
+  gpu_mem_info[place.device].first -= size;
 #else
   PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
 #endif
@@ -243,7 +259,7 @@ void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
 template <>
 void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
-                                     void *p) {
+                                     void *p, size_t size) {
 #ifdef PADDLE_WITH_CUDA
   GetCUDAPinnedBuddyAllocator()->Free(p);
 #else
@@ -264,15 +280,17 @@ struct AllocVisitor : public boost::static_visitor<void *> {
 };
 struct FreeVisitor : public boost::static_visitor<void> {
-  inline explicit FreeVisitor(void *ptr) : ptr_(ptr) {}
+  inline explicit FreeVisitor(void *ptr, size_t size)
+      : ptr_(ptr), size_(size) {}
   template <typename Place>
   inline void operator()(const Place &place) const {
-    Free<Place>(place, ptr_);
+    Free<Place>(place, ptr_, size_);
  }
  private:
   void *ptr_;
+  size_t size_;
 };
 size_t Usage::operator()(const platform::CPUPlace &cpu) const {
@@ -304,8 +322,9 @@ Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
 }
 void LegacyAllocator::Free(Allocation *allocation) {
-  boost::apply_visitor(legacy::FreeVisitor(allocation->ptr()),
-                       allocation->place());
+  boost::apply_visitor(
+      legacy::FreeVisitor(allocation->ptr(), allocation->size()),
+      allocation->place());
   delete allocation;
 }
 } // namespace allocation
......
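The gpu_mem_info bookkeeping above is plain high-water-mark accounting: add on Alloc, subtract on Free, and raise the peak whenever the current level exceeds it. A self-contained sketch of the same scheme (names are mine; the pair layout mirrors the declaration in the hunk):

#include <cstddef>
#include <cstdint>
#include <unordered_map>
#include <utility>

// device id -> (current bytes in use, peak bytes observed)
static std::unordered_map<int, std::pair<uint64_t, uint64_t>> mem_info;

void OnAlloc(int device, size_t size) {
  auto& info = mem_info[device];
  info.first += size;
  if (info.first > info.second) info.second = info.first;  // new high-water mark
}

void OnFree(int device, size_t size) {
  mem_info[device].first -= size;  // needs the original allocation size
}

This is also why the rest of the patch threads size through every Free overload and through FreeVisitor: without the original allocation size, the current-usage counter could not be decremented on free.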
@@ -13,6 +13,7 @@ add_subdirectory(detection)
 add_subdirectory(elementwise)
 add_subdirectory(fused)
 add_subdirectory(metrics)
+add_subdirectory(ngraph)
 add_subdirectory(optimizers)
 add_subdirectory(reduce_ops)
 add_subdirectory(sequence_ops)
@@ -66,7 +67,7 @@ set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler tree2col)
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search)
 if (WITH_GPU)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu)
 endif()
@@ -86,7 +87,6 @@ set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies")
 cc_test(gather_test SRCS gather_test.cc DEPS tensor)
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor math_function)
 cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
-cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op)
 cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
 cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
......
@@ -12,205 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <algorithm>
-#include <map>
+#include "paddle/fluid/operators/beam_search_op.h"
 #include <string>
 #include <vector>
-#include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/beam_search_op.h"
 namespace paddle {
 namespace operators {
void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
const framework::LoDTensor &pre_scores,
framework::LoDTensor *selected_ids,
framework::LoDTensor *selected_scores) {
auto abs_lod = framework::ToAbsOffset(ids_->lod());
auto &high_level = abs_lod[lod_level_];
auto items = SelectTopBeamSizeItems(pre_ids, pre_scores);
auto selected_items = ToMap(items, high_level.back());
VLOG(3) << "selected_items:";
for (size_t i = 0; i < selected_items.size(); ++i) {
VLOG(3) << "offset:" << i;
for (auto &item : selected_items[i]) {
VLOG(3) << ItemToString(item);
}
}
PruneEndBeams(pre_ids, &selected_items);
// calculate the output tensor's height
size_t num_instances = std::accumulate(
std::begin(selected_items), std::end(selected_items), 0,
[](size_t a, std::vector<Item> &b) { return a + b.size(); });
// the output tensor shape should be [num_instances, 1]
auto dims = framework::make_ddim(
std::vector<int64_t>({static_cast<int>(num_instances), 1}));
selected_ids->Resize(dims);
selected_scores->Resize(dims);
std::map<size_t /*offset*/, std::vector<Item>> hash;
framework::LoD new_lod;
auto *ids_data = selected_ids->mutable_data<int64_t>(platform::CPUPlace());
auto *scores_data =
selected_scores->mutable_data<float>(platform::CPUPlace());
// fill in data
std::vector<size_t> low_level;
size_t low_offset = 0;
for (auto &items : selected_items) {
low_level.push_back(low_offset);
for (auto &item : items) {
ids_data[low_offset] = item.id;
scores_data[low_offset] = item.score;
low_offset++;
}
}
low_level.push_back(low_offset);
// fill lod
framework::LoD lod(2);
lod[0].assign(high_level.begin(), high_level.end());
lod[1].assign(low_level.begin(), low_level.end());
if (!framework::CheckLoD(lod)) {
PADDLE_THROW("lod %s is not right", framework::LoDToString(lod));
}
selected_ids->set_lod(lod);
selected_scores->set_lod(lod);
}
void BeamSearch::PruneEndBeams(const framework::LoDTensor &pre_ids,
std::vector<std::vector<Item>> *items) {
auto *pre_ids_data = pre_ids.data<int64_t>();
auto abs_lod = framework::ToAbsOffset(ids_->lod());
auto &high_level = abs_lod[lod_level_];
for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) {
size_t src_prefix_start = high_level[src_idx];
size_t src_prefix_end = high_level[src_idx + 1];
bool finish_flag = true;
for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++) {
for (auto &item : items->at(offset)) {
if (item.id != static_cast<size_t>(end_id_) ||
pre_ids_data[offset] != end_id_) {
finish_flag = false;
break;
}
}
if (!finish_flag) break;
}
if (finish_flag) {  // all branches of the beam (source sentence) end and
// prune this beam
for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++)
items->at(offset).clear();
}
}
}
std::vector<std::vector<BeamSearch::Item>> BeamSearch::ToMap(
const std::vector<std::vector<Item>> &items, size_t element_num) {
std::vector<std::vector<Item>> result;
result.resize(element_num);
for (auto &entries : items) {
for (const auto &item : entries) {
result[item.offset].push_back(item);
}
}
return result;
}
std::vector<std::vector<BeamSearch::Item>> BeamSearch::SelectTopBeamSizeItems(
const framework::LoDTensor &pre_ids,
const framework::LoDTensor &pre_scores) {
std::vector<std::vector<Item>> result;
std::vector<Item> items;
// for each source sentence, select the top beam_size items across all
// candidate sets.
while (NextItemSet(pre_ids, pre_scores, &items)) {
std::nth_element(
std::begin(items), std::begin(items) + beam_size_, std::end(items),
[](const Item &a, const Item &b) { return a.score > b.score; });
// prune the top beam_size items.
if (items.size() > beam_size_) {
items.resize(beam_size_);
}
result.emplace_back(items);
}
VLOG(3) << "SelectTopBeamSizeItems result size " << result.size();
for (auto &items : result) {
VLOG(3) << "item set:";
for (auto &item : items) {
VLOG(3) << ItemToString(item);
}
}
return result;
}
// the candidates of a source
bool BeamSearch::NextItemSet(const framework::LoDTensor &pre_ids,
const framework::LoDTensor &pre_scores,
std::vector<BeamSearch::Item> *items) {
if (sent_offset_ >= ids_->NumElements(lod_level_)) {
return false;
}
// find the current candidates
auto ids = *ids_;
auto scores = *scores_;
auto abs_lod = framework::ToAbsOffset(ids.lod());
auto *ids_data = ids.data<int64_t>();
auto *scores_data = scores.data<float>();
size_t instance_dim = 1;
for (int i = 1; i < ids.dims().size(); i++) {
instance_dim *= ids.dims()[i];
}
auto *pre_ids_data = pre_ids.data<int64_t>();
auto *pre_scores_data = pre_scores.data<float>();
items->clear();
items->reserve(framework::product(ids.dims()));
for (size_t offset = abs_lod[lod_level_][sent_offset_];
offset < abs_lod[lod_level_][sent_offset_ + 1]; offset++) {
auto pre_id = pre_ids_data[offset];
auto pre_score = pre_scores_data[offset];
if (pre_id == end_id_) {
// Allocate all probability mass to eos_id for finished branches and the
// other candidate ids can be ignored.
items->emplace_back(offset, end_id_, pre_score);
} else {
for (size_t d = 0; d < instance_dim; d++) {
const size_t dim_offset = offset * instance_dim + d;
items->emplace_back(offset, ids_data[dim_offset],
scores_data[dim_offset]);
}
}
}
sent_offset_++;
return true;
}
std::ostream &operator<<(std::ostream &os, const BeamSearch::Item &item) {
os << "{";
os << "offset: " << item.offset << ", ";
os << "id: " << item.id << ", ";
os << "score: " << item.score << "";
os << "}";
return os;
}
std::string ItemToString(const BeamSearch::Item &item) {
std::ostringstream stream;
stream << item;
return stream.str();
}
 class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
@@ -219,18 +29,23 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
              "(LoDTensor) The LoDTensor containing the selected ids at the "
              "previous step. It should be a tensor with shape (batch_size, 1) "
              "and lod `[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at "
-             "thefirst step.");
+             "the first step.");
     AddInput("pre_scores",
              "(LoDTensor) The LoDTensor containing the accumulated "
              "scores corresponding to the selected ids at the previous step.");
     AddInput("ids",
              "(LoDTensor) The LoDTensor containing the candidates ids. Its "
-             "shape should be (batch_size * beam_size, K), where K supposed to "
-             "be beam_size.");
+             "shape should be (batch_size * beam_size, W). If not set, it will "
+             "be calculated out according to Input(scores) in this operator.")
+        .AsDispensable();
     AddInput("scores",
-             "(LoDTensor) The LodTensor containing the accumulated scores "
-             "corresponding to Input(ids) and its shape is the same as the "
-             "shape of Input(ids).");
+             "(LoDTensor) The LoDTensor containing the current scores "
+             "corresponding to Input(ids). If Input(ids) is not nullptr, its "
+             "shape is the same as that of Input(ids). "
+             "If is_accumulated is true, Input(scores) holds accumulated "
+             "scores and is used directly. Otherwise, each score is "
+             "transformed to the log field and accumulated with "
+             "Input(pre_scores) first.");
     AddOutput("selected_ids",
               "A LodTensor that stores the IDs selected by beam search.");
     AddOutput("selected_scores",
@@ -242,6 +57,9 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int>("beam_size", "beam size for beam search");
     AddAttr<int>("end_id",
                  "the token id which indicates the end of a sequence");
+    AddAttr<bool>("is_accumulated",
+                  "Whether the Input(scores) is accumulated scores.")
+        .SetDefault(true);
     AddComment(R"DOC(
 This operator does the search in beams for one time step.
@@ -265,10 +83,9 @@ class BeamSearchOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
- protected:
   void InferShape(framework::InferShapeContext *ctx) const override {
     for (const std::string &arg :
-         std::vector<std::string>({"pre_ids", "ids", "scores"})) {
+         std::vector<std::string>({"pre_ids", "scores"})) {
       PADDLE_ENFORCE(ctx->HasInput(arg), "BeamSearch needs input argument '%s'",
                      arg);
     }
@@ -279,12 +96,22 @@ class BeamSearchOp : public framework::OperatorWithKernel {
     }
   }
+ protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    framework::OpKernelType kt = framework::OpKernelType(
-        ctx.Input<framework::LoDTensor>("pre_ids")->type(),
-        platform::CPUPlace());
-    return kt;
+    auto *scores = ctx.Input<framework::LoDTensor>("scores");
+    size_t level = ctx.Attr<int>("level");
+    size_t batch_size = scores->lod()[level].size() - 1;
+    // The current CUDA kernel only supports cases with batch_size <= 4.
+    // Compute on CPU for larger batch sizes.
+    if (batch_size <= 4) {
+      return framework::OpKernelType(
+          ctx.Input<framework::LoDTensor>("pre_ids")->type(), ctx.GetPlace());
+    } else {
+      return framework::OpKernelType(
+          ctx.Input<framework::LoDTensor>("pre_ids")->type(),
+          platform::CPUPlace());
+    }
   }
 };
......
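The new is_accumulated attribute changes how a step's scores enter the beam: when it is false, each per-step probability is first mapped into log space and added to its prefix's accumulated score. A one-function sketch of that transform, following the OpMaker description above rather than the exact kernel code:

#include <cmath>

// Combine a prefix's accumulated score with one candidate's step score.
float AccumulatedScore(float pre_score, float step_score, bool is_accumulated) {
  if (is_accumulated) return step_score;    // scores already carry the total
  return pre_score + std::log(step_score);  // log-probability accumulation
}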
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/beam_search_op.h"
#include "paddle/fluid/framework/op_registry.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
beam_search,
ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, int>,
ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
@@ -14,187 +14,12 @@ limitations under the License. */
 #pragma once
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/math/beam_search.h"
 namespace paddle {
 namespace operators {
/*
* This is an implementation of beam search.
*
* To explain the details, let's take the machine translation task as an
* example. In this task, one source sentence is translated into multiple
* target sentences; during this period, one sentence will be translated into
* multiple translation prefixes (target sentences that have not ended). In
* each time step a prefix will have some candidates; given the candidate ids
* and their corresponding scores (probabilities), it will sort and select the
* top beam_size candidates for each source sentence, and store the selected
* candidates' scores and their corresponding ids into LoDTensors.
*
* A detailed example:
*
* Input
*
* ids:
* LoD (should have 2 levels)
* first level: [0, 1, 4]
* second level: [0, 1, 2, 3, 4]
*
* tensor's data
* [
* [4, 2, 5]
* [2, 1, 3]
* [3, 5, 2]
* [8, 2, 1]
* ]
*
* scores:
* LoD same as `ids`
* tensor's data
* [
* [0.5, 0.3, 0.2]
* [0.6, 0.3, 0.1]
* [0.9, 0.5, 0.1]
* [0.7, 0.5, 0.1]
* ]
*
* the inputs mean that there are 2 source sentences to translate; the first
* source has 1 prefix, and the second source has 2 prefixes.
*
* let's assume the beam size is 2, and the beam search's output should be
* LoD
* first level:
* [0, 1, 2]
* second level:
* [0, 2, 4]
*
* id tensor's data
* [[
* 4,
* 1,
* 3,
* 8,
* ]]
*
* score tensor's data
* [[
* 0.5,
* 0.3,
* 0.9,
* 0.7
* ]]
*
* TODO all the prune operations should be in the beam search, so it is better
* to split the beam search algorithm into a sequence of smaller operators, and
* the prune operators can be inserted in this sequence.
*/
class BeamSearch {
public:
// TODO(superjom) make type customizable
using id_t = size_t;
using score_t = float;
/*
* Input the arguments that needed by this class.
*/
BeamSearch(const framework::LoDTensor& ids,
const framework::LoDTensor& scores, size_t level, size_t beam_size,
int end_id)
: beam_size_(beam_size),
ids_(&ids),
scores_(&scores),
lod_level_(level),
end_id_(end_id) {}
/*
* The main function of beam search.
*
* @selected_ids: a [None, 1]-shaped tensor with LoD.
* In a machine translation model, it might be the candidate term id sets,
* each set stored as a variable-length sequence.
* The format might be described with a two-level LoD
* - [[0 1]
* - [0 1 2]]
* - [[]
* - [0 1]]
* the first level of LoD tells that there are two source sentences. The
* second level describes the details of the candidate id set's offsets in
* the
* source sentences.
*
* @selected_scores: a LoD tensor with the same shape and LoD with
* selected_ids.
* It stores the corresponding scores of candidate ids in selected_ids.
*
* Return false if all the input tensors are empty; in a machine translation
* task that means no candidates are provided, and the task will stop running.
*/
void operator()(const framework::LoDTensor& pre_ids,
const framework::LoDTensor& pre_scores,
framework::LoDTensor* selected_ids,
framework::LoDTensor* selected_scores);
/*
* The basic items help to sort.
*/
struct Item {
Item() {}
Item(size_t offset, size_t id, float score)
: offset(offset), id(id), score(score) {}
// offset in the higher lod level.
size_t offset;
// // prefix id in the lower lod level.
// size_t prefix;
// the candidate id
id_t id;
// the corresponding score
score_t score;
};
protected:
/*
* Prune the source sentences whose branches are all finished; it is optional.
* Pruning must happen one step later than finishing (thus pre_ids is needed
* here), since the end tokens must be written out.
*/
void PruneEndBeams(const framework::LoDTensor& pre_ids,
std::vector<std::vector<Item>>* items);
/*
* Transform the items into a map whose key is offset, value is the items.
* NOTE low performance.
*/
std::vector<std::vector<Item>> ToMap(
const std::vector<std::vector<Item>>& inputs, size_t element_num);
/*
* For each source, select top beam_size records.
*/
std::vector<std::vector<Item>> SelectTopBeamSizeItems(
const framework::LoDTensor& pre_ids,
const framework::LoDTensor& pre_scores);
/*
* Get the items of next source sequence, return false if no remaining items.
*/
bool NextItemSet(const framework::LoDTensor& pre_ids,
const framework::LoDTensor& pre_scores,
std::vector<Item>* items);
private:
size_t beam_size_;
const framework::LoDTensor* ids_;
const framework::LoDTensor* scores_;
size_t lod_level_{0};
size_t sent_offset_{0};
int end_id_{0};
};
std::ostream& operator<<(std::ostream& os, const BeamSearch::Item& item);
std::string ItemToString(const BeamSearch::Item& item);
 template <typename DeviceContext, typename T>
 class BeamSearchOpKernel : public framework::OpKernel<T> {
  public:
@@ -203,7 +28,7 @@ class BeamSearchOpKernel : public framework::OpKernel<T> {
     auto* scores = context.Input<framework::LoDTensor>("scores");
     auto* pre_ids = context.Input<framework::LoDTensor>("pre_ids");
     auto* pre_scores = context.Input<framework::LoDTensor>("pre_scores");
-    PADDLE_ENFORCE_NOT_NULL(ids);
     PADDLE_ENFORCE_NOT_NULL(scores);
     PADDLE_ENFORCE_NOT_NULL(pre_ids);
     PADDLE_ENFORCE_NOT_NULL(pre_scores);
@@ -211,14 +36,20 @@ class BeamSearchOpKernel : public framework::OpKernel<T> {
     size_t level = context.Attr<int>("level");
     size_t beam_size = context.Attr<int>("beam_size");
     int end_id = context.Attr<int>("end_id");
-    BeamSearch alg(*ids, *scores, level, beam_size, end_id);
+    bool is_accumulated = context.Attr<bool>("is_accumulated");
     auto selected_ids = context.Output<framework::LoDTensor>("selected_ids");
     auto selected_scores =
         context.Output<framework::LoDTensor>("selected_scores");
     PADDLE_ENFORCE_NOT_NULL(selected_ids);
     PADDLE_ENFORCE_NOT_NULL(selected_scores);
-    alg(*pre_ids, *pre_scores, selected_ids, selected_scores);
+    math::BeamSearchFunctor<DeviceContext, T> alg;
+    alg(context.template device_context<DeviceContext>(), pre_ids, pre_scores,
+        ids, scores, selected_ids, selected_scores, level, beam_size, end_id,
+        is_accumulated);
   }
 };
 } // namespace operators
 } // namespace paddle
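At its core, what the kernel now delegates to math::BeamSearchFunctor is a per-prefix top-k: for each source prefix, keep the beam_size highest-scoring candidates. A self-contained sketch of that selection step, using std::nth_element the way the removed SelectTopBeamSizeItems did (Item and TopBeam are illustrative names):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

struct Item {
  size_t offset;  // position in the higher LoD level
  int64_t id;     // candidate token id
  float score;    // accumulated score
};

// Keep the beam_size best-scoring items of one candidate set.
std::vector<Item> TopBeam(std::vector<Item> items, size_t beam_size) {
  if (items.size() > beam_size) {
    std::nth_element(
        items.begin(), items.begin() + beam_size, items.end(),
        [](const Item& a, const Item& b) { return a.score > b.score; });
    items.resize(beam_size);
  }
  return items;
}

nth_element partitions without fully sorting, which is all beam pruning needs; full ordering can wait until decoding.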
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/beam_search_op.h"
#include <gtest/gtest.h>
#include <vector>
namespace paddle {
namespace test {
using std::vector;
using framework::LoDTensor;
using framework::LoD;
using operators::BeamSearch;
using paddle::platform::CPUPlace;
using std::cout;
using std::endl;
void CreateInput(LoDTensor* ids, LoDTensor* scores) {
LoD lod;
vector<size_t> level0({0, 2, 4});
vector<size_t> level1({0, 1, 2, 3, 4});
lod.push_back(level0);
lod.push_back(level1);
ids->set_lod(lod);
scores->set_lod(lod);
auto dims = framework::make_ddim(vector<int64_t>({4, 3}));
ids->Resize(dims);
scores->Resize(dims);
CPUPlace place;
auto* ids_data = ids->mutable_data<int64_t>(place);
auto* scores_data = scores->mutable_data<float>(place);
vector<int64_t> _ids({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1});
vector<float> _scores(
{0.5f, 0.3f, 0.2f, 0.6f, 0.3f, 0.1f, 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f});
for (int i = 0; i < 12; i++) {
ids_data[i] = _ids[i];
scores_data[i] = _scores[i];
}
}
// It seems that beam_search_op has bugs.
TEST(DISABLED_beam_search_op, run) {
CPUPlace place;
LoDTensor ids, scores;
CreateInput(&ids, &scores);
LoDTensor pre_ids;
pre_ids.Resize(framework::make_ddim(vector<int64_t>(4, 1)));
for (int i = 0; i < 4; i++) {
pre_ids.mutable_data<int64_t>(place)[i] = i + 1;
}
LoDTensor pre_scores;
pre_scores.Resize(framework::make_ddim(vector<int64_t>(4, 1)));
for (int i = 0; i < 4; i++) {
pre_scores.mutable_data<float>(place)[i] = 0.1 * (i + 1);
}
BeamSearch beamsearch(ids, scores, (size_t)0, (size_t)2, 0);
LoDTensor sids, sscores;
beamsearch(pre_ids, pre_scores, &sids, &sscores);
LOG(INFO) << "score: " << sscores << endl;
ASSERT_EQ(sids.lod(), sscores.lod());
vector<int> tids({4, 2, 3, 8});
vector<float> tscores({0.5f, 0.6f, 0.9f, 0.7f});
for (int i = 0; i < 4; i++) {
ASSERT_EQ(tids[i], sids.data<int64_t>()[i]);
ASSERT_EQ(tscores[i], sscores.data<float>()[i]);
}
}
} // namespace test
} // namespace paddle
@@ -87,8 +87,8 @@ class BprLossGradientOpKernel : public framework::OpKernel<T> {
     auto* label = ctx.Input<Tensor>("Label");
     auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    const int step_size = x->dims()[0];
-    const int num_classes = x->dims()[1];
+    const size_t step_size = static_cast<size_t>(x->dims()[0]);
+    const size_t num_classes = static_cast<size_t>(x->dims()[1]);
     T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
     const T* dy_data = dy->data<T>();
     const T* x_data = x->data<T>();
......
@@ -104,9 +104,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
     // ------------------- cudnn conv algorithm ---------------------
     cudnnConvolutionFwdAlgo_t algo;
     auto handle = dev_ctx.cudnn_handle();
-    Tensor cudnn_workspace;
-    void* cudnn_workspace_ptr = nullptr;
+    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
     CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
         cudnn_conv_desc, CUDNN_DEFAULT_MATH));
@@ -120,24 +118,19 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
           workspace_size_limit, &algo));
       VLOG(3) << "cuDNN forward algo " << algo;
     } else {
-      cudnn_workspace =
-          ctx.AllocateTmpTensor<int8_t, platform::CUDADeviceContext>(
-              framework::make_ddim(
-                  {static_cast<int64_t>(workspace_size_limit)}),
-              dev_ctx);
-      cudnn_workspace_ptr = static_cast<void*>(cudnn_workspace.data<int8_t>());
       auto search_func = [&]() {
         int returned_algo_count;
         std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
             fwd_perf_stat;
-        CUDNN_ENFORCE(platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
-            handle, cudnn_input_desc, input_data, cudnn_filter_desc,
-            filter_data, cudnn_conv_desc, cudnn_output_desc, output_data,
-            kNUM_CUDNN_FWD_ALGS, &returned_algo_count, fwd_perf_stat.data(),
-            cudnn_workspace_ptr, workspace_size_limit));
+        auto cudnn_find_func = [&](void* cudnn_workspace) {
+          CUDNN_ENFORCE(
+              platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
+                  handle, cudnn_input_desc, input_data, cudnn_filter_desc,
+                  filter_data, cudnn_conv_desc, cudnn_output_desc, output_data,
+                  kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
+                  fwd_perf_stat.data(), cudnn_workspace, workspace_size_limit));
+        };
+        workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit);
         VLOG(3) << "Perf result: (algo: stat, time, memory)";
         for (int i = 0; i < returned_algo_count; ++i) {
           const auto& stat = fwd_perf_stat[i];
@@ -188,15 +181,6 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
                       "workspace_size to be allocated exceeds the limit");
-    if (!cudnn_workspace_ptr) {
-      cudnn_workspace =
-          ctx.AllocateTmpTensor<int8_t, platform::CUDADeviceContext>(
-              framework::make_ddim(
-                  {static_cast<int64_t>(workspace_size_in_bytes)}),
-              dev_ctx);
-      cudnn_workspace_ptr = static_cast<void*>(cudnn_workspace.data<int8_t>());
-    }
     if ((activation == "identity") && (!residual)) {
       // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is
       // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib.
@@ -204,12 +188,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
       // cudnnConvolutionForward and cudnnAddTensor
       // ------------- cudnn conv forward and bias add ---------------------
       ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
-      CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
-          handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc,
-          filter_data, cudnn_conv_desc, algo, cudnn_workspace_ptr,
-          workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
+      auto cudnn_func = [&](void* cudnn_workspace) {
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
+            handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc,
+            filter_data, cudnn_conv_desc, algo, cudnn_workspace,
+            workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
+      };
+      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
       CUDNN_ENFORCE(platform::dynload::cudnnAddTensor(
           handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc,
           output_data));
@@ -220,13 +205,15 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
       // ------------------- cudnn conv+bias+act forward --------------------
       ScalingParamType<T> alpha1 = 1.0f;
       ScalingParamType<T> alpha2 = residual ? 1.0f : 0.0f;
-      CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
-          handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc,
-          filter_data, cudnn_conv_desc, algo, cudnn_workspace_ptr,
-          workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data,
-          cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc,
-          output_data));
+      auto cudnn_func = [&](void* cudnn_workspace) {
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
+            handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc,
+            filter_data, cudnn_conv_desc, algo, cudnn_workspace,
+            workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data,
+            cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc,
+            output_data));
+      };
+      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
     }
     std::vector<int> channels = ctx.Attr<std::vector<int>>("split_channels");
     if (channels.size()) {
......
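These hunks revert the conv kernels from the temporary-allocator style back to the workspace-handle style: instead of materializing a scratch tensor and passing a raw pointer around, each call site wraps the cuDNN call in a lambda and lets the handle own the workspace for exactly the duration of the call. A minimal sketch of that callback contract (CPU stand-in; no CUDA required; the class here is illustrative, not Paddle's implementation):

#include <cstdlib>
#include <functional>

// Sketch of the RunFunc contract: acquire a scratch buffer, hand it to the
// callback, and release it when the callback returns.
class WorkspaceHandle {
 public:
  void RunFunc(const std::function<void(void*)>& func, size_t required_bytes) {
    void* workspace = std::malloc(required_bytes);  // stand-in for GPU scratch
    func(workspace);        // the cuDNN call happens here
    std::free(workspace);   // the workspace never outlives the call
  }
};

Scoping the buffer to the callback keeps call sites from holding a raw workspace pointer across unrelated code, which is exactly the shape each cudnn_func lambda above restores.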
@@ -104,18 +104,16 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     int output_offset = output->numel() / output->dims()[0] / groups;
     int filter_offset = filter->numel() / groups;
     T alpha = 1.0f, beta = 0.0f;
-    auto temp_allocation =
-        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
-            workspace_size_in_bytes);
-    void* cudnn_workspace = temp_allocation->ptr();
+    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
     for (int g = 0; g < groups; g++) {
-      CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
-          handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
-          cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
-          algo, cudnn_workspace, workspace_size_in_bytes, &beta,
-          cudnn_output_desc, output_data + output_offset * g));
+      auto cudnn_func = [&](void* cudnn_workspace) {
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
+            handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
+            cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
+            algo, cudnn_workspace, workspace_size_in_bytes, &beta,
+            cudnn_output_desc, output_data + output_offset * g));
+      };
+      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
     }
   }
 };
@@ -211,22 +209,20 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
         output_grad->numel() / output_grad->dims()[0] / groups;
     int filter_offset = filter->numel() / groups;
     T alpha = 1.0f, beta = 0.0f;
-    auto temp_allocation =
-        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
-            workspace_size_in_bytes);
-    void* cudnn_workspace = temp_allocation->ptr();
+    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
     if (input_grad) {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset input_grad.
       for (int g = 0; g < groups; g++) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
-            handle, &alpha, cudnn_output_desc,
-            output_grad_data + output_grad_offset * g, cudnn_filter_desc,
-            filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
-            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
-            input_grad_data + input_offset * g));
+        auto cudnn_func = [&](void* cudnn_workspace) {
+          CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
+              handle, &alpha, cudnn_output_desc,
+              output_grad_data + output_grad_offset * g, cudnn_filter_desc,
+              filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
+              cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
+              input_grad_data + input_offset * g));
+        };
+        workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
       }
     }
@@ -236,12 +232,15 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
       // Because beta is zero, it is unnecessary to reset filter_grad.
       // Gradient with respect to the filter
       for (int g = 0; g < groups; g++) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
-            handle, &alpha, cudnn_output_desc,
-            output_grad_data + output_grad_offset * g, cudnn_input_desc,
-            input_data + input_offset * g, cudnn_conv_desc, filter_algo,
-            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_filter_desc,
-            filter_grad_data + filter_offset * g));
+        auto cudnn_func = [&](void* cudnn_workspace) {
+          CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
+              handle, &alpha, cudnn_output_desc,
+              output_grad_data + output_grad_offset * g, cudnn_input_desc,
+              input_data + input_offset * g, cudnn_conv_desc, filter_algo,
+              cudnn_workspace, workspace_size_in_bytes, &beta,
+              cudnn_filter_desc, filter_grad_data + filter_offset * g));
+        };
+        workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
       }
     }
   }
......
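Reviewer note: the two conv-transpose hunks above replace a long-lived temporary allocation with a scoped workspace callback. Below is a minimal sketch of that pattern; WorkspaceRunner is a hypothetical stand-in for the real dev_ctx.cudnn_workspace_handle(), and std::malloc stands in for device allocation.

// Hedged sketch only: the point is the lifetime — the workspace exists
// just for the duration of the callback, instead of spanning the kernel.
#include <cstddef>
#include <cstdlib>
#include <functional>

class WorkspaceRunner {
 public:
  void RunFunc(const std::function<void(void*)>& func, std::size_t bytes) {
    void* workspace = std::malloc(bytes);  // acquire scratch space
    func(workspace);                       // run the cuDNN call against it
    std::free(workspace);                  // release immediately afterwards
  }
};

int main() {
  WorkspaceRunner runner;
  auto cudnn_func = [&](void* cudnn_workspace) {
    // here the kernel would call
    // cudnnConvolutionBackwardData(..., cudnn_workspace, ...)
    (void)cudnn_workspace;
  };
  runner.RunFunc(cudnn_func, 1 << 20);
  return 0;
}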
...@@ -20,7 +20,7 @@ if(WITH_GRPC) ...@@ -20,7 +20,7 @@ if(WITH_GRPC)
collective_client.cc collective_server.cc collective_client.cc collective_server.cc
${GRPC_SRCS} ${GRPC_SRCS}
PROTO send_recv.proto PROTO send_recv.proto
DEPS lod_tensor selected_rows_functor memory) DEPS lod_tensor selected_rows_functor memory scope ${GRPC_DEPS})
set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS}) set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS})
...@@ -32,15 +32,17 @@ else() ...@@ -32,15 +32,17 @@ else()
set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc_server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc) set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc_server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc)
set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc parameter_send.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc parameter_send.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set(BRPC_DEPS brpc ssl crypto protobuf leveldb snappystream snappy zlib)
brpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc brpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc
request_handler_impl.cc rpc_client.cc rpc_server.cc request_handler_impl.cc rpc_client.cc rpc_server.cc
variable_response.cc variable_response.cc
collective_client.cc collective_server.cc collective_client.cc collective_server.cc
${BRPC_SRCS} ${BRPC_SRCS}
PROTO send_recv.proto PROTO send_recv.proto
DEPS lod_tensor selected_rows memory) DEPS lod_tensor selected_rows memory scope ${BRPC_DEPS})
set(RPC_DEPS sendrecvop_rpc brpc ssl crypto protobuf leveldb snappystream snappy zlib) set(RPC_DEPS sendrecvop_rpc ${BRPC_DEPS})
cc_test(brpc_serde_test SRCS brpc/brpc_serde_test.cc cc_test(brpc_serde_test SRCS brpc/brpc_serde_test.cc
DEPS ${RPC_DEPS} gflags glog executor proto_desc lookup_sparse_table_op SERIAL) DEPS ${RPC_DEPS} gflags glog executor proto_desc lookup_sparse_table_op SERIAL)
endif() endif()
......
...@@ -62,7 +62,7 @@ VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep, ...@@ -62,7 +62,7 @@ VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep,
const std::string var_name_val = var_name; const std::string var_name_val = var_name;
const framework::Scope* p_scope = &scope; const framework::Scope* p_scope = &scope;
const auto ch_ptr = GetChannel(ep_val); const auto ch_ptr = GetChannel(ep_val);
const std::string method = "SendRPC"; const std::string method = kSendRPC;
VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope));
framework::AsyncIO([=] { framework::AsyncIO([=] {
...@@ -156,15 +156,18 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, ...@@ -156,15 +156,18 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
const framework::Scope& scope, const framework::Scope& scope,
const std::string& var_name, const std::string& var_name,
const std::string& out_var_name,
const std::string& method_name, const std::string& method_name,
int64_t time_out) { int64_t time_out) {
const platform::DeviceContext* p_ctx = &ctx; const platform::DeviceContext* p_ctx = &ctx;
const std::string ep_val = ep; const std::string ep_val = ep;
const std::string var_name_val = var_name; const std::string var_name_val = var_name;
const std::string out_varname_val = out_var_name;
const framework::Scope* p_scope = &scope; const framework::Scope* p_scope = &scope;
const auto ch_ptr = GetChannel(ep_val); const auto ch_ptr = GetChannel(ep_val);
const std::string method = "GetRPC"; const std::string method = kGetRPC;
VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); VarHandlePtr var_h(
new VarHandle(ep, method, out_varname_val, p_ctx, p_scope));
framework::AsyncIO([=] { framework::AsyncIO([=] {
auto ch_ctx = ch_ptr->Pop(); auto ch_ctx = ch_ptr->Pop();
...@@ -175,6 +178,7 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, ...@@ -175,6 +178,7 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep,
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
req.set_varname(var_name_val); req.set_varname(var_name_val);
req.set_out_varname(out_varname_val);
req.set_trainer_id(trainer_id_); req.set_trainer_id(trainer_id_);
google::protobuf::Closure* done = brpc::NewCallback( google::protobuf::Closure* done = brpc::NewCallback(
...@@ -182,8 +186,10 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, ...@@ -182,8 +186,10 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep,
platform::RecordRPCEvent record_event(method, p_ctx); platform::RecordRPCEvent record_event(method, p_ctx);
if (method_name == "GetMonomerVariable") { if (method_name == kGetMonomerRPC) {
ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done); ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done);
} else if (method_name == kGetNoBarrierRPC) {
ch_ctx->stub->GetVariableNoBarrier(cntl, &req, response, done);
} else { } else {
ch_ctx->stub->GetVariable(cntl, &req, response, done); ch_ctx->stub->GetVariable(cntl, &req, response, done);
} }
...@@ -198,25 +204,39 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, ...@@ -198,25 +204,39 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep,
return var_h; return var_h;
} }
VarHandlePtr BRPCClient::AsyncGetVarNoBarrier(
const std::string& ep, const platform::DeviceContext& ctx,
const framework::Scope& scope, const std::string& var_name,
const std::string& out_var_name, int64_t time_out) {
std::string var_name_no_barrier =
string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE);
return _AsyncGetVar(ep, ctx, scope, var_name_no_barrier, out_var_name,
kGetNoBarrierRPC, time_out);
}
VarHandlePtr BRPCClient::AsyncGetMonomerVariable( VarHandlePtr BRPCClient::AsyncGetMonomerVariable(
const std::string& ep, const platform::DeviceContext& ctx, const std::string& ep, const platform::DeviceContext& ctx,
const framework::Scope& scope, const std::string& var_name, const framework::Scope& scope, const std::string& var_name,
int64_t time_out) { int64_t time_out) {
return _AsyncGetVar(ep, ctx, scope, var_name, "GetMonomerVariable", time_out); return _AsyncGetVar(ep, ctx, scope, var_name, var_name, kGetMonomerRPC,
time_out);
} }
VarHandlePtr BRPCClient::AsyncGetMonomerBarrier(const std::string& ep, VarHandlePtr BRPCClient::AsyncGetMonomerBarrier(const std::string& ep,
const std::string& var_name, const std::string& var_name,
int64_t time_out) { int64_t time_out) {
return AsyncSendMessage(ep, "GetMonomerBarrier", var_name, time_out); return AsyncSendMessage(ep, kSendMonomerFetchBarrierRPC, var_name, time_out);
} }
VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep, VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
const framework::Scope& scope, const framework::Scope& scope,
const std::string& var_name, const std::string& var_name,
const std::string& out_var_name,
int64_t time_out) { int64_t time_out) {
return _AsyncGetVar(ep, ctx, scope, var_name, "GetVariable", time_out); return _AsyncGetVar(ep, ctx, scope, var_name, out_var_name, kGetRPC,
time_out);
} }
VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep,
...@@ -234,7 +254,7 @@ VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, ...@@ -234,7 +254,7 @@ VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep,
const framework::Scope* p_scope = &scope; const framework::Scope* p_scope = &scope;
const auto ch_ptr = GetChannel(ep_val); const auto ch_ptr = GetChannel(ep_val);
const std::string method = "PrefetchRPC"; const std::string method = kPrefetchRPC;
VarHandlePtr var_h( VarHandlePtr var_h(
new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope));
...@@ -270,7 +290,7 @@ VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, ...@@ -270,7 +290,7 @@ VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep,
VarHandlePtr BRPCClient::AsyncSendBatchBarrier(const std::string& ep, VarHandlePtr BRPCClient::AsyncSendBatchBarrier(const std::string& ep,
int64_t time_out) { int64_t time_out) {
return AsyncSendMessage(ep, "BatchBarrierRPC", BATCH_BARRIER_MESSAGE, return AsyncSendMessage(ep, kBatchBarrierRPC, BATCH_BARRIER_MESSAGE,
time_out); time_out);
} }
...@@ -286,7 +306,7 @@ VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep, ...@@ -286,7 +306,7 @@ VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep,
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
req.set_varname(FETCH_BARRIER_MESSAGE); req.set_varname(FETCH_BARRIER_MESSAGE);
const std::string method = "FetchBarrierRPC"; const std::string method = kFetchBarrierRPC;
// var handle // var handle
VarHandlePtr var_h( VarHandlePtr var_h(
new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr));
...@@ -367,7 +387,7 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { ...@@ -367,7 +387,7 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) {
VarHandlePtr BRPCClient::AsyncSendComplete(const std::string& ep, VarHandlePtr BRPCClient::AsyncSendComplete(const std::string& ep,
int64_t time_out) { int64_t time_out) {
return AsyncSendMessage(ep, "SendCompleteRPC", COMPLETE_MESSAGE, time_out); return AsyncSendMessage(ep, kSendCompleteRPC, COMPLETE_MESSAGE, time_out);
} }
void BRPCClient::SendComplete() { void BRPCClient::SendComplete() {
...@@ -394,9 +414,9 @@ VarHandlePtr BRPCClient::AsyncSendVarMessage( ...@@ -394,9 +414,9 @@ VarHandlePtr BRPCClient::AsyncSendVarMessage(
google::protobuf::Closure* done = brpc::NewCallback( google::protobuf::Closure* done = brpc::NewCallback(
&HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
if (method_name == "CheckPointNotifyRPC") { if (method_name == kCheckPointNotifyRPC) {
ch_ctx->stub->CheckpointNotify(cntl, &req, response, done); ch_ctx->stub->CheckpointNotify(cntl, &req, response, done);
} else if (method_name == "GetMonomerBarrier") { } else if (method_name == kSendMonomerFetchBarrierRPC) {
ch_ctx->stub->GetMonomerBarrier(cntl, &req, response, done); ch_ctx->stub->GetMonomerBarrier(cntl, &req, response, done);
} else { } else {
ch_ctx->stub->SendVariable(cntl, &req, response, done); ch_ctx->stub->SendVariable(cntl, &req, response, done);
......
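Reviewer note: the client changes above replace ad-hoc method-name literals with shared constants and add a no-barrier get that tags the variable name with a suffix (see the Sprintf call in AsyncGetVarNoBarrier). A small self-contained sketch of that convention follows; the constant values are illustrative guesses, not the ones defined in Paddle's request_handler.h.

#include <iostream>
#include <string>

constexpr char kGetRPC[] = "GetRPC";                      // assumed value
constexpr char kGetNoBarrierRPC[] = "GetNoBarrierRPC";    // assumed value
constexpr char kWithoutBarrier[] = "@WITHOUT_BARRIER@";   // assumed marker

// The server side strips this suffix again to recover the real name.
std::string NoBarrierName(const std::string& var_name) {
  return var_name + kWithoutBarrier;
}

int main() {
  std::cout << "barrier get:    " << kGetRPC << "\n";
  std::cout << "no-barrier get: " << NoBarrierName("w@GRAD") << " via "
            << kGetNoBarrierRPC << "\n";
  return 0;
}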
...@@ -65,6 +65,7 @@ class BRPCClient : public RPCClient { ...@@ -65,6 +65,7 @@ class BRPCClient : public RPCClient {
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
const framework::Scope& scope, const framework::Scope& scope,
const std::string& var_name, const std::string& var_name,
const std::string& out_var_name,
int64_t time_out = FLAGS_rpc_deadline) override; int64_t time_out = FLAGS_rpc_deadline) override;
VarHandlePtr AsyncGetMonomerBarrier( VarHandlePtr AsyncGetMonomerBarrier(
...@@ -76,6 +77,13 @@ class BRPCClient : public RPCClient { ...@@ -76,6 +77,13 @@ class BRPCClient : public RPCClient {
const framework::Scope& scope, const std::string& var_name, const framework::Scope& scope, const std::string& var_name,
int64_t time_out = FLAGS_rpc_deadline) override; int64_t time_out = FLAGS_rpc_deadline) override;
VarHandlePtr AsyncGetVarNoBarrier(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& var_name,
const std::string& out_varname,
int64_t time_out = FLAGS_rpc_deadline);
VarHandlePtr AsyncPrefetchVar(const std::string& ep, VarHandlePtr AsyncPrefetchVar(const std::string& ep,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
const framework::Scope& scope, const framework::Scope& scope,
...@@ -103,6 +111,7 @@ class BRPCClient : public RPCClient { ...@@ -103,6 +111,7 @@ class BRPCClient : public RPCClient {
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
const framework::Scope& scope, const framework::Scope& scope,
const std::string& var_name, const std::string& var_name,
const std::string& out_var_name,
const std::string& method_name, const std::string& method_name,
int64_t time_out = FLAGS_rpc_deadline); int64_t time_out = FLAGS_rpc_deadline);
......
...@@ -45,6 +45,13 @@ class BRPCServiceImpl : public SendRecvService { ...@@ -45,6 +45,13 @@ class BRPCServiceImpl : public SendRecvService {
rpc_server_->GetThreadNum(distributed::kRequestGet))); rpc_server_->GetThreadNum(distributed::kRequestGet)));
} }
it = rpc_call_map.find(distributed::kRequestGetNoBarrier);
if (it != rpc_call_map.end()) {
request_getnobarrier_h_ = it->second;
getnobarrier_threads_.reset(new paddle::framework::ThreadPool(
rpc_server_->GetThreadNum(distributed::kRequestGetNoBarrier)));
}
it = rpc_call_map.find(distributed::kRequestPrefetch); it = rpc_call_map.find(distributed::kRequestPrefetch);
if (it != rpc_call_map.end()) { if (it != rpc_call_map.end()) {
request_prefetch_h_ = it->second; request_prefetch_h_ = it->second;
...@@ -112,6 +119,14 @@ class BRPCServiceImpl : public SendRecvService { ...@@ -112,6 +119,14 @@ class BRPCServiceImpl : public SendRecvService {
[=] { _GetVariable(cntl_butil, request, response, done); }); [=] { _GetVariable(cntl_butil, request, response, done); });
} }
void GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil,
const VariableMessage* request,
VariableMessage* response,
google::protobuf::Closure* done) override {
getnobarrier_threads_->Run(
[=] { _GetVariableNoBarrier(cntl_butil, request, response, done); });
}
void _GetVariable(google::protobuf::RpcController* cntl_butil, void _GetVariable(google::protobuf::RpcController* cntl_butil,
const VariableMessage* request, VariableMessage* response, const VariableMessage* request, VariableMessage* response,
google::protobuf::Closure* done) { google::protobuf::Closure* done) {
...@@ -122,23 +137,59 @@ class BRPCServiceImpl : public SendRecvService { ...@@ -122,23 +137,59 @@ class BRPCServiceImpl : public SendRecvService {
brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil); brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);
std::string varname = request->varname(); std::string varname = request->varname();
std::string out_varname = request->out_varname();
VLOG(3) << "RequestGet varname:" << varname VLOG(3) << "RequestGet varname:" << varname
<< ", out_varname:" << out_varname
<< ", trainer_id:" << request->trainer_id() << ", trainer_id:" << request->trainer_id()
<< ", from:" << cntl->remote_side(); << ", from:" << cntl->remote_side();
auto scope = request_get_h_->scope(); auto scope = request_get_h_->scope();
auto invar = scope->FindVar(varname); paddle::framework::Variable* invar = nullptr;
int trainer_id = request->trainer_id();
paddle::framework::Variable* outvar = nullptr;
request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id,
out_varname);
if (outvar) {
distributed::SerializeToIOBuf(out_varname, outvar,
*request_get_h_->dev_ctx(), response,
&cntl->response_attachment(), "", false);
}
}
void _GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil,
const VariableMessage* request,
VariableMessage* response,
google::protobuf::Closure* done) {
PADDLE_ENFORCE(request_getnobarrier_h_ != nullptr,
"RequestGetNoBarrier handler should be registed first!");
brpc::ClosureGuard done_guard(done);
brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);
std::string varname = request->varname();
std::string out_varname = request->out_varname();
int trainer_id = request->trainer_id(); int trainer_id = request->trainer_id();
VLOG(3) << "RequestGetNoBarrier varname:" << varname
<< ", out_varname:" << out_varname << ", trainer_id:" << trainer_id
<< ", from:" << cntl->remote_side();
auto scope = request_getnobarrier_h_->scope();
paddle::framework::Variable* invar = nullptr;
paddle::framework::Variable* outvar = nullptr; paddle::framework::Variable* outvar = nullptr;
request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id); request_getnobarrier_h_->Handle(varname, scope, invar, &outvar, trainer_id,
out_varname);
if (outvar) { if (outvar) {
distributed::SerializeToIOBuf(varname, outvar, *request_get_h_->dev_ctx(), distributed::SerializeToIOBuf(
response, &cntl->response_attachment(), "", out_varname, outvar, *request_getnobarrier_h_->dev_ctx(), response,
false); &cntl->response_attachment(), "", false);
} }
} }
void PrefetchVariable(google::protobuf::RpcController* cntl_butil, void PrefetchVariable(google::protobuf::RpcController* cntl_butil,
const VariableMessage* request, const VariableMessage* request,
VariableMessage* response, VariableMessage* response,
...@@ -282,6 +333,7 @@ class BRPCServiceImpl : public SendRecvService { ...@@ -282,6 +333,7 @@ class BRPCServiceImpl : public SendRecvService {
private: private:
distributed::RequestHandler* request_send_h_{nullptr}; distributed::RequestHandler* request_send_h_{nullptr};
distributed::RequestHandler* request_get_h_{nullptr}; distributed::RequestHandler* request_get_h_{nullptr};
distributed::RequestHandler* request_getnobarrier_h_{nullptr};
distributed::RequestHandler* request_prefetch_h_{nullptr}; distributed::RequestHandler* request_prefetch_h_{nullptr};
distributed::RequestHandler* request_checkpoint_h_{nullptr}; distributed::RequestHandler* request_checkpoint_h_{nullptr};
distributed::RequestHandler* request_get_monomer_handler_h_{nullptr}; distributed::RequestHandler* request_get_monomer_handler_h_{nullptr};
...@@ -289,9 +341,10 @@ class BRPCServiceImpl : public SendRecvService { ...@@ -289,9 +341,10 @@ class BRPCServiceImpl : public SendRecvService {
distributed::RPCServer* rpc_server_{nullptr}; distributed::RPCServer* rpc_server_{nullptr};
// FIXME(gongwb): brpc should support process one rpce use one threadpool. // FIXME(gongwb): brpc should support processing one rpc with one threadpool.
std::unique_ptr<paddle::framework::ThreadPool> send_threads_; std::unique_ptr<paddle::framework::ThreadPool> send_threads_;
std::unique_ptr<paddle::framework::ThreadPool> get_threads_; std::unique_ptr<paddle::framework::ThreadPool> get_threads_;
std::unique_ptr<paddle::framework::ThreadPool> getnobarrier_threads_;
std::unique_ptr<paddle::framework::ThreadPool> prefetch_threads_; std::unique_ptr<paddle::framework::ThreadPool> prefetch_threads_;
std::unique_ptr<paddle::framework::ThreadPool> checkpoint_notify_threads_; std::unique_ptr<paddle::framework::ThreadPool> checkpoint_notify_threads_;
}; };
......
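Reviewer note: the server changes above give the new kRequestGetNoBarrier handler its own thread pool, mirroring the existing send/get pools, presumably so a burst of one request type cannot starve the others. A simplified sketch of the register-then-dispatch shape, with std::async standing in for paddle::framework::ThreadPool and all names illustrative:

#include <future>
#include <iostream>
#include <map>
#include <string>

struct Handler { std::string name; };

int main() {
  std::map<std::string, Handler> rpc_call_map = {
      {"RequestGet", {"get"}}, {"RequestGetNoBarrier", {"get_no_barrier"}}};

  // Registration: only wire up a pool if the handler was registered.
  auto it = rpc_call_map.find("RequestGetNoBarrier");
  if (it != rpc_call_map.end()) {
    // Dispatch: run the handler body off the RPC thread, on its own worker.
    auto fut = std::async(std::launch::async, [h = it->second] {
      std::cout << "handling " << h.name << " on its own worker\n";
    });
    fut.wait();
  }
  return 0;
}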
...@@ -54,9 +54,20 @@ bool RequestSendHandler::Handle(const std::string& varname, ...@@ -54,9 +54,20 @@ bool RequestSendHandler::Handle(const std::string& varname,
// Async // Async
if (!sync_mode_) { if (!sync_mode_) {
VLOG(3) << "async process var: " << varname; VLOG(3) << "async process var: " << varname;
executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), if (varname == BATCH_BARRIER_MESSAGE) {
scope); PADDLE_THROW(
delete scope; "async mode should not recv BATCH_BARRIER_MESSAGE or "
"COMPLETE_MESSAGE");
}
try {
executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(),
scope);
delete scope;
} catch (std::exception& e) {
LOG(ERROR) << "async: run sub program error " << e.what();
return false;
}
return true; return true;
} else { // sync } else { // sync
rpc_server_->WaitCond(kRequestSend); rpc_server_->WaitCond(kRequestSend);
......
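Reviewer note: the async-mode hunk above adds two guards — reject barrier messages outright, and isolate failures of the per-gradient sub-program so they surface as a failed RPC instead of tearing down the server. A distilled sketch of the second guard (RunSubProgram stands in for executor_->RunPreparedContext):

#include <exception>
#include <iostream>
#include <stdexcept>

bool RunSubProgram(bool should_fail) {
  if (should_fail) throw std::runtime_error("op error");
  return true;
}

bool HandleAsync(bool should_fail) {
  try {
    return RunSubProgram(should_fail);
  } catch (std::exception& e) {
    std::cerr << "async: run sub program error " << e.what() << "\n";
    return false;  // reported to the client as an RPC failure
  }
}

int main() {
  std::cout << HandleAsync(true) << "\n";   // prints 0 after logging the error
  std::cout << HandleAsync(false) << "\n";  // prints 1
  return 0;
}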
...@@ -39,27 +39,33 @@ void RPCServer::SavePort() const { ...@@ -39,27 +39,33 @@ void RPCServer::SavePort() const {
port_file.open(file_path); port_file.open(file_path);
port_file << selected_port_; port_file << selected_port_;
port_file.close(); port_file.close();
VLOG(4) << "selected port written to " << file_path; VLOG(3) << "selected port written to " << file_path;
} }
void RPCServer::WaitBarrier(const std::string& rpc_name) { void RPCServer::WaitBarrier(const std::string& rpc_name) {
VLOG(3) << "WaitBarrier in: " << rpc_name;
std::unique_lock<std::mutex> lock(this->mutex_); std::unique_lock<std::mutex> lock(this->mutex_);
barrier_cond_.wait(lock, [this, &rpc_name] { barrier_cond_.wait(lock, [this, &rpc_name] {
return ((barrier_counter_[rpc_name] == client_num_ && client_num_ != 0) || return ((barrier_counter_[rpc_name] == client_num_ && client_num_ != 0) ||
exit_flag_.load()); exit_flag_.load());
}); });
VLOG(3) << "batch_barrier_: " << rpc_name << " " VLOG(3) << "WaitBarrier out: " << rpc_name
<< barrier_counter_[rpc_name]; << " counter: " << barrier_counter_[rpc_name];
} }
void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) {
VLOG(4) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name;
// a barrier msg must make sure that it arrives under the right cond (send|recv)
WaitCond(rpc_name);
int b = 0; int b = 0;
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
b = ++barrier_counter_[rpc_name]; b = ++barrier_counter_[rpc_name];
VLOG(3) << rpc_name << " barrier_counter: " << b;
if (b >= client_num_) { if (b >= client_num_) {
lock.unlock(); lock.unlock();
VLOG(3) << "BatchBarrier counter reach " << client_num_ << " for "
<< rpc_name;
barrier_cond_.notify_all(); barrier_cond_.notify_all();
lock.lock(); lock.lock();
} }
...@@ -71,7 +77,7 @@ void RPCServer::Complete() { ...@@ -71,7 +77,7 @@ void RPCServer::Complete() {
client_num_--; client_num_--;
need_reset_all_vars_ = true; need_reset_all_vars_ = true;
VLOG(4) << "decrease client_num to: " << client_num_; VLOG(3) << "decrease client_num to: " << client_num_;
if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) { if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) {
barrier_counter_[kRequestGet]--; barrier_counter_[kRequestGet]--;
} }
...@@ -105,8 +111,8 @@ void RPCServer::RegisterRPC(const std::string& rpc_name, ...@@ -105,8 +111,8 @@ void RPCServer::RegisterRPC(const std::string& rpc_name,
static int cond = -1; static int cond = -1;
rpc_cond_map_[rpc_name] = ++cond; rpc_cond_map_[rpc_name] = ++cond;
VLOG(4) << "RegisterRPC rpc_name:" << rpc_name << ", handler:" << handler VLOG(3) << "RegisterRPC rpc_name: " << rpc_name << ", handler: " << handler
<< ", cond:" << rpc_cond_map_[rpc_name]; << ", cond: " << rpc_cond_map_[rpc_name];
} }
void RPCServer::SetCond(const std::string& rpc_name) { void RPCServer::SetCond(const std::string& rpc_name) {
...@@ -120,7 +126,7 @@ void RPCServer::SetCond(const std::string& rpc_name) { ...@@ -120,7 +126,7 @@ void RPCServer::SetCond(const std::string& rpc_name) {
} }
void RPCServer::WaitCond(const std::string& rpc_name) { void RPCServer::WaitCond(const std::string& rpc_name) {
VLOG(4) << "RPCServer WaitCond " << rpc_name; VLOG(3) << "RPCServer WaitCond in " << rpc_name;
int cond = 0; int cond = 0;
{ {
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
...@@ -130,6 +136,7 @@ void RPCServer::WaitCond(const std::string& rpc_name) { ...@@ -130,6 +136,7 @@ void RPCServer::WaitCond(const std::string& rpc_name) {
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
rpc_cond_.wait( rpc_cond_.wait(
lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); }); lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); });
VLOG(3) << "RPCServer WaitCond out " << rpc_name;
} }
void RPCServer::RegisterVar(const std::string& var_name, void RPCServer::RegisterVar(const std::string& var_name,
...@@ -151,7 +158,7 @@ void RPCServer::RegisterVar(const std::string& var_name, ...@@ -151,7 +158,7 @@ void RPCServer::RegisterVar(const std::string& var_name,
} }
rpc_cond_.notify_all(); rpc_cond_.notify_all();
VLOG(4) << "RegisterVar context:" << h.String(); VLOG(3) << "RegisterVar context:" << h.String();
} }
void RPCServer::IncreaseVarBarrier(const std::string& var_name) { void RPCServer::IncreaseVarBarrier(const std::string& var_name) {
...@@ -167,11 +174,11 @@ void RPCServer::IncreaseVarBarrier(const std::string& var_name) { ...@@ -167,11 +174,11 @@ void RPCServer::IncreaseVarBarrier(const std::string& var_name) {
barrier_cond_.notify_all(); barrier_cond_.notify_all();
} }
VLOG(4) << "IncreaseVarBarrier context:" << h.String(); VLOG(3) << "IncreaseVarBarrier context:" << h.String();
} }
void RPCServer::WaitVarBarrier(const std::string& var_name) { void RPCServer::WaitVarBarrier(const std::string& var_name) {
VLOG(4) << "WaitBarrier var_name:" << var_name; VLOG(3) << "WaitVarBarrier var_name:" << var_name;
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
barrier_cond_.wait(lock, [&]() { barrier_cond_.wait(lock, [&]() {
...@@ -179,11 +186,11 @@ void RPCServer::WaitVarBarrier(const std::string& var_name) { ...@@ -179,11 +186,11 @@ void RPCServer::WaitVarBarrier(const std::string& var_name) {
exit_flag_.load()); exit_flag_.load());
}); });
VLOG(4) << "WaitBarrier context: " << var_map_[var_name].String(); VLOG(3) << "WaitVarBarrier context: " << var_map_[var_name].String();
} }
void RPCServer::SetVarCond(const std::string& var_name) { void RPCServer::SetVarCond(const std::string& var_name) {
VLOG(4) << "SetVarCond var_name:" << var_name; VLOG(3) << "SetVarCond var_name:" << var_name;
{ {
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
if (var_map_.find(var_name) != var_map_.end()) { if (var_map_.find(var_name) != var_map_.end()) {
...@@ -193,14 +200,14 @@ void RPCServer::SetVarCond(const std::string& var_name) { ...@@ -193,14 +200,14 @@ void RPCServer::SetVarCond(const std::string& var_name) {
} }
void RPCServer::WaitVarCond(const std::string& var_name) { void RPCServer::WaitVarCond(const std::string& var_name) {
VLOG(4) << "WaitVarCond var_name:" << var_name; VLOG(3) << "WaitVarCond var_name:" << var_name;
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
rpc_cond_.wait(lock, [=] { rpc_cond_.wait(lock, [=] {
return (var_map_.find(var_name) != var_map_.end() || exit_flag_.load()); return (var_map_.find(var_name) != var_map_.end() || exit_flag_.load());
}); });
VLOG(4) << "WaitVarCond var_name:" << var_name << " end"; VLOG(3) << "WaitVarCond var_name:" << var_name << " end";
} }
MonomerHandle RPCServer::GetMonomer(const std::string& var_name) { MonomerHandle RPCServer::GetMonomer(const std::string& var_name) {
......
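Reviewer note: the WaitBarrier/IncreaseBatchBarrier pair above is a counter guarded by a mutex, with a condition variable that releases every waiter once all clients have checked in (or the server is exiting). A minimal self-contained sketch of the same mechanism, stripped of the per-RPC-name bookkeeping:

#include <atomic>
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

class Barrier {
 public:
  explicit Barrier(int clients) : client_num_(clients) {}

  void Wait() {
    std::unique_lock<std::mutex> lock(mutex_);
    cond_.wait(lock,
               [this] { return counter_ >= client_num_ || exit_.load(); });
  }

  void Increase() {
    std::unique_lock<std::mutex> lock(mutex_);
    if (++counter_ >= client_num_) {
      lock.unlock();        // mirror the unlock-before-notify in the diff
      cond_.notify_all();
    }
  }

 private:
  std::mutex mutex_;
  std::condition_variable cond_;
  int counter_{0};
  int client_num_;
  std::atomic<bool> exit_{false};  // stands in for exit_flag_
};

int main() {
  Barrier b(2);
  std::thread t1([&] { b.Increase(); });
  std::thread t2([&] { b.Increase(); });
  b.Wait();  // returns once both "clients" arrived
  t1.join();
  t2.join();
  std::cout << "all clients arrived\n";
  return 0;
}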
...@@ -117,8 +117,9 @@ bool VariableResponse::CopyLodTensorData( ...@@ -117,8 +117,9 @@ bool VariableResponse::CopyLodTensorData(
tensor->mutable_data(ctx.GetPlace(), ToVarType(meta_.data_type())); tensor->mutable_data(ctx.GetPlace(), ToVarType(meta_.data_type()));
VLOG(6) << "Tensor.memory_size = " << tensor->memory_size() VLOG(6) << "Tensor.memory_size = " << tensor->memory_size()
<< ", Buffer Size = " << length; << ", Buffer Size = " << length << ", dims:" << dims
PADDLE_ENFORCE_EQ(tensor->memory_size(), static_cast<unsigned int>(length)); << ", numel:" << tensor->numel();
PADDLE_ENFORCE_GE(tensor->memory_size(), static_cast<unsigned int>(length));
return ReadRaw(input, ctx, tensor->place(), tensor_data, length); return ReadRaw(input, ctx, tensor->place(), tensor_data, length);
} }
......
...@@ -137,7 +137,9 @@ void ListenAndServOp::RunSyncLoop( ...@@ -137,7 +137,9 @@ void ListenAndServOp::RunSyncLoop(
while (true) { while (true) {
// Get from multiple trainers; we don't care about the order in which // Get from multiple trainers; we don't care about the order in which
// the gradients arrive, just add suffix 0~n and merge the gradient. // the gradients arrive, just add suffix 0~n and merge the gradient.
VLOG(3) << "wait all clients to send gradient";
rpc_service_->SetCond(distributed::kRequestSend); rpc_service_->SetCond(distributed::kRequestSend);
VLOG(3) << "wait all clients to send send_barrier";
rpc_service_->WaitBarrier(distributed::kRequestSend); rpc_service_->WaitBarrier(distributed::kRequestSend);
if (rpc_service_->IsExit()) { if (rpc_service_->IsExit()) {
...@@ -168,12 +170,16 @@ void ListenAndServOp::RunSyncLoop( ...@@ -168,12 +170,16 @@ void ListenAndServOp::RunSyncLoop(
} }
ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program, ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program,
recv_scope); recv_scope);
VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; VLOG(3) << "run all blocks spent " << GetTimestamp() - ts << "(ms)";
VLOG(3) << "ResetReceivedVars";
ResetReceivedVars(recv_scope, dev_ctx, rpc_service_->NeedResetAllVars()); ResetReceivedVars(recv_scope, dev_ctx, rpc_service_->NeedResetAllVars());
VLOG(3) << "wait all clients to get parameters back";
rpc_service_->SetCond(distributed::kRequestGet); rpc_service_->SetCond(distributed::kRequestGet);
VLOG(3) << "wait all clients to send fetch_barrier";
rpc_service_->WaitBarrier(distributed::kRequestGet); rpc_service_->WaitBarrier(distributed::kRequestGet);
VLOG(3) << "ResetBarrierCounter";
rpc_service_->ResetBarrierCounter(); rpc_service_->ResetBarrierCounter();
} // while(true) } // while(true)
} }
......
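Reviewer note: the extra VLOGs above make the phases of RunSyncLoop legible. A schematic of one synchronous step, with RpcServerStub as a stand-in for distributed::RPCServer (only the calls used by the loop; the real loop also handles exit and var resets):

#include <iostream>
#include <string>

struct RpcServerStub {
  void SetCond(const std::string& rpc) { std::cout << "open " << rpc << "\n"; }
  void WaitBarrier(const std::string& rpc) {
    std::cout << "barrier " << rpc << "\n";
  }
  void ResetBarrierCounter() { std::cout << "reset counters\n"; }
};

int main() {
  RpcServerStub rpc;
  rpc.SetCond("RequestSend");      // 1. accept gradients from all trainers
  rpc.WaitBarrier("RequestSend");  // 2. every trainer sent its send_barrier
  std::cout << "run optimize blocks, reset received vars\n";  // 3.
  rpc.SetCond("RequestGet");       // 4. trainers fetch updated parameters
  rpc.WaitBarrier("RequestGet");   // 5. every trainer sent its fetch_barrier
  rpc.ResetBarrierCounter();       // 6. next step starts from zero
  return 0;
}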
...@@ -43,9 +43,9 @@ class MergeIdsOpKernel : public framework::OpKernel<T> { ...@@ -43,9 +43,9 @@ class MergeIdsOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ(ids.size(), outs.size(), PADDLE_ENFORCE_EQ(ids.size(), outs.size(),
"the number of Ids and Out should be the same"); "the number of Ids and Out should be the same");
size_t row_ids_size = 0; int64_t row_ids_size = 0;
int row_size = 0; int64_t row_size = 0;
int embedding_size = 0; int64_t embedding_size = 0;
for (size_t i = 0; i < x_tensors.size(); ++i) { for (size_t i = 0; i < x_tensors.size(); ++i) {
const auto *x_tensor = x_tensors[i]; const auto *x_tensor = x_tensors[i];
...@@ -69,7 +69,7 @@ class MergeIdsOpKernel : public framework::OpKernel<T> { ...@@ -69,7 +69,7 @@ class MergeIdsOpKernel : public framework::OpKernel<T> {
for (size_t i = 0; i < x_tensors.size(); ++i) { for (size_t i = 0; i < x_tensors.size(); ++i) {
const auto *row_id = row_ids[i]; const auto *row_id = row_ids[i];
for (int j = 0; j < row_id->numel(); ++j) { for (auto j = 0; j < row_id->numel(); ++j) {
int64_t key = row_id->data<int64_t>()[j]; int64_t key = row_id->data<int64_t>()[j];
std::tuple<int64_t, int64_t> val = std::make_tuple(i, j); std::tuple<int64_t, int64_t> val = std::make_tuple(i, j);
selected_rows_idx_map.insert(std::make_pair(key, val)); selected_rows_idx_map.insert(std::make_pair(key, val));
...@@ -84,13 +84,13 @@ class MergeIdsOpKernel : public framework::OpKernel<T> { ...@@ -84,13 +84,13 @@ class MergeIdsOpKernel : public framework::OpKernel<T> {
out->set_lod(out_ids->lod()); out->set_lod(out_ids->lod());
int nums = static_cast<int>(out_ids->dims()[0]); auto nums = out_ids->dims()[0];
auto *out_data = out->mutable_data<T>( auto *out_data = out->mutable_data<T>(
framework::make_ddim({nums, embedding_size}), place); framework::make_ddim({nums, embedding_size}), place);
for (int j = 0; j < nums; ++j) { for (auto j = 0; j < nums; ++j) {
int id = out_ids->data<int64_t>()[j]; auto id = out_ids->data<int64_t>()[j];
auto row_tuple = selected_rows_idx_map[id]; auto row_tuple = selected_rows_idx_map.at(id);
int64_t row_idx = std::get<1>(row_tuple); auto row_idx = std::get<1>(row_tuple);
const auto *x_tensor = x_tensors[std::get<0>(row_tuple)]; const auto *x_tensor = x_tensors[std::get<0>(row_tuple)];
memcpy(out_data + embedding_size * j, memcpy(out_data + embedding_size * j,
......
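Reviewer note: alongside the int-to-int64_t widening, the merge_ids hunk swaps operator[] for .at(). That is defensive: operator[] on an absent id silently default-constructs an entry, while .at() throws, turning a missing-id bug into a loud failure. A minimal illustration:

#include <cstdint>
#include <iostream>
#include <map>
#include <stdexcept>
#include <tuple>

int main() {
  std::map<int64_t, std::tuple<int64_t, int64_t>> idx_map;
  idx_map[7] = std::make_tuple(0, 3);

  auto ok = idx_map.at(7);               // found: (0, 3)
  std::cout << std::get<1>(ok) << "\n";  // prints 3

  try {
    idx_map.at(8);                       // absent id: throws immediately
  } catch (const std::out_of_range&) {
    std::cout << "id 8 was never inserted\n";
  }
  return 0;
}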
...@@ -277,68 +277,6 @@ class TransformFunctor { ...@@ -277,68 +277,6 @@ class TransformFunctor {
Functor func_; Functor func_;
}; };
#define EIGEN_FUNCTOR(name, eigen_op) \
struct Eigen##name##Functor { \
template <typename DeviceContext, typename T> \
inline void Run(const framework::Tensor *x, const framework::Tensor *y, \
framework::Tensor *z, \
const framework::ExecutionContext &ctx) { \
auto x_e = framework::EigenVector<T>::Flatten(*x); \
auto y_e = framework::EigenVector<T>::Flatten(*y); \
auto z_e = framework::EigenVector<T>::Flatten(*z); \
z_e.device( \
*ctx.template device_context<DeviceContext>().eigen_device()) = \
eigen_op(x_e, y_e); \
} \
template <typename DeviceContext, typename T> \
inline void RunBroadCast(const framework::Tensor *x, \
const framework::Tensor *y, framework::Tensor *z, \
const framework::ExecutionContext &ctx, int pre, \
int n) { \
auto x_e = framework::EigenVector<T>::Flatten(*x); \
auto y_e = framework::EigenVector<T>::Flatten(*y); \
auto z_e = framework::EigenVector<T>::Flatten(*z); \
auto y_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n)) \
.broadcast(Eigen::DSizes<int, 2>(pre, 1)) \
.reshape(Eigen::DSizes<int, 1>(x_e.size())); \
z_e.device( \
*ctx.template device_context<DeviceContext>().eigen_device()) = \
eigen_op(x_e, y_bcast); \
} \
template <typename DeviceContext, typename T> \
inline void RunBroadCast2(const framework::Tensor *x, \
const framework::Tensor *y, \
framework::Tensor *z, \
const framework::ExecutionContext &ctx, int pre, \
int n, int post) { \
auto x_e = framework::EigenVector<T>::Flatten(*x); \
auto y_e = framework::EigenVector<T>::Flatten(*y); \
auto z_e = framework::EigenVector<T>::Flatten(*z); \
auto y_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1)) \
.broadcast(Eigen::DSizes<int, 3>(pre, 1, post)) \
.reshape(Eigen::DSizes<int, 1>(x_e.size())); \
z_e.device( \
*ctx.template device_context<DeviceContext>().eigen_device()) = \
eigen_op(x_e, y_bcast); \
} \
}
#define EIGEN_ADD(x, y) ((x) + (y))
EIGEN_FUNCTOR(Add, EIGEN_ADD);
#define EIGEN_SUB(x, y) ((x) - (y))
EIGEN_FUNCTOR(Sub, EIGEN_SUB);
#define EIGEN_MUL(x, y) ((x) * (y))
EIGEN_FUNCTOR(Mul, EIGEN_MUL);
#define EIGEN_DIV(x, y) ((x) / (y))
EIGEN_FUNCTOR(Div, EIGEN_DIV);
template <typename T, typename DX_OP, typename DY_OP> template <typename T, typename DX_OP, typename DY_OP>
struct ElemwiseGradNoBroadcast { struct ElemwiseGradNoBroadcast {
const T *x_; const T *x_;
......
...@@ -216,19 +216,18 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> { ...@@ -216,19 +216,18 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
out_datas.push_back( out_datas.push_back(
static_cast<void*>(output_data + (oc0 + oc1 + oc2) * h * w)); static_cast<void*>(output_data + (oc0 + oc1 + oc2) * h * w));
auto temp_allocation =
platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
workspace_size_in_bytes);
void* cudnn_workspace = temp_allocation->ptr();
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( auto func = [&](void* cudnn_workspace) {
handle, &alpha, in_desc[i], in_datas[i], filter_desc[i], CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
static_cast<const void*>(filters[i]->data<T>()), conv_desc[i], handle, &alpha, in_desc[i], in_datas[i], filter_desc[i],
algo[i], cudnn_workspace, workspace_size_in_bytes, &beta, out_desc[i], static_cast<const void*>(filters[i]->data<T>()), conv_desc[i],
out_datas[i], bias_desc[i], algo[i], cudnn_workspace, workspace_size_in_bytes, &beta,
static_cast<const void*>(bias[i]->data<T>()), cudnn_act_desc, out_desc[i], out_datas[i], bias_desc[i],
out_desc[i], out_datas[i])); static_cast<const void*>(bias[i]->data<T>()), cudnn_act_desc,
out_desc[i], out_datas[i]));
};
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
workspace_handle.RunFunc(func, workspace_size_in_bytes);
} }
cudnnTensorDescriptor_t x_desc; cudnnTensorDescriptor_t x_desc;
......
...@@ -43,12 +43,14 @@ class GridSampleOp : public framework::OperatorWithKernel { ...@@ -43,12 +43,14 @@ class GridSampleOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(grid_dims[3] == 2, "Input(Grid) dims[3] should be 2."); PADDLE_ENFORCE(grid_dims[3] == 2, "Input(Grid) dims[3] should be 2.");
PADDLE_ENFORCE_EQ(grid_dims[0], x_dims[0], PADDLE_ENFORCE_EQ(grid_dims[0], x_dims[0],
"Input(X) and Input(Grid) dims[0] should be equal."); "Input(X) and Input(Grid) dims[0] should be equal.");
PADDLE_ENFORCE_EQ( if (ctx->IsRuntime()) {
grid_dims[1], x_dims[2], PADDLE_ENFORCE_EQ(
"Input(X) dims[2] and Input(Grid) dims[1] should be equal."); grid_dims[1], x_dims[2],
PADDLE_ENFORCE_EQ( "Input(X) dims[2] and Input(Grid) dims[1] should be equal.");
grid_dims[2], x_dims[3], PADDLE_ENFORCE_EQ(
"Input(X) dims[3] and Input(Grid) dims[2] should be equal."); grid_dims[2], x_dims[3],
"Input(X) dims[3] and Input(Grid) dims[2] should be equal.");
}
ctx->SetOutputDim("Output", x_dims); ctx->SetOutputDim("Output", x_dims);
ctx->ShareLoD("X", "Output"); ctx->ShareLoD("X", "Output");
......
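Reviewer note: the grid_sample hunk wraps the dim-equality checks in ctx->IsRuntime(). At graph-build time some dims may still be unknown placeholders (e.g. -1), so equality checks are deferred until concrete shapes exist. A simplified stand-in for the pattern (the real checks use InferShapeContext and PADDLE_ENFORCE_EQ):

#include <cassert>
#include <cstdio>

void CheckGridDims(bool is_runtime, int grid_h, int x_h) {
  if (is_runtime) {
    // Concrete shapes: enforce Grid dims[1] == X dims[2].
    assert(grid_h == x_h && "Input(X) dims[2] and Input(Grid) dims[1] differ");
  }
  // At graph-build time, -1 placeholders would make this check spurious.
}

int main() {
  CheckGridDims(/*is_runtime=*/false, /*grid_h=*/-1, /*x_h=*/32);  // skipped
  CheckGridDims(/*is_runtime=*/true, /*grid_h=*/32, /*x_h=*/32);   // enforced
  std::puts("shape checks passed");
  return 0;
}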
...@@ -21,5 +21,5 @@ endif() ...@@ -21,5 +21,5 @@ endif()
cc_library(jit_kernel_helper SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS}) cc_library(jit_kernel_helper SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS})
cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel_helper) cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel_helper)
if(NOT WIN32) if(NOT WIN32)
cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS jit_kernel_helper device_tracer) cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS jit_kernel_helper device_tracer tensor)
endif() endif()
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include <vector> #include <vector>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/device_tracer.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
...@@ -155,14 +156,22 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { ...@@ -155,14 +156,22 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
LOG(INFO) << loginfos.str(); LOG(INFO) << loginfos.str();
} }
using Tensor = paddle::framework::Tensor;
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType> template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void BenchXYZNKernel() { void BenchXYZNKernel() {
for (int d : TestSizes()) { for (int d : TestSizes()) {
std::vector<T> x(d), y(d), z(d); Tensor x, y, z;
RandomVec<T>(d, x.data()); x.Resize({d});
RandomVec<T>(d, y.data()); y.Resize({d});
BenchAllImpls<KT, jit::XYZNTuples<T>, PlaceType>(d, x.data(), y.data(), z.Resize({d});
z.data(), d); T* x_data = x.mutable_data<T>(PlaceType());
T* y_data = y.mutable_data<T>(PlaceType());
T* z_data = z.mutable_data<T>(PlaceType());
RandomVec<T>(d, x_data);
RandomVec<T>(d, y_data);
BenchAllImpls<KT, jit::XYZNTuples<T>, PlaceType>(d, x.data<T>(),
y.data<T>(), z_data, d);
} }
} }
...@@ -170,9 +179,13 @@ template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType> ...@@ -170,9 +179,13 @@ template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void BenchAXYNKernel() { void BenchAXYNKernel() {
for (int d : TestSizes()) { for (int d : TestSizes()) {
const T a = static_cast<T>(3); const T a = static_cast<T>(3);
std::vector<T> x(d), y(d); Tensor x, y;
RandomVec<T>(d, x.data()); x.Resize({d});
BenchAllImpls<KT, jit::AXYNTuples<T>, PlaceType>(d, &a, x.data(), y.data(), y.Resize({d});
T* x_data = x.mutable_data<T>(PlaceType());
T* y_data = y.mutable_data<T>(PlaceType());
RandomVec<T>(d, x_data);
BenchAllImpls<KT, jit::AXYNTuples<T>, PlaceType>(d, &a, x.data<T>(), y_data,
d); d);
} }
} }
...@@ -180,9 +193,13 @@ void BenchAXYNKernel() { ...@@ -180,9 +193,13 @@ void BenchAXYNKernel() {
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType> template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void BenchXYNKernel() { void BenchXYNKernel() {
for (int d : TestSizes()) { for (int d : TestSizes()) {
std::vector<T> x(d), y(d); Tensor x, y;
RandomVec<T>(d, x.data()); x.Resize({d});
BenchAllImpls<KT, jit::XYNTuples<T>, PlaceType>(d, x.data(), y.data(), d); y.Resize({d});
T* x_data = x.mutable_data<T>(PlaceType());
T* y_data = y.mutable_data<T>(PlaceType());
RandomVec<T>(d, x_data);
BenchAllImpls<KT, jit::XYNTuples<T>, PlaceType>(d, x.data<T>(), y_data, d);
} }
} }
...@@ -192,16 +209,23 @@ void BenchLSTMKernel() { ...@@ -192,16 +209,23 @@ void BenchLSTMKernel() {
for (int d : TestSizes()) { for (int d : TestSizes()) {
const jit::lstm_attr_t attr(d, jit::kVSigmoid, jit::kVTanh, jit::kVTanh, const jit::lstm_attr_t attr(d, jit::kVSigmoid, jit::kVTanh, jit::kVTanh,
use_peephole); use_peephole);
std::vector<T> x(4 * d), ct_1(d), ct(d), ht(d), wp(3 * d), checked(2 * d); Tensor x, ct_1, ct, ht, wp, checked;
RandomVec<T>(4 * d, x.data(), -2.f, 2.f); x.Resize({4 * d});
RandomVec<T>(3 * d, wp.data(), -2.f, 2.f); ct_1.Resize({d});
RandomVec<T>(d, ct_1.data(), -2.f, 2.f); ct.Resize({d});
const T* ct_1_data = ct_1.data(); ht.Resize({d});
const T* wp_data = wp.data(); wp.Resize({3 * d});
T* x_data = x.data(); checked.Resize({2 * d});
T* checked_data = checked.data(); auto place = PlaceType();
T* ct_data = ct.data(); RandomVec<T>(x.numel(), x.mutable_data<T>(place), -2.f, 2.f);
T* ht_data = ht.data(); RandomVec<T>(wp.numel(), wp.mutable_data<T>(place), -2.f, 2.f);
RandomVec<T>(ct_1.numel(), ct_1.mutable_data<T>(place), -2.f, 2.f);
const T* ct_1_data = ct_1.data<T>();
const T* wp_data = wp.data<T>();
T* x_data = x.mutable_data<T>(place);
T* checked_data = checked.mutable_data<T>(place);
T* ct_data = ct.mutable_data<T>(place);
T* ht_data = ht.mutable_data<T>(place);
jit::lstm_t step; jit::lstm_t step;
step.gates = x_data; step.gates = x_data;
step.ct_1 = ct_1_data; step.ct_1 = ct_1_data;
...@@ -220,12 +244,16 @@ template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType> ...@@ -220,12 +244,16 @@ template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void BenchGRUKernel() { void BenchGRUKernel() {
for (int d : TestSizes()) { for (int d : TestSizes()) {
const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh); const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh);
std::vector<T> x(3 * d), ht_1(d), ht(d); auto place = PlaceType();
RandomVec<T>(3 * d, x.data(), -2.f, 2.f); Tensor x, ht_1, ht;
RandomVec<T>(d, ht_1.data(), -2.f, 2.f); x.Resize({3 * d});
const T* ht_1_data = ht_1.data(); ht_1.Resize({d});
T* x_data = x.data(); ht.Resize({d});
T* ht_data = ht.data(); RandomVec<T>(3 * d, x.mutable_data<T>(place), -2.f, 2.f);
RandomVec<T>(d, ht_1.mutable_data<T>(place), -2.f, 2.f);
const T* ht_1_data = ht_1.data<T>();
T* x_data = x.mutable_data<T>(place);
T* ht_data = ht.mutable_data<T>(place);
jit::gru_t step; jit::gru_t step;
step.gates = x_data; step.gates = x_data;
step.ht_1 = ht_1_data; step.ht_1 = ht_1_data;
...@@ -243,10 +271,12 @@ void BenchSeqPoolKernel() { ...@@ -243,10 +271,12 @@ void BenchSeqPoolKernel() {
jit::seq_pool_attr_t attr(w, type); jit::seq_pool_attr_t attr(w, type);
for (int h : TestSizes()) { for (int h : TestSizes()) {
attr.h = h; attr.h = h;
std::vector<T> x(h * w), y(w); Tensor x, y;
RandomVec<T>(h * w, x.data(), -2.f, 2.f); x.Resize({h * w});
const T* x_data = x.data(); y.Resize({w});
T* y_data = y.data(); RandomVec<T>(h * w, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
const T* x_data = x.data<T>();
T* y_data = y.mutable_data<T>(PlaceType());
BenchAllImpls<KT, jit::SeqPoolTuples<T>, PlaceType>(attr, x_data, BenchAllImpls<KT, jit::SeqPoolTuples<T>, PlaceType>(attr, x_data,
y_data, &attr); y_data, &attr);
} }
...@@ -259,12 +289,15 @@ void BenchMatMulKernel() { ...@@ -259,12 +289,15 @@ void BenchMatMulKernel() {
for (int m : {1, 2, 3, 4}) { for (int m : {1, 2, 3, 4}) {
for (int n : TestSizes()) { for (int n : TestSizes()) {
for (int k : TestSizes()) { for (int k : TestSizes()) {
std::vector<T> a(m * k), b(k * n), c(m * n); Tensor a, b, c;
RandomVec<T>(m * k, a.data(), -2.f, 2.f); a.Resize({m * k});
RandomVec<T>(k * n, b.data(), -2.f, 2.f); b.Resize({k * n});
const T* a_data = a.data(); c.Resize({m * n});
const T* b_data = b.data(); RandomVec<T>(m * k, a.mutable_data<T>(PlaceType()), -2.f, 2.f);
T* c_data = c.data(); RandomVec<T>(k * n, b.mutable_data<T>(PlaceType()), -2.f, 2.f);
const T* a_data = a.data<T>();
const T* b_data = b.data<T>();
T* c_data = c.mutable_data<T>(PlaceType());
BenchAllImpls<KT, jit::MatMulTuples<T>, PlaceType>(k, a_data, b_data, BenchAllImpls<KT, jit::MatMulTuples<T>, PlaceType>(k, a_data, b_data,
c_data, m, n, k); c_data, m, n, k);
} }
......
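Reviewer note: the benchmark refactor above swaps std::vector buffers for framework Tensors, so the timed path uses the same place-aware allocation as real kernels. A distilled version of the Resize + mutable_data idiom with a hypothetical ToyTensor in place of framework::Tensor (whose mutable_data also takes a Place):

#include <cstdint>
#include <cstdio>
#include <vector>

template <typename T>
struct ToyTensor {
  void Resize(int64_t n) { buf_.resize(n); }
  T* mutable_data() { return buf_.data(); }      // allocate + writable pointer
  const T* data() const { return buf_.data(); }  // read-only pointer
  std::vector<T> buf_;
};

int main() {
  const int d = 8;
  ToyTensor<float> x, y;
  x.Resize(d);                        // 1. declare the shape
  float* x_data = x.mutable_data();   // 2. allocate and write
  for (int i = 0; i < d; ++i) x_data[i] = 0.5f * i;  // RandomVec stand-in
  y.Resize(d);
  float* y_data = y.mutable_data();
  for (int i = 0; i < d; ++i) y_data[i] = x.data()[i] + 1.f;  // kernel stand-in
  std::printf("y[3] = %.1f\n", y_data[3]);
  return 0;
}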
...@@ -67,7 +67,13 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -67,7 +67,13 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
mid->mutable_data<T>(ctx.GetPlace()); mid->mutable_data<T>(ctx.GetPlace());
const int n = ctx.Attr<int>("n"); const int n = ctx.Attr<int>("n");
const float alpha = ctx.Attr<float>("alpha"); // MKL-DNN implements LRN in a caffe way:
// http://caffe.berkeleyvision.org/tutorial/layers/lrn.html
// Where sum of squares is divided by size of normalization window
// this is not the case for PaddlePaddle LRN.
// Hence we need to compensate for this diffrence by
// multipliing alpha by size of window(n)
const float alpha = ctx.Attr<float>("alpha") * static_cast<float>(n);
const float beta = ctx.Attr<float>("beta"); const float beta = ctx.Attr<float>("beta");
const float k = ctx.Attr<float>("k"); const float k = ctx.Attr<float>("k");
const bool is_test = ctx.Attr<bool>("is_test"); const bool is_test = ctx.Attr<bool>("is_test");
...@@ -78,10 +84,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -78,10 +84,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto dims = paddle::framework::vectorize2int(x->dims()); auto dims = paddle::framework::vectorize2int(x->dims());
auto src_md = paddle::platform::MKLDNNMemDesc( auto src_md = paddle::platform::MKLDNNMemDesc(
dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); dims, mkldnn::memory::data_type::f32, x->format());
auto dst_md = paddle::platform::MKLDNNMemDesc(
dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto forward_desc = mkldnn::lrn_forward::desc{mkldnn::prop_kind::forward, auto forward_desc = mkldnn::lrn_forward::desc{mkldnn::prop_kind::forward,
mkldnn::lrn_across_channels, mkldnn::lrn_across_channels,
...@@ -92,8 +95,6 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -92,8 +95,6 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
k}; k};
auto src_memory_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine}; auto src_memory_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine};
auto dst_memory = mkldnn::memory{{dst_md, mkldnn_engine},
static_cast<void*>(output_data)};
if (!is_test) { if (!is_test) {
const std::string key = ctx.op().Output("Out"); const std::string key = ctx.op().Output("Out");
...@@ -110,11 +111,16 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -110,11 +111,16 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
src_memory->set_data_handle( src_memory->set_data_handle(
static_cast<void*>(const_cast<T*>(input_data))); static_cast<void*>(const_cast<T*>(input_data)));
auto dst_memory = mkldnn::memory(forward_pd->dst_primitive_desc(),
static_cast<void*>(output_data));
auto workspace_memory = insert_to_context<mkldnn::memory>( auto workspace_memory = insert_to_context<mkldnn::memory>(
key_workspace_memory, dev_ctx, key_workspace_memory, dev_ctx,
forward_pd->workspace_primitive_desc()); forward_pd->workspace_primitive_desc());
run_primitive(*forward_pd, *src_memory, *workspace_memory, dst_memory); run_primitive(*forward_pd, *src_memory, *workspace_memory, dst_memory);
out->set_layout(framework::DataLayout::kMKLDNN);
out->set_format(platform::GetMKLDNNFormat(dst_memory));
} else { } else {
auto forward_pd = auto forward_pd =
mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine}; mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine};
...@@ -122,8 +128,13 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -122,8 +128,13 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
src_memory_pd, static_cast<void*>(const_cast<T*>(input_data))}; src_memory_pd, static_cast<void*>(const_cast<T*>(input_data))};
auto workspace_memory = auto workspace_memory =
mkldnn::memory{forward_pd.workspace_primitive_desc()}; mkldnn::memory{forward_pd.workspace_primitive_desc()};
auto dst_memory = mkldnn::memory(forward_pd.dst_primitive_desc(),
static_cast<void*>(output_data));
run_primitive(forward_pd, src_memory, workspace_memory, dst_memory); run_primitive(forward_pd, src_memory, workspace_memory, dst_memory);
out->set_layout(framework::DataLayout::kMKLDNN);
out->set_format(platform::GetMKLDNNFormat(dst_memory));
} }
} }
}; };
...@@ -151,7 +162,7 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -151,7 +162,7 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
const std::string key_workspace_memory = key + "@lrn_workspace_memory"; const std::string key_workspace_memory = key + "@lrn_workspace_memory";
const int n = ctx.Attr<int>("n"); const int n = ctx.Attr<int>("n");
const float alpha = ctx.Attr<float>("alpha"); const float alpha = ctx.Attr<float>("alpha") * static_cast<float>(n);
const float beta = ctx.Attr<float>("beta"); const float beta = ctx.Attr<float>("beta");
const float k = ctx.Attr<float>("k"); const float k = ctx.Attr<float>("k");
......
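Reviewer note: the compensation in the LRN hunk can be stated as a formula. Writing W(i) for the normalization window of size n around channel i, the two conventions are (a sketch following the caffe tutorial linked in the comment):

\text{Paddle:} \quad b_i = a_i \Big(k + \alpha \sum_{j \in W(i)} a_j^2\Big)^{-\beta}

\text{caffe/MKL-DNN:} \quad b_i = a_i \Big(k + \tfrac{\alpha'}{n} \sum_{j \in W(i)} a_j^2\Big)^{-\beta}

Passing \alpha' = \alpha \cdot n to MKL-DNN therefore makes the two denominators identical, which is exactly what the forward and grad kernels now do.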
...@@ -54,6 +54,7 @@ math_library(sequence_padding) ...@@ -54,6 +54,7 @@ math_library(sequence_padding)
math_library(sequence_pooling DEPS math_function jit_kernel_helper) math_library(sequence_pooling DEPS math_function jit_kernel_helper)
math_library(sequence_scale) math_library(sequence_scale)
math_library(softmax DEPS math_function) math_library(softmax DEPS math_function)
math_library(beam_search DEPS math_function)
math_library(matrix_bit_code) math_library(matrix_bit_code)
...@@ -68,6 +69,7 @@ cc_test(im2col_test SRCS im2col_test.cc DEPS im2col) ...@@ -68,6 +69,7 @@ cc_test(im2col_test SRCS im2col_test.cc DEPS im2col)
cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col) cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col)
cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding) cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding)
cc_test(sequence_pooling_test SRCS sequence_pooling_test.cc DEPS sequence_pooling) cc_test(sequence_pooling_test SRCS sequence_pooling_test.cc DEPS sequence_pooling)
cc_test(beam_search_test SRCS beam_search_test.cc DEPS beam_search)
if(WITH_GPU) if(WITH_GPU)
nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function) nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function)
nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu.cc DEPS selected_rows_functor math_function) nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu.cc DEPS selected_rows_functor math_function)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/beam_search.h"
#include <algorithm>
#include <map>
namespace paddle {
namespace operators {
namespace math {
template <typename T>
class BeamSearchFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext &context,
const framework::LoDTensor *pre_ids,
const framework::LoDTensor *pre_scores,
const framework::LoDTensor *ids,
const framework::LoDTensor *scores,
framework::LoDTensor *selected_ids,
framework::LoDTensor *selected_scores, size_t level,
size_t beam_size, int end_id, bool is_accumulated) {
auto abs_lod = framework::ToAbsOffset(scores->lod());
auto &high_level = abs_lod[level];
auto items = SelectTopBeamSizeItems(pre_ids, pre_scores, ids, scores, level,
beam_size, end_id, is_accumulated);
auto selected_items = ToMap(items, high_level.back());
if (FLAGS_v == 3) {
VLOG(3) << "selected_items:";
for (size_t i = 0; i < selected_items.size(); ++i) {
VLOG(3) << "offset: " << i;
for (auto &item : selected_items[i]) {
VLOG(3) << item.ToString();
}
}
}
PruneEndBeams(pre_ids, abs_lod, &selected_items, level, end_id);
// calculate the output tensor's height
size_t num_instances = std::accumulate(
std::begin(selected_items), std::end(selected_items), 0,
[](size_t a, std::vector<Item> &b) { return a + b.size(); });
// the output tensor shape should be [num_instances, 1]
auto dims = framework::make_ddim(
std::vector<int64_t>({static_cast<int>(num_instances), 1}));
selected_ids->Resize(dims);
selected_scores->Resize(dims);
auto *selected_ids_data =
selected_ids->mutable_data<int64_t>(platform::CPUPlace());
auto *selected_scores_data =
selected_scores->mutable_data<float>(platform::CPUPlace());
// fill in data
std::vector<size_t> low_level;
size_t low_offset = 0;
for (auto &items : selected_items) {
low_level.push_back(low_offset);
for (auto &item : items) {
selected_ids_data[low_offset] = item.id;
selected_scores_data[low_offset] = item.score;
low_offset++;
}
}
low_level.push_back(low_offset);
// fill lod
framework::LoD lod(2);
lod[0].assign(high_level.begin(), high_level.end());
lod[1].assign(low_level.begin(), low_level.end());
if (!framework::CheckLoD(lod)) {
PADDLE_THROW("lod %s is not right", framework::LoDToString(lod));
}
selected_ids->set_lod(lod);
selected_scores->set_lod(lod);
}
/*
* The basic item used for sorting.
*/
struct Item {
Item() {}
Item(size_t offset, size_t id, float score)
: offset(offset), id(id), score(score) {}
// offset in the higher lod level.
size_t offset;
// prefix id in the lower lod level.
// size_t prefix;
// the candidate id
size_t id;
// the corresponding score
float score;
inline bool operator<(const Item &in) const {
return (score < in.score) ||
((score == in.score) && (offset < in.offset));
}
inline void operator=(const Item &in) {
offset = in.offset;
id = in.id;
score = in.score;
}
std::string ToString() {
std::ostringstream os;
os << "{";
os << "offset: " << offset << ", ";
os << "id: " << id << ", ";
os << "score: " << score << "";
os << "}";
return os.str();
}
};
protected:
/*
* Prune the source sentences whose branches are all finished; this step is optional.
* Pruning must happen one step later than finishing (thus pre_ids is needed here),
* since the end tokens must be written out.
*/
void PruneEndBeams(const framework::LoDTensor *pre_ids,
const framework::LoD &abs_lod,
std::vector<std::vector<Item>> *items, size_t lod_level,
int end_id) {
auto *pre_ids_data = pre_ids->data<int64_t>();
auto &high_level = abs_lod[lod_level];
for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) {
size_t src_prefix_start = high_level[src_idx];
size_t src_prefix_end = high_level[src_idx + 1];
bool finish_flag = true;
for (size_t offset = src_prefix_start; offset < src_prefix_end;
offset++) {
for (auto &item : items->at(offset)) {
if (item.id != static_cast<size_t>(end_id) ||
pre_ids_data[offset] != end_id) {
finish_flag = false;
break;
}
}
if (!finish_flag) break;
}
if (finish_flag) { // all branches of this beam (source sentence) have
// ended, so prune the beam
for (size_t offset = src_prefix_start; offset < src_prefix_end;
offset++)
items->at(offset).clear();
}
}
}
/*
* Transform the items into a map-like structure: the index is the offset
* and the value is the list of items at that offset. NOTE: low performance.
*/
std::vector<std::vector<Item>> ToMap(
const std::vector<std::vector<Item>> &items, size_t element_num) {
std::vector<std::vector<Item>> result;
result.resize(element_num);
for (auto &entries : items) {
for (const auto &item : entries) {
result[item.offset].push_back(item);
}
}
return result;
}
void Insert(std::vector<Item> *top_beam_ptr, const Item &item,
size_t beam_size) {
std::vector<Item> &top_beam = *top_beam_ptr;
size_t num_beams = top_beam.size();
if (num_beams < beam_size) {
top_beam.resize(num_beams + 1);
num_beams++;
} else {
if (item < top_beam[beam_size - 1]) {
return;
}
}
for (int k = static_cast<int>(num_beams) - 2; k >= 0; --k) {
if (top_beam[k] < item) {
top_beam[k + 1] = top_beam[k];
} else {
top_beam[k + 1] = item;
return;
}
}
top_beam[0] = item;
}
/*
* For each source, select top beam_size records.
*/
std::vector<std::vector<Item>> SelectTopBeamSizeItems(
const framework::LoDTensor *pre_ids,
const framework::LoDTensor *pre_scores, const framework::LoDTensor *ids,
const framework::LoDTensor *scores, size_t lod_level, size_t beam_size,
int end_id, bool is_accumulated) {
std::vector<std::vector<Item>> result;
// find the current candidates
auto abs_lod = framework::ToAbsOffset(scores->lod());
auto *pre_ids_data = pre_ids->data<int64_t>();
auto *pre_scores_data = pre_scores->data<float>();
auto *ids_data = ids ? ids->data<int64_t>() : nullptr;
auto *scores_data = scores->data<float>();
size_t num_seqs = scores->NumElements(lod_level);
size_t seq_width = 1;
for (int i = 1; i < scores->dims().size(); i++) {
seq_width *= scores->dims()[i];
}
for (size_t seq_id = 0; seq_id < num_seqs; ++seq_id) {
size_t seq_offset_start = abs_lod[lod_level][seq_id];
size_t seq_offset_end = abs_lod[lod_level][seq_id + 1];
std::vector<Item> top_beam;
top_beam.reserve(beam_size);
for (size_t offset = seq_offset_start; offset < seq_offset_end;
++offset) {
auto pre_id = pre_ids_data[offset];
auto pre_score = pre_scores_data[offset];
if (pre_id == end_id) {
// Allocate all probability mass to end_id for finished branches;
// the other candidate ids can be ignored.
Item item(offset, end_id, pre_score);
Insert(&top_beam, item, beam_size);
} else {
size_t index = offset * seq_width;
for (size_t d = 0; d < seq_width; d++, index++) {
int64_t id = ids_data ? ids_data[index] : static_cast<int64_t>(d);
float score = is_accumulated
? scores_data[index]
: pre_score + std::log(scores_data[index]);
Item item(offset, id, score);
Insert(&top_beam, item, beam_size);
}
}
}
result.emplace_back(top_beam);
}
if (FLAGS_v == 3) {
VLOG(3) << "SelectTopBeamSizeItems result size " << result.size();
for (auto &items : result) {
VLOG(3) << "item set:";
for (auto &item : items) {
VLOG(3) << item.ToString();
}
}
}
return result;
}
};
template class BeamSearchFunctor<platform::CPUDeviceContext, int>;
template class BeamSearchFunctor<platform::CPUDeviceContext, int64_t>;
template class BeamSearchFunctor<platform::CPUDeviceContext, float>;
template class BeamSearchFunctor<platform::CPUDeviceContext, double>;
} // namespace math
} // namespace operators
} // namespace paddle
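// Editor's note: an illustrative, self-contained sketch of the insertion
// used by BeamSearchFunctor::Insert above. It keeps a beam sorted in
// descending score order by shifting smaller items one slot to the right.
// The names MiniItem and InsertSorted are hypothetical and not part of the
// Paddle sources; ties are handled more simply than in the real code.
#include <vector>
struct MiniItem {
int id;
float score;
};
// Keep `beam` sorted by descending score, capped at beam_size entries.
void InsertSorted(std::vector<MiniItem> *beam, const MiniItem &item,
size_t beam_size) {
if (beam->size() < beam_size) {
beam->push_back(item);  // grow the beam until it is full
} else if (item.score <= beam->back().score) {
return;  // not better than the current worst candidate
}
int k = static_cast<int>(beam->size()) - 2;
for (; k >= 0 && (*beam)[k].score < item.score; --k) {
(*beam)[k + 1] = (*beam)[k];  // shift the smaller item right
}
(*beam)[k + 1] = item;
}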
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/beam_search.h"
#include "paddle/fluid/platform/cuda_device_function.h"
namespace paddle {
namespace operators {
namespace math {
struct Triple {
__device__ __forceinline__ Triple() {}
__device__ __forceinline__ Triple(int o, int i, float s)
: offset(o), id(i), score(s) {}
__device__ __forceinline__ void set(int o, int i, float s) {
offset = o;
id = i;
score = s;
}
__device__ __forceinline__ void operator=(const Triple& in) {
offset = in.offset;
id = in.id;
score = in.score;
}
__device__ __forceinline__ bool operator<(const float s) const {
return score < s;
}
__device__ __forceinline__ bool operator<(const Triple& in) const {
return (score < in.score) || ((score == in.score) && (offset < in.offset));
}
int offset;
int id;
float score;
};
__device__ __forceinline__ void Insert(Triple* top_beam, const Triple& p,
int beam_size) {
if (p < top_beam[beam_size - 1]) {
return;
}
for (int k = beam_size - 2; k >= 0; --k) {
if (top_beam[k] < p) {
top_beam[k + 1] = top_beam[k];
} else {
top_beam[k + 1] = p;
return;
}
}
top_beam[0] = p;
}
template <int MaxThreadsPerSeq, bool IsAccumulated = true>
__device__ __forceinline__ int SelectTopBeam(
Triple* top_beam, const int64_t* pre_ids, const float* pre_scores,
const int64_t* ids, const float* scores, const int seq_offset_start,
const int seq_offset_end, const int seq_width, int beam_size, int end_id,
int used_threads) {
// top_beam is shared memory
const int tid = threadIdx.x;
const int tid_of_seq = threadIdx.x % MaxThreadsPerSeq;
int num_used_threads = used_threads;
Triple* top_beam_local = top_beam + tid * beam_size;
if (tid_of_seq < num_used_threads) {
for (int i = 0; i < beam_size; ++i) {
top_beam_local[i].set(-1, -1, -INFINITY);
}
for (int offset = seq_offset_start; offset < seq_offset_end; ++offset) {
int pre_id = static_cast<int>(pre_ids[offset]);
if (pre_id == end_id) {
if (tid_of_seq == 0) {
Triple tmp(offset, end_id, pre_scores[offset]);
Insert(top_beam_local, tmp, beam_size);
}
} else {
int index = offset * seq_width + tid_of_seq;
if (!IsAccumulated) {
float pre_score = pre_scores[offset];
for (int i = tid_of_seq; i < seq_width; i += num_used_threads) {
float score = pre_score + __logf(scores[index]);
int id = ids ? static_cast<int>(ids[index]) : i;
Triple tmp(offset, id, score);
Insert(top_beam_local, tmp, beam_size);
index += num_used_threads;
}
} else {
for (int i = tid_of_seq; i < seq_width; i += num_used_threads) {
int id = ids ? static_cast<int>(ids[index]) : i;
float score = scores[index];
Triple tmp(offset, id, score);
Insert(top_beam_local, tmp, beam_size);
index += num_used_threads;
}
}
}
}
}
while (num_used_threads > 1) {
if (num_used_threads > 16) {
__syncthreads();
}
num_used_threads = num_used_threads >> 1;
if (tid_of_seq < num_used_threads) {
int index_in_sh = (num_used_threads + tid) * beam_size;
for (int i = 0; i < beam_size; i++) {
Insert(top_beam_local, top_beam[index_in_sh], beam_size);
index_in_sh++;
}
}
}
if (tid_of_seq == 0) {
int num_items = 0;
for (int i = 0; i < beam_size; ++i) {
num_items =
(top_beam_local[i].score > -INFINITY) ? num_items + 1 : num_items;
}
return num_items;
}
return 0;
}
__device__ __forceinline__ bool PruneEndBeams(Triple* top_beam_local,
const int64_t* pre_ids,
const int end_id, int num_items) {
bool finish_flag = true;
for (int i = 0; i < num_items; ++i) {
int offset = top_beam_local[i].offset;
if (top_beam_local[i].id != end_id ||
static_cast<int>(pre_ids[offset]) != end_id) {
finish_flag = false;
break;
}
}
return finish_flag;
}
__device__ __forceinline__ void WriteBack(
int64_t* selected_ids, float* selected_scores, size_t* selected_offsets,
Triple* top_beam_local, const int seq_offset_start,
const int seq_offset_end, const int selected_seq_start,
const int selected_seq_length) {
// Only one thread per sequence reaches here (see BeamSearchDetails).
int global_index = selected_seq_start;
for (int global_offset = seq_offset_start; global_offset < seq_offset_end;
++global_offset) {
for (int local_index = 0; local_index < selected_seq_length;
++local_index) {
if (top_beam_local[local_index].offset == global_offset) {
selected_ids[global_index] =
static_cast<int64_t>(top_beam_local[local_index].id);
selected_scores[global_index] = top_beam_local[local_index].score;
global_index++;
}
}
selected_offsets[global_offset + 1] = static_cast<size_t>(global_index);
}
}
template <int MaxLength, int MaxThreadsPerSeq, int MaxSeqs>
__device__ void BeamSearchDetails(
int64_t* selected_ids, float* selected_scores, size_t* selected_offsets,
const int64_t* pre_ids, const float* pre_scores, const int64_t* ids,
const float* scores, const int seq_offset_start, const int seq_offset_end,
const int seq_width, int beam_size, int end_id, bool is_accumulated,
int num_used_threads) {
__shared__ Triple top_beam[MaxLength];
int num_items = 0;
if (is_accumulated) {
num_items = SelectTopBeam<MaxThreadsPerSeq, true>(
top_beam, pre_ids, pre_scores, ids, scores, seq_offset_start,
seq_offset_end, seq_width, beam_size, end_id, num_used_threads);
} else {
num_items = SelectTopBeam<MaxThreadsPerSeq, false>(
top_beam, pre_ids, pre_scores, ids, scores, seq_offset_start,
seq_offset_end, seq_width, beam_size, end_id, num_used_threads);
}
const int tid = threadIdx.x; // use 1 thread only for each sequence
const int tid_of_seq = tid % MaxThreadsPerSeq;
if (tid_of_seq == 0) {
// Use 1 thread for each sequence.
Triple* top_beam_local = top_beam + tid * beam_size;
bool finish_flag =
PruneEndBeams(top_beam_local, pre_ids, end_id, num_items);
int selected_seq_start = 0;
int selected_seq_length = finish_flag ? 0 : num_items;
if (MaxSeqs > 1) {
const int seq_id = (MaxSeqs > 1) ? tid / MaxThreadsPerSeq : tid;
__shared__ int shared_mem[MaxSeqs];
// shared_mem[0 .. MaxSeqs - 1] holds the selected length of each sequence
shared_mem[seq_id] = selected_seq_length;
__syncthreads();
for (int s = 0; s < seq_id; ++s) {
selected_seq_start += shared_mem[s];
}
if (seq_id == 0) {
selected_offsets[0] = 0;
}
} else {
selected_offsets[0] = 0;
}
WriteBack(selected_ids, selected_scores, selected_offsets, top_beam_local,
seq_offset_start, seq_offset_end, selected_seq_start,
selected_seq_length);
}
}
template <int MaxLength, int MaxThreadsPerSeq, int MaxSeqs>
__global__ void BeamSearchKernel(int64_t* selected_ids, float* selected_scores,
size_t* selected_offsets,
const int64_t* pre_ids,
const float* pre_scores, const int64_t* ids,
const float* scores, const size_t* seq_offsets,
const int num_seqs, const int seq_width,
int beam_size, int end_id, bool is_accumulated,
int num_used_threads) {
const int tid = threadIdx.x;
const int seq_id = (MaxSeqs > 1) ? tid / MaxThreadsPerSeq : tid;
int seq_offset_start = static_cast<int>(seq_offsets[seq_id]);
int seq_offset_end = static_cast<int>(seq_offsets[seq_id + 1]);
BeamSearchDetails<MaxLength, MaxThreadsPerSeq, MaxSeqs>(
selected_ids, selected_scores, selected_offsets, pre_ids, pre_scores, ids,
scores, seq_offset_start, seq_offset_end, seq_width, beam_size, end_id,
is_accumulated, num_used_threads);
}
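// Editorial note: one block handles up to MaxSeqs sequences. Threads are
// partitioned into groups of MaxThreadsPerSeq, and seq_id = tid /
// MaxThreadsPerSeq selects which sequence a thread works on (see the kernel
// above); within a group, tid_of_seq = tid % MaxThreadsPerSeq.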
template <int MaxLength, int MaxThreadsPerSeq>
__global__ void BeamSearchKernelSingle(
int64_t* selected_ids, float* selected_scores, size_t* selected_offsets,
const int64_t* pre_ids, const float* pre_scores, const int64_t* ids,
const float* scores, const int seq_length, const int seq_width,
int beam_size, int end_id, bool is_accumulated, int num_used_threads) {
const int seq_offset_start = 0;
const int seq_offset_end = seq_length;
BeamSearchDetails<MaxLength, MaxThreadsPerSeq, 1>(
selected_ids, selected_scores, selected_offsets, pre_ids, pre_scores, ids,
scores, seq_offset_start, seq_offset_end, seq_width, beam_size, end_id,
is_accumulated, num_used_threads);
}
static inline int GetNumUsedThreads(const int max_threads_per_seq,
const int seq_width, int beam_size) {
int num_used_threads = (seq_width + beam_size - 1) / beam_size;
num_used_threads = max_threads_per_seq < num_used_threads
? max_threads_per_seq
: num_used_threads;
num_used_threads =
num_used_threads > 32
? (num_used_threads >> 5) << 5
: (num_used_threads > 16
? 32
: (num_used_threads > 8
? 16
: (num_used_threads > 4
? 8
: (num_used_threads > 2 ? 4
: num_used_threads))));
return num_used_threads;
}
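// Worked example (illustrative): with seq_width = 100 and beam_size = 4,
// ceil(100 / 4) = 25 candidate threads are requested; 25 lies in (16, 32],
// so it is rounded up to 32. With seq_width = 1000 and beam_size = 4, the
// request is 250, which is first capped by max_threads_per_seq and, if
// still above 32, rounded down to a multiple of 32 (250 -> 224).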
template <typename T>
class BeamSearchFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
const framework::LoDTensor* pre_ids,
const framework::LoDTensor* pre_scores,
const framework::LoDTensor* ids,
const framework::LoDTensor* scores,
framework::LoDTensor* selected_ids,
framework::LoDTensor* selected_scores, size_t level,
size_t beam_size, int end_id, bool is_accumulated) {
auto abs_lod = framework::ToAbsOffset(scores->lod());
const int64_t* pre_ids_data = pre_ids->data<int64_t>();
const float* pre_scores_data = pre_scores->data<float>();
const int64_t* ids_data = ids ? ids->data<int64_t>() : nullptr;
const float* scores_data = scores->data<float>();
const size_t num_seqs = abs_lod[level].size() - 1;
size_t seq_width = 1;
for (int i = 1; i < scores->dims().size(); i++) {
seq_width *= scores->dims()[i];
}
// Reserve a big enough memory.
auto selected_dims =
framework::make_ddim({static_cast<int64_t>(num_seqs * beam_size), 1});
int64_t* selected_ids_data =
selected_ids->mutable_data<int64_t>(selected_dims, context.GetPlace());
float* selected_scores_data =
selected_scores->mutable_data<float>(selected_dims, context.GetPlace());
framework::LoD selected_lod(2);
selected_lod[0].assign(abs_lod[level].begin(), abs_lod[level].end());
selected_lod[1].resize(scores->dims()[0] + 1);
size_t* selected_offsets =
selected_lod[1].CUDAMutableData(context.GetPlace());
if (num_seqs == 1) {
const int seq_length = static_cast<int>(abs_lod[level][1]);
const int kMaxThreadsPerSeq = 1024;
int num_used_threads =
GetNumUsedThreads(kMaxThreadsPerSeq, static_cast<int>(seq_width),
static_cast<int>(beam_size));
switch (platform::RoundToPowerOfTwo(beam_size * seq_width)) {
CUDA_LAUNCH_KERNEL_HELPER(
BeamSearchKernelSingle<kPowerOfTwoDim, kMaxThreadsPerSeq><<<
1, kMaxThreadsPerSeq, 0, context.stream()>>>(
selected_ids_data, selected_scores_data, selected_offsets,
pre_ids_data, pre_scores_data, ids_data, scores_data,
seq_length, static_cast<int>(seq_width),
static_cast<int>(beam_size), static_cast<int>(end_id),
is_accumulated, num_used_threads));
}
} else if (num_seqs <= 4) {
const size_t* seq_offsets = abs_lod[level].CUDAData(context.GetPlace());
// Use only 1 block
const int kMaxThreadsPerSeq = 32;
const int kMaxSeqs = 4;
int num_used_threads =
GetNumUsedThreads(kMaxThreadsPerSeq, static_cast<int>(seq_width),
static_cast<int>(beam_size));
switch (platform::RoundToPowerOfTwo(beam_size * num_seqs * 32)) {
CUDA_LAUNCH_KERNEL_HELPER(
BeamSearchKernel<kPowerOfTwoDim, kMaxThreadsPerSeq, kMaxSeqs><<<
1, num_seqs * kMaxThreadsPerSeq, 0, context.stream()>>>(
selected_ids_data, selected_scores_data, selected_offsets,
pre_ids_data, pre_scores_data, ids_data, scores_data,
seq_offsets, static_cast<int>(num_seqs),
static_cast<int>(seq_width), static_cast<int>(beam_size),
end_id, is_accumulated, num_used_threads));
}
} else {
LOG(FATAL) << "Not implemented.";
}
context.Wait();
if (!framework::CheckLoD(selected_lod)) {
PADDLE_THROW("lod %s is not right", framework::LoDToString(selected_lod));
}
selected_ids->set_lod(selected_lod);
selected_scores->set_lod(selected_lod);
if (selected_lod[1].back() < num_seqs * beam_size) {
auto final_selected_dims = framework::make_ddim(
{static_cast<int64_t>(selected_lod[1].back()), 1});
selected_ids->Resize(final_selected_dims);
selected_scores->Resize(final_selected_dims);
}
}
};
template class BeamSearchFunctor<platform::CUDADeviceContext, int>;
template class BeamSearchFunctor<platform::CUDADeviceContext, int64_t>;
template class BeamSearchFunctor<platform::CUDADeviceContext, float>;
template class BeamSearchFunctor<platform::CUDADeviceContext, double>;
} // namespace math
} // namespace operators
} // namespace paddle
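// Editor's note: the while (num_used_threads > 1) loop in SelectTopBeam is
// a tree reduction. In each round the upper half of the active threads has
// its per-thread beam merged into the lower half, until thread 0 of the
// sequence holds the top beam_size items overall. An illustrative trace
// with 8 active threads, each holding a sorted local beam:
//   round 1: thread t (t < 4) merges beam[t + 4] into beam[t]
//   round 2: thread t (t < 2) merges beam[t + 2] into beam[t]
//   round 3: thread 0 merges beam[1] into beam[0]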
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
namespace math {
/*
* This is an implementation of beam search.
*
* To explain the details, let's take the machine translation task as an
* example. In this task one source sentence is translated into multiple
* target sentences, so at any point a source sentence owns several
* translation prefixes (target sentences that have not yet ended), and at
* each time step every prefix proposes some candidates. Given the candidate
* ids and their corresponding scores (probabilities), this functor sorts the
* candidates, selects the top beam_size of them for each source sentence,
* and stores the selected candidates' ids and scores into LoDTensors.
*
* A detailed example:
*
* Input
*
* ids:
* - LoD (should have 2 levels)
* - first level: [0, 1, 4]
* - second level: [0, 1, 2, 3, 4]
* - tensor's data:
* [[4, 2, 5]
* [2, 1, 3]
* [3, 5, 2]
* [8, 2, 1]]
*
* scores:
* - LoD same as `ids`
* - tensor's data
* [[0.5, 0.3, 0.2]
* [0.6, 0.3, 0.1]
* [0.9, 0.5, 0.1]
* [0.7, 0.5, 0.1]]
*
* The inputs mean that there are 2 source sentences to translate; the first
* source has 1 prefix and the second source has 3 prefixes.
*
* Let's assume the beam size is 2; then the beam search's output would be
* - LoD
* - first level: [0, 1, 2]
* - second level: [0, 2, 4]
* - id tensor's data
* [[4,
* 1,
* 3,
* 8]]
* - score tensor's data
* [[0.5,
* 0.3,
* 0.9,
* 0.7]]
*
* TODO: all the prune operations should live inside the beam search, so it
* would be better to split the beam search algorithm into a sequence of
* smaller operators, with the prune operators inserted into that sequence.
*/
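/*
* Scoring note (editorial): when is_accumulated is false, the incoming
* scores are step-wise probabilities, so a candidate's running score is
* updated in log space:
*
*   score(prefix + id) = score(prefix) + log(P(id | prefix))
*
* When is_accumulated is true, the scores are already accumulated and are
* used as-is. Finished prefixes (pre_id == end_id) keep their previous
* score and only propose end_id again.
*/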
template <typename DeviceContext, typename T>
class BeamSearchFunctor {
public:
/*
* The main function of beam search.
*
* @selected_ids: a [None, 1]-shaped tensor with LoD.
* In a machine translation model, it might hold the candidate term id sets,
* each set stored as a variable-length sequence.
* The format might be described with a two-level LoD
* - [[0 1],
* [0 1 2]]
* - [[]
* [0 1]]
* the first level of LoD tells that there are two source sentences. The
* second level describes the details of the candidate id set's offsets in
* the source sentences.
*
* @selected_scores: a LoD tensor with the same shape and LoD with
* selected_ids.
* It stores the corresponding scores of candidate ids in selected_ids.
*
* If all the input tensors are empty (in the machine translation task this
* means no candidate is provided), the decoding task stops running.
*/
void operator()(const DeviceContext& context,
const framework::LoDTensor* pre_ids,
const framework::LoDTensor* pre_scores,
const framework::LoDTensor* ids,
const framework::LoDTensor* scores,
framework::LoDTensor* selected_ids,
framework::LoDTensor* selected_scores, size_t level,
size_t beam_size, int end_id, bool is_accumulated);
};
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/beam_search.h"
#include <gtest/gtest.h>
#include <vector>
void PrepareCPUTensors(paddle::framework::LoDTensor* ids,
paddle::framework::LoDTensor* scores,
paddle::framework::LoDTensor* pre_ids,
paddle::framework::LoDTensor* pre_scores) {
// lod
paddle::framework::LoD lod;
std::vector<size_t> level0({0, 2, 4});
std::vector<size_t> level1({0, 1, 2, 3, 4});
lod.push_back(level0);
lod.push_back(level1);
ids->set_lod(lod);
scores->set_lod(lod);
auto dims = paddle::framework::make_ddim({4, 3});
ids->Resize(dims);
scores->Resize(dims);
paddle::platform::CPUPlace place;
auto* ids_data = ids->mutable_data<int64_t>(place);
auto* scores_data = scores->mutable_data<float>(place);
std::vector<int64_t> ids_vec_data({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1});
std::vector<float> scores_vec_data(
{0.6f, 0.3f, 0.5f, 0.2f, 0.3f, 0.1f, 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f});
CHECK_EQ(static_cast<size_t>(ids->numel()), ids_vec_data.size());
CHECK_EQ(static_cast<size_t>(ids->numel()), scores_vec_data.size());
for (int64_t i = 0; i < ids->numel(); i++) {
ids_data[i] = ids_vec_data[i];
scores_data[i] = scores_vec_data[i];
}
// pre_ids
pre_ids->Resize(paddle::framework::make_ddim({4, 1}));
for (int i = 0; i < 4; i++) {
pre_ids->mutable_data<int64_t>(place)[i] = i + 1;
}
// pre_scores
pre_scores->Resize(paddle::framework::make_ddim({4, 1}));
for (int i = 0; i < 4; i++) {
pre_scores->mutable_data<float>(place)[i] = 0.1f * (i + 1);
}
}
template <typename DeviceContext, typename Place>
void TestBeamSearch() {
paddle::framework::LoDTensor ids;
paddle::framework::LoDTensor scores;
paddle::framework::LoDTensor pre_ids;
paddle::framework::LoDTensor pre_scores;
auto* place = new Place();
DeviceContext* context = new DeviceContext(*place);
if (paddle::platform::is_cpu_place(*place)) {
PrepareCPUTensors(&ids, &scores, &pre_ids, &pre_scores);
} else {
paddle::framework::LoDTensor cpu_ids;
paddle::framework::LoDTensor cpu_scores;
paddle::framework::LoDTensor cpu_pre_ids;
paddle::framework::LoDTensor cpu_pre_scores;
PrepareCPUTensors(&cpu_ids, &cpu_scores, &cpu_pre_ids, &cpu_pre_scores);
TensorCopySync(cpu_ids, *place, &ids);
TensorCopySync(cpu_scores, *place, &scores);
TensorCopySync(cpu_pre_ids, *place, &pre_ids);
TensorCopySync(cpu_pre_scores, *place, &pre_scores);
ids.set_lod(cpu_ids.lod());
scores.set_lod(cpu_scores.lod());
pre_ids.set_lod(cpu_pre_ids.lod());
pre_scores.set_lod(cpu_pre_scores.lod());
}
paddle::framework::LoDTensor selected_ids;
paddle::framework::LoDTensor selected_scores;
size_t level = 0;
size_t beam_size = 2;
int end_id = 0;
paddle::operators::math::BeamSearchFunctor<DeviceContext, float> beamsearch;
beamsearch(*context, &pre_ids, &pre_scores, &ids, &scores, &selected_ids,
&selected_scores, level, beam_size, end_id, true);
ASSERT_EQ(selected_ids.lod(), selected_scores.lod());
paddle::framework::LoDTensor cpu_selected_ids;
paddle::framework::LoDTensor cpu_selected_scores;
if (paddle::platform::is_cpu_place(*place)) {
cpu_selected_ids = selected_ids;
cpu_selected_scores = selected_scores;
} else {
TensorCopySync(selected_ids, paddle::platform::CPUPlace(),
&cpu_selected_ids);
TensorCopySync(selected_scores, paddle::platform::CPUPlace(),
&cpu_selected_scores);
cpu_selected_ids.set_lod(selected_ids.lod());
cpu_selected_scores.set_lod(selected_scores.lod());
}
std::vector<int64_t> expected_ids({4, 5, 3, 8});
std::vector<float> expected_scores({0.6f, 0.5f, 0.9f, 0.7f});
for (int i = 0; i < 4; i++) {
ASSERT_EQ(expected_ids[i], cpu_selected_ids.data<int64_t>()[i]);
ASSERT_EQ(expected_scores[i], cpu_selected_scores.data<float>()[i]);
}
delete place;
delete context;
}
TEST(BeamSearch, CPU) {
TestBeamSearch<paddle::platform::CPUDeviceContext,
paddle::platform::CPUPlace>();
}
#ifdef PADDLE_WITH_CUDA
TEST(BeamSearch, GPU) {
TestBeamSearch<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>();
}
#endif
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/math/sampler.h" #include "paddle/fluid/operators/math/sampler.h"
#include <glog/logging.h>
#include <iostream> #include <iostream>
#include <queue> #include <queue>
#include <utility> #include <utility>
...@@ -77,7 +78,14 @@ int64_t CustomSampler::Sample() const { ...@@ -77,7 +78,14 @@ int64_t CustomSampler::Sample() const {
auto index = (*int_dist_)(*random_engine_); auto index = (*int_dist_)(*random_engine_);
auto p = (*real_dist_)(*random_engine_); auto p = (*real_dist_)(*random_engine_);
if (p > alias_probs_[index]) { if (p > alias_probs_[index]) {
return alias_[index]; int alias = alias_[index];
if (alias == exceptional_val) {
LOG(WARNING) << "WARNING: CustomSampler get alias " << exceptional_val;
return index;
}
return alias;
} else { } else {
return index; return index;
} }
......
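// Editor's note: the hunk above guards CustomSampler::Sample() against an
// uninitialized alias slot (-1). For context, a minimal, self-contained
// sketch of the alias method's sampling step under the same table layout
// (hypothetical names AliasSample etc.; not the Paddle implementation):
#include <random>
#include <vector>
// Draw from a discrete distribution given precomputed alias tables.
// alias_probs[i] decides between column i itself and its fallback;
// alias[i] is the fallback id, or -1 when column i needs no alias.
int AliasSample(const std::vector<float>& alias_probs,
const std::vector<int>& alias, std::mt19937* rng) {
std::uniform_int_distribution<int> pick_col(
0, static_cast<int>(alias_probs.size()) - 1);
std::uniform_real_distribution<float> unif(0.0f, 1.0f);
int index = pick_col(*rng);
float p = unif(*rng);
if (p > alias_probs[index]) {
int a = alias[index];
return (a == -1) ? index : a;  // guard: fall back to the column itself
}
return index;
}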
...@@ -116,6 +116,7 @@ class CustomSampler : public Sampler { ...@@ -116,6 +116,7 @@ class CustomSampler : public Sampler {
const float* alias_probs_; const float* alias_probs_;
const int* alias_; const int* alias_;
const float* probs_; const float* probs_;
const int exceptional_val = -1;
std::shared_ptr<std::mt19937> random_engine_; std::shared_ptr<std::mt19937> random_engine_;
std::shared_ptr<std::uniform_real_distribution<>> real_dist_; std::shared_ptr<std::uniform_real_distribution<>> real_dist_;
std::shared_ptr<std::uniform_int_distribution<>> int_dist_; std::shared_ptr<std::uniform_int_distribution<>> int_dist_;
......
...@@ -354,7 +354,7 @@ TEST(selected_rows_functor, cpu_merge_add_multi) { ...@@ -354,7 +354,7 @@ TEST(selected_rows_functor, cpu_merge_add_multi) {
auto* out_data = output->value().data<float>(); auto* out_data = output->value().data<float>();
for (size_t i = 0; i < ret_rows.size(); ++i) { for (size_t i = 0; i < ret_rows.size(); ++i) {
for (size_t j = 0; j < row_numel; ++j) { for (size_t j = 0; j < static_cast<size_t>(row_numel); ++j) {
EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]); EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]);
} }
} }
......
...@@ -301,7 +301,7 @@ TEST(selected_rows_functor, gpu_merge_add) { ...@@ -301,7 +301,7 @@ TEST(selected_rows_functor, gpu_merge_add) {
auto* out_data = output_cpu.data<float>(); auto* out_data = output_cpu.data<float>();
for (size_t i = 0; i < ret_rows.size(); ++i) { for (size_t i = 0; i < ret_rows.size(); ++i) {
for (size_t j = 0; j < row_numel; ++j) { for (size_t j = 0; j < static_cast<size_t>(row_numel); ++j) {
EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]); EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]);
} }
} }
......
...@@ -66,7 +66,7 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { ...@@ -66,7 +66,7 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) {
cpu_in_grad.set_lod(in_grad.lod()); cpu_in_grad.set_lod(in_grad.lod());
} }
EXPECT_EQ(in_grad.numel(), lod[0].back() * second_dim); EXPECT_EQ(in_grad.numel(), static_cast<int64_t>(lod[0].back() * second_dim));
EXPECT_EQ(in_grad.lod(), lod); EXPECT_EQ(in_grad.lod(), lod);
if (paddle::platform::is_cpu_place(*place)) { if (paddle::platform::is_cpu_place(*place)) {
......
...@@ -119,6 +119,11 @@ class NCEKernel : public framework::OpKernel<T> { ...@@ -119,6 +119,11 @@ class NCEKernel : public framework::OpKernel<T> {
PrepareSamples<DeviceContext, T>(context, sampler); PrepareSamples<DeviceContext, T>(context, sampler);
auto sample_labels = context.Output<Tensor>("SampleLabels"); auto sample_labels = context.Output<Tensor>("SampleLabels");
const int64_t *sample_labels_data = sample_labels->data<int64_t>(); const int64_t *sample_labels_data = sample_labels->data<int64_t>();
for (int x = 0; x < sample_labels->numel(); x++) {
PADDLE_ENFORCE_GE(sample_labels_data[x], 0, "nce sample label %d", x);
}
auto sample_out = context.Output<Tensor>("SampleLogits"); auto sample_out = context.Output<Tensor>("SampleLogits");
T *sample_out_data = sample_out->mutable_data<T>(context.GetPlace()); T *sample_out_data = sample_out->mutable_data<T>(context.GetPlace());
auto label = context.Input<Tensor>("Label"); auto label = context.Input<Tensor>("Label");
......
if(WITH_NGRAPH)
cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
cc_library(ngraph_engine SRCS ngraph_engine.cc DEPS ngraph_bridge framework_proto)
op_library(ngraph_engine_op DEPS ngraph_engine op_registry op_info device_context)
endif()
...@@ -17,39 +17,39 @@ limitations under the License. */ ...@@ -17,39 +17,39 @@ limitations under the License. */
#include <vector> #include <vector>
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
#include "paddle/fluid/framework/ngraph_bridge.h" #include "paddle/fluid/operators/ngraph/ngraph_bridge.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/ngraph/ngraph_ops.h" #include "paddle/fluid/operators/ngraph/ngraph_ops.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/ngraph_helper.h" #include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle { namespace paddle {
namespace framework { namespace operators {
namespace NG_OPS = paddle::operators::ngraphs; namespace NG_OPS = paddle::operators::ngraphs;
std::map<std::string, std::map<std::string,
std::function<void(const std::shared_ptr<OperatorBase>&, std::function<void(const std::shared_ptr<framework::OperatorBase>&,
std::shared_ptr<std::unordered_map< std::shared_ptr<std::unordered_map<
std::string, std::shared_ptr<ngraph::Node>>>)>> std::string, std::shared_ptr<ngraph::Node>>>)>>
NgraphBridge::NG_NODE_MAP = { NgraphBridge::NG_NODE_MAP = {
{"elementwise_add", NG_OPS::BuildElementwiseAddNode}, {"elementwise_add", NG_OPS::BuildElementwiseAddNode},
{"elementwise_add_grad", NG_OPS::BuildElementwiseAddGradNode}, {"elementwise_add_grad", NG_OPS::BuildElementwiseAddGradNode},
{"fill_constant", paddle::operators::ngraphs::BuildFillConstantNode}, {"fill_constant", NG_OPS::BuildFillConstantNode},
{"mean", paddle::operators::ngraphs::BuildMeanNode}, {"mean", NG_OPS::BuildMeanNode},
{"mean_grad", paddle::operators::ngraphs::BuildMeanGradNode}, {"mean_grad", NG_OPS::BuildMeanGradNode},
{"mul", paddle::operators::ngraphs::BuildMulNode}, {"mul", NG_OPS::BuildMulNode},
{"mul_grad", paddle::operators::ngraphs::BuildMulGradNode}, {"mul_grad", NG_OPS::BuildMulGradNode},
{"softmax", paddle::operators::ngraphs::BuildSoftmaxNode}, {"softmax", NG_OPS::BuildSoftmaxNode},
{"softmax_grad", paddle::operators::ngraphs::BuildSoftmaxGradNode}, {"softmax_grad", NG_OPS::BuildSoftmaxGradNode},
{"scale", paddle::operators::ngraphs::BuildScaleNode}, {"scale", NG_OPS::BuildScaleNode},
{"relu", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Relu>}, {"relu", NG_OPS::BuildUnaryNode<ngraph::op::Relu>},
{"tanh", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Tanh>}, {"tanh", NG_OPS::BuildUnaryNode<ngraph::op::Tanh>},
{"top_k", paddle::operators::ngraphs::BuildTopKNode}}; {"top_k", NG_OPS::BuildTopKNode}};
void NgraphBridge::BuildNgNode(const std::shared_ptr<OperatorBase>& op) { void NgraphBridge::BuildNgNode(
const std::shared_ptr<framework::OperatorBase>& op) {
auto& op_type = op->Type(); auto& op_type = op->Type();
NG_NODE_MAP[op_type](op, ngb_node_map_); NG_NODE_MAP[op_type](op, ngb_node_map_);
} }
} // namespace framework } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -21,16 +21,16 @@ limitations under the License. */ ...@@ -21,16 +21,16 @@ limitations under the License. */
#include "ngraph/node.hpp" #include "ngraph/node.hpp"
namespace paddle { #include "paddle/fluid/framework/operator.h"
namespace framework {
class OperatorBase; namespace paddle {
namespace operators {
class NgraphBridge { class NgraphBridge {
public: public:
static std::map< static std::map<
std::string, std::string,
std::function<void(const std::shared_ptr<OperatorBase>&, std::function<void(const std::shared_ptr<framework::OperatorBase>&,
std::shared_ptr<std::unordered_map< std::shared_ptr<std::unordered_map<
std::string, std::shared_ptr<ngraph::Node>>>)>> std::string, std::shared_ptr<ngraph::Node>>>)>>
NG_NODE_MAP; NG_NODE_MAP;
...@@ -41,7 +41,7 @@ class NgraphBridge { ...@@ -41,7 +41,7 @@ class NgraphBridge {
var_node_map) var_node_map)
: ngb_node_map_(var_node_map) {} : ngb_node_map_(var_node_map) {}
void BuildNgNode(const std::shared_ptr<OperatorBase>& op); void BuildNgNode(const std::shared_ptr<framework::OperatorBase>& op);
private: private:
std::shared_ptr< std::shared_ptr<
...@@ -49,5 +49,5 @@ class NgraphBridge { ...@@ -49,5 +49,5 @@ class NgraphBridge {
ngb_node_map_; ngb_node_map_;
}; };
} // namespace framework } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -16,22 +16,25 @@ limitations under the License. */ ...@@ -16,22 +16,25 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <map> #include <map>
#include <string>
#include <vector>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/ngraph_bridge.h" #include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/ngraph_operator.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/ngraph/ngraph_bridge.h"
#include "ngraph/ngraph.hpp" #include "paddle/fluid/operators/ngraph/ngraph_engine.h"
namespace paddle { namespace paddle {
namespace framework { namespace operators {
static ngraph::Shape Ddim2Shape(const DDim& dims) { static ngraph::Shape Ddim2Shape(const framework::DDim& dims) {
ngraph::Shape sp; ngraph::Shape sp;
for (int i = 0; i < dims.size(); ++i) { for (int i = 0; i < dims.size(); ++i) {
int k = dims[i]; int k = dims[i];
...@@ -41,117 +44,39 @@ static ngraph::Shape Ddim2Shape(const DDim& dims) { ...@@ -41,117 +44,39 @@ static ngraph::Shape Ddim2Shape(const DDim& dims) {
return sp; return sp;
} }
static std::map<proto::VarType::Type, ngraph::element::Type> pd2ng_type_map = { static std::map<framework::proto::VarType::Type, ngraph::element::Type>
{proto::VarType::FP32, ngraph::element::f32}, pd2ng_type_map = {
{proto::VarType::FP64, ngraph::element::f64}, {framework::proto::VarType::FP32, ngraph::element::f32},
{proto::VarType::INT32, ngraph::element::i32}, {framework::proto::VarType::FP64, ngraph::element::f64},
{proto::VarType::INT64, ngraph::element::i64}, {framework::proto::VarType::INT32, ngraph::element::i32},
{proto::VarType::BOOL, ngraph::element::boolean}, {framework::proto::VarType::INT64, ngraph::element::i64},
{framework::proto::VarType::BOOL, ngraph::element::boolean},
}; };
typedef enum { /* nGraph support state on ops */ std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
FULL_TRAIN, /* Support full ops for train */ NgraphEngine::func_cache_ = {};
PARTIAL_TRAIN, /* Support partial ops for train */
FULL_TEST, /* Support full list of ops for test */
PARTIAL_TEST /* Support partial list of ops for test */
} op_state;
// perform graph build through bridge and execute computation
class NgraphEngine {
public:
explicit NgraphEngine(const Scope& scope, const platform::Place& place,
const std::vector<std::shared_ptr<OperatorBase>>& ops,
const std::unordered_map<
std::string, ngraph::element::Type>& var_type_map,
const std::unordered_set<std::string>& persist,
const std::unordered_set<std::string>& fetches,
const std::unordered_set<std::string>& post_op_inputs,
op_state ng_op_state)
: scope_(scope),
place_(place),
fused_ops_(ops),
var_type_map_(var_type_map),
persistables_(persist),
fetches_(fetches),
post_op_inputs_(post_op_inputs),
ng_op_state_(ng_op_state) {
var_in_node_map_ = std::make_shared<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
var_node_map_ = std::make_shared<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
BuildNgIO();
GetNgFunction();
}
void Run(const Scope& scope, const platform::Place& place) const; std::shared_ptr<ngraph::runtime::Backend> NgraphEngine::backend_ =
ngraph::runtime::Backend::create("CPU");
private:
static std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
func_cache_;
const Scope& scope_;
const platform::Place& place_;
std::vector<std::shared_ptr<OperatorBase>> fused_ops_;
std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
std::unordered_set<std::string> persistables_;
std::unordered_set<std::string> fetches_;
std::unordered_set<std::string> post_op_inputs_;
op_state ng_op_state_;
// ngraph backend eg. CPU
static std::shared_ptr<ngraph::runtime::Backend> backend_;
// ngraph function to call and execute
std::shared_ptr<ngraph::Function> ngraph_function_;
// var_name of inputs
std::vector<std::string> var_in_;
// var_name of outputs from fetch in order
std::vector<std::string> var_out_;
// map input vars to nodes
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
var_in_node_map_;
// map each var name with a ngraph node
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
var_node_map_;
// cache key to check if function is cached
std::shared_ptr<std::string> GetCacheKey();
// get ngraph input and define ngraph input parameters
void GetNgInputShape(std::shared_ptr<OperatorBase> op);
// Call ngraph bridge to map ops
void BuildNgNodes();
// get the ngraph input and output var list
void BuildNgIO();
// build ngraph function call
void BuildNgFunction();
// Check cache for ngraph function or otherwise build the function
void GetNgFunction();
};
std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>> static std::vector<std::vector<int>> NgraphOpIntervals(
NgraphOperator::NgraphOpIntervals( framework::BlockDesc* block) {
std::vector<std::unique_ptr<paddle::framework::OperatorBase>>* ops) { std::vector<std::vector<int>> intervals;
std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>> auto ops = block->AllOps();
intervals; int size = ops.size();
if (ops->empty()) { int left = 0;
return intervals; while (left < size && ops.at(left)->Type() != framework::kFeedOpType) {
}
size_t size = ops->size();
size_t left = 0;
while (left < size && ops->at(left)->Type() != kFeedOpType) {
++left; ++left;
} }
if (left == size) { if (left == size) {
return intervals; return intervals;
} }
while (left < size && ops->at(left)->Type() == kFeedOpType) { while (left < size && ops.at(left)->Type() == framework::kFeedOpType) {
++left; ++left;
} }
size_t right = left; int right = left;
while (right < size && ops->at(right)->Type() != kFetchOpType) { while (right < size && ops.at(right)->Type() != framework::kFetchOpType) {
++right; ++right;
} }
if (right == size) { if (right == size) {
...@@ -160,66 +85,89 @@ NgraphOperator::NgraphOpIntervals( ...@@ -160,66 +85,89 @@ NgraphOperator::NgraphOpIntervals(
if (left >= right) return intervals; if (left >= right) return intervals;
// (left, right - 1) represents indices between feed and fetch // (left, right - 1) represents indices between feed and fetch
size_t pivot = left; int pivot = left;
while (pivot < right) { while (pivot < right) {
auto op_type = ops->at(pivot)->Type(); auto op_type = ops.at(pivot)->Type();
if (paddle::framework::NgraphBridge::NG_NODE_MAP.find(op_type) == if (NgraphBridge::NG_NODE_MAP.find(op_type) ==
paddle::framework::NgraphBridge::NG_NODE_MAP.end()) { NgraphBridge::NG_NODE_MAP.end()) {
++pivot; ++pivot;
} else { } else {
size_t start = pivot, end = start; int start = pivot, end = start;
while (pivot < right && while (pivot < right &&
(paddle::framework::NgraphBridge::NG_NODE_MAP.find( (NgraphBridge::NG_NODE_MAP.find(ops.at(pivot)->Type()) !=
ops->at(pivot)->Type()) != NgraphBridge::NG_NODE_MAP.end())) {
paddle::framework::NgraphBridge::NG_NODE_MAP.end())) {
++pivot; ++pivot;
++end; ++end;
} }
std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator> std::vector<int> interval = {start, end};
interval = {ops->begin() + start, ops->begin() + end};
intervals.push_back(interval); intervals.push_back(interval);
} }
} // end while } // end while
return intervals; return intervals;
} }
NgraphOperator::NgraphOperator( static void SubstituteNgraphOp(framework::BlockDesc* block,
const ProgramDesc& prog, size_t block_id, std::string block_str,
std::vector<std::unique_ptr<OperatorBase>>::iterator start, std::vector<int> interval) {
std::vector<std::unique_ptr<OperatorBase>>::iterator end, framework::ProgramDesc program;
const std::string& type, const VariableNameMap& inputs, block->RemoveOp(interval.at(0), interval.at(1));
const VariableNameMap& outputs, const AttributeMap& attrs) auto* ng_op = block->InsertOp(interval.at(0));
: OperatorBase(type, inputs, outputs, attrs), ng_op->SetType("ngraph_engine");
pdesc_(prog), ng_op->SetAttr("interval", interval);
block_(block_id) { ng_op->SetAttr("graph", block_str);
for (std::vector<std::unique_ptr<OperatorBase>>::iterator it = start; }
it != end; ++it) {
fused_ops_.push_back(std::move(*it));
}
for (std::vector<std::unique_ptr<OperatorBase>>::iterator it = end; // TODO(baojun-nervana): Move EnableNgraph to compile time per PR #15089
(*it)->Type() != kFetchOpType; ++it) { void NgraphEngine::EnableNgraph(const framework::ProgramDesc& program) {
for (auto& var_name_item : (*it)->Inputs()) { #ifdef PADDLE_WITH_NGRAPH
for (auto& var_name : var_name_item.second) { VLOG(4) << "use_ngraph=True";
post_op_inputs_.insert(var_name); for (size_t bid = 0; bid < program.Size(); ++bid) {
} // TODO(baojun-nervana): Remove the const_cast
auto* block =
const_cast<framework::ProgramDesc&>(program).MutableBlock(bid);
std::string block_str = block->Proto()->SerializeAsString();
auto intervals = NgraphOpIntervals(block);
for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) {
SubstituteNgraphOp(block, block_str, *it);
} }
} }
#else
LOG(WARNING)
<< "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option";
#endif
}
if ((*(start - 1))->Type() == kFeedOpType && (*end)->Type() == kFetchOpType) { NgraphEngine::NgraphEngine(const framework::Scope& scope,
is_full_ = true; const platform::Place& place,
} const std::string& serialized_graph,
const std::vector<int>& interval)
: scope_(scope), place_(place) {
var_in_node_map_ = std::make_shared<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
var_node_map_ = std::make_shared<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
Process(); func_cache_key_ = std::to_string(interval[0]) + std::to_string(interval[1]) +
serialized_graph;
framework::proto::BlockDesc bdesc;
bdesc.ParseFromString(serialized_graph);
framework::BlockDesc block(nullptr, &bdesc);
Prepare(block, interval);
BuildNgIO();
GetNgFunction();
} }
void NgraphOperator::Process() { void NgraphEngine::Prepare(const framework::BlockDesc& block,
auto& bdesc = pdesc_.Block(block_); const std::vector<int>& interval) {
for (auto& var : bdesc.AllVars()) { for (auto& var : block.AllVars()) {
if (!(var->GetType() == proto::VarType::SELECTED_ROWS || if (!(var->GetType() == framework::proto::VarType::SELECTED_ROWS ||
var->GetType() == proto::VarType::LOD_TENSOR || var->GetType() == framework::proto::VarType::LOD_TENSOR ||
var->GetType() == proto::VarType::LOD_TENSOR_ARRAY)) { var->GetType() == framework::proto::VarType::LOD_TENSOR_ARRAY)) {
continue; continue;
} }
...@@ -228,7 +176,8 @@ void NgraphOperator::Process() { ...@@ -228,7 +176,8 @@ void NgraphOperator::Process() {
continue; continue;
} }
if (var_name != "fetch" && var_name != "feed") { if (var_name != framework::kFeedOpType &&
var_name != framework::kFetchOpType) {
auto pd_type = var->GetDataType(); auto pd_type = var->GetDataType();
if (pd2ng_type_map.find(pd_type) == pd2ng_type_map.end()) { if (pd2ng_type_map.find(pd_type) == pd2ng_type_map.end()) {
PADDLE_THROW("Data type of var %s not found in pd2ng_type_map", PADDLE_THROW("Data type of var %s not found in pd2ng_type_map",
...@@ -242,53 +191,66 @@ void NgraphOperator::Process() { ...@@ -242,53 +191,66 @@ void NgraphOperator::Process() {
} }
} }
for (auto* op : bdesc.AllOps()) { auto ops_desc = block.AllOps();
if (op->Type() == kFetchOpType) { int idx = interval[0];
std::string fetch_target_name = op->Input("X")[0]; while (idx < interval[1]) {
fetches_.insert(fetch_target_name); auto op_desc = ops_desc.at(idx);
} auto op = framework::OpRegistry::CreateOp(*op_desc);
fused_ops_.push_back(std::move(op));
++idx;
} }
}
void NgraphOperator::RunImpl(const Scope& scope, while (ops_desc.at(idx)->Type() != framework::kFetchOpType) {
const platform::Place& place) const { auto op_desc = ops_desc.at(idx);
op_state ng_op_state = PARTIAL_TEST; for (auto& var_name_item : op_desc->Inputs()) {
auto& bdesc = pdesc_.Block(block_); for (auto& var_name : var_name_item.second) {
for (auto* op : bdesc.AllOps()) { post_op_inputs_.insert(var_name);
if (op->Type().find("_grad") != std::string::npos) { }
ng_op_state = PARTIAL_TRAIN;
break;
} }
++idx;
} }
if (is_full_) { while (idx < static_cast<int>(ops_desc.size()) &&
ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN; ops_desc.at(idx)->Type() == framework::kFetchOpType) {
std::string fetch_target_name = ops_desc.at(idx)->Input("X")[0];
fetches_.insert(fetch_target_name);
++idx;
} }
NgraphEngine ngraph_engine(scope, place, fused_ops_, var_type_map_, if (ops_desc.at(interval.at(0) - 1)->Type() == framework::kFeedOpType &&
persistables_, fetches_, post_op_inputs_, ops_desc.at(interval.at(1))->Type() == framework::kFetchOpType) {
ng_op_state); ng_op_state_ = OpState::FULL;
ngraph_engine.Run(scope, place); }
}
std::unordered_map<std::string, std::shared_ptr<ngraph::Function>> for (auto* op_desc : ops_desc) {
NgraphEngine::func_cache_ = {}; if (op_desc->Type().find("_grad") != std::string::npos) {
ng_op_state_ = ng_op_state_ == OpState::FULL ? OpState::FULL_TRAIN
: OpState::PARTIAL_TRAIN;
break;
}
}
std::shared_ptr<ngraph::runtime::Backend> NgraphEngine::backend_ = if (ng_op_state_ != OpState::FULL_TRAIN &&
ngraph::runtime::Backend::create("CPU"); ng_op_state_ != OpState::PARTIAL_TRAIN) {
ng_op_state_ = ng_op_state_ == OpState::FULL ? OpState::FULL_TEST
: OpState::PARTIAL_TEST;
}
}
void NgraphEngine::GetNgInputShape(std::shared_ptr<OperatorBase> op) { void NgraphEngine::GetNgInputShape(
RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_); std::shared_ptr<framework::OperatorBase> op) {
framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_);
op->RuntimeInferShape(scope_, place_, ctx); op->RuntimeInferShape(scope_, place_, ctx);
for (auto& var_name_item : op->Inputs()) { for (auto& var_name_item : op->Inputs()) {
for (auto& var_name : var_name_item.second) { for (auto& var_name : var_name_item.second) {
auto* var = scope_.FindVar(var_name); auto* var = scope_.FindVar(var_name);
if (var && var->IsType<LoDTensor>()) { if (var && var->IsType<framework::LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto sp = Ddim2Shape(tensor_pd->dims()); auto sp = Ddim2Shape(tensor_pd->dims());
if (std::find(var_in_.begin(), var_in_.end(), var_name) != if (std::find(var_in_.begin(), var_in_.end(), var_name) !=
var_in_.end()) { var_in_.end()) {
if (var_node_map_->find(var_name) == var_node_map_->end()) { if (var_node_map_->find(var_name) == var_node_map_->end()) {
// auto ng_type = pd2ng_type_map.at(GetDataTypeOfVar(var));
auto ng_type = var_type_map_.at(var_name); auto ng_type = var_type_map_.at(var_name);
auto prm = auto prm =
std::make_shared<ngraph::op::Parameter>(ng_type, sp, true); std::make_shared<ngraph::op::Parameter>(ng_type, sp, true);
...@@ -302,22 +264,25 @@ void NgraphEngine::GetNgInputShape(std::shared_ptr<OperatorBase> op) { ...@@ -302,22 +264,25 @@ void NgraphEngine::GetNgInputShape(std::shared_ptr<OperatorBase> op) {
} }
void NgraphEngine::BuildNgNodes() { void NgraphEngine::BuildNgNodes() {
for (auto& var_name : var_out_) { for (auto& op : fused_ops_) {
if (var_node_map_->find(var_name) == var_node_map_->end()) { for (auto& var_name_item : op->Outputs()) {
auto* var = scope_.FindVar(var_name); for (auto& var_name : var_name_item.second) {
if (var && var->IsType<LoDTensor>()) { if (var_node_map_->find(var_name) == var_node_map_->end()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); auto* var = scope_.FindVar(var_name);
auto& ddim = tensor_pd->dims(); if (var && var->IsType<framework::LoDTensor>()) {
auto ng_shape = Ddim2Shape(ddim); auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto ng_type = var_type_map_.at(var_name); auto& ddim = tensor_pd->dims();
auto prm = auto ng_shape = Ddim2Shape(ddim);
std::make_shared<ngraph::op::Parameter>(ng_type, ng_shape, true); auto ng_type = var_type_map_.at(var_name);
(*var_node_map_)[var_name] = prm; auto prm = std::make_shared<ngraph::op::Parameter>(ng_type,
ng_shape, true);
(*var_node_map_)[var_name] = prm;
}
}
} }
} }
} }
NgraphBridge ngb(var_node_map_);
paddle::framework::NgraphBridge ngb(var_node_map_);
for (auto& op : fused_ops_) { for (auto& op : fused_ops_) {
ngb.BuildNgNode(op); ngb.BuildNgNode(op);
} }
...@@ -363,25 +328,25 @@ void NgraphEngine::BuildNgIO() { ...@@ -363,25 +328,25 @@ void NgraphEngine::BuildNgIO() {
op->Type()); op->Type());
for (auto& var_name : var_name_item.second) { for (auto& var_name : var_name_item.second) {
switch (ng_op_state_) { switch (ng_op_state_) {
case PARTIAL_TEST: case OpState::PARTIAL_TEST:
if (post_op_inputs_.find(var_name) != post_op_inputs_.end() || if (post_op_inputs_.find(var_name) != post_op_inputs_.end() ||
fetches_.find(var_name) != fetches_.end()) { fetches_.find(var_name) != fetches_.end()) {
var_out_.push_back(var_name); var_out_.push_back(var_name);
} }
break; break;
case FULL_TEST: case OpState::FULL_TEST:
if (fetches_.find(var_name) != fetches_.end()) { if (fetches_.find(var_name) != fetches_.end()) {
var_out_.push_back(var_name); var_out_.push_back(var_name);
} }
break; break;
case PARTIAL_TRAIN: case OpState::PARTIAL_TRAIN:
if (fetches_.find(var_name) != fetches_.end() || if (fetches_.find(var_name) != fetches_.end() ||
post_op_inputs_.find(var_name) != post_op_inputs_.end() || post_op_inputs_.find(var_name) != post_op_inputs_.end() ||
persistables_.find(var_name) != persistables_.end()) { persistables_.find(var_name) != persistables_.end()) {
var_out_.push_back(var_name); var_out_.push_back(var_name);
} }
break; break;
case FULL_TRAIN: case OpState::FULL_TRAIN:
if (fetches_.find(var_name) != fetches_.end() || if (fetches_.find(var_name) != fetches_.end() ||
persistables_.find(var_name) != persistables_.end()) { persistables_.find(var_name) != persistables_.end()) {
var_out_.push_back(var_name); var_out_.push_back(var_name);
...@@ -416,50 +381,30 @@ void NgraphEngine::BuildNgFunction() { ...@@ -416,50 +381,30 @@ void NgraphEngine::BuildNgFunction() {
std::make_shared<ngraph::Function>(func_outputs, func_inputs); std::make_shared<ngraph::Function>(func_outputs, func_inputs);
} }
std::shared_ptr<std::string> NgraphEngine::GetCacheKey() {
auto cache_key = std::make_shared<std::string>("");
*cache_key += std::to_string(fused_ops_.size());
for (auto& op : fused_ops_) {
*cache_key += op->Type();
}
for (auto& var_name : var_in_) {
auto shape = var_node_map_->at(var_name)->get_shape();
*cache_key += var_name;
*cache_key += var_type_map_.at(var_name).c_type_string();
for (size_t i = 0; i < shape.size(); ++i) {
*cache_key += std::to_string(shape.at(i));
}
}
for (auto& var_name : var_out_) {
auto* var = scope_.FindVar(var_name);
if (var && var->IsType<LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto& ddim = tensor_pd->dims();
for (int i = 0; i < ddim.size(); ++i) {
*cache_key += std::to_string(ddim[i]);
}
}
}
return cache_key;
}
void NgraphEngine::GetNgFunction() { void NgraphEngine::GetNgFunction() {
bool cache_on = true; bool cache_on = true;
if (cache_on) { if (cache_on) {
std::string cache_key_val = *GetCacheKey(); std::string input_shape_str;
if (func_cache_.find(cache_key_val) != func_cache_.end()) { for (auto& var_name : var_in_) {
ngraph_function_ = func_cache_.at(cache_key_val); auto shape = var_node_map_->at(var_name)->get_shape();
for (size_t i = 0; i < shape.size(); ++i) {
input_shape_str += std::to_string(shape.at(i));
}
}
    func_cache_key_ = input_shape_str + func_cache_key_;
    if (func_cache_.find(func_cache_key_) != func_cache_.end()) {
      ngraph_function_ = func_cache_.at(func_cache_key_);
    } else {
      BuildNgFunction();
      func_cache_[func_cache_key_] = ngraph_function_;
    }
  } else {
    BuildNgFunction();
  }
}
void NgraphEngine::Run(const framework::Scope& scope,
                       const platform::Place& place) const {
  std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_in;
  std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_out;
...@@ -468,37 +413,35 @@ void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const {
    auto sp = var_node_map_->at(vi)->get_shape();
    std::shared_ptr<ngraph::runtime::Tensor> ti;
    auto* var = scope.FindVar(vi);
    if (var && var->IsType<framework::LoDTensor>()) {
      auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
      PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()),
                     "Ensure ngraph tensor layout aligns with the paddle tensor");
      auto ng_type = var_type_map_.at(vi);
      if (ng_type == ngraph::element::f32) {
        auto pd_arr = tensor_pd->mutable_data<float>(place);
        ti = backend_->create_tensor(ngraph::element::f32, sp, pd_arr);
      } else if (ng_type == ngraph::element::i32) {
        const int* arr = tensor_pd->data<int>();
        ti = backend_->create_tensor(ngraph::element::i32, sp,
                                     const_cast<int*>(arr));
      } else if (ng_type == ngraph::element::i64) {
        auto pd_arr = tensor_pd->mutable_data<int64_t>(place);
        ti = backend_->create_tensor(ngraph::element::i64, sp, pd_arr);
      } else if (ng_type == ngraph::element::f64) {
        auto pd_arr = tensor_pd->mutable_data<double>(place);
        ti = backend_->create_tensor(ngraph::element::f64, sp, pd_arr);
      } else if (ng_type == ngraph::element::boolean) {
        auto pd_arr = tensor_pd->mutable_data<bool>(place);
        ti = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr);
      } else {
        PADDLE_THROW("Data type not handled for var %s", vi);
      }
    } else {
      PADDLE_THROW("Cannot find var or tensor with var name %s", vi);
    }
    bool is_test = (ng_op_state_ == OpState::PARTIAL_TEST ||
                    ng_op_state_ == OpState::FULL_TEST);
    bool is_persistable =
...@@ -510,36 +453,39 @@ void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const {
  }
  for (size_t i = 0; i < var_out_.size(); ++i) {
    auto vo = var_out_[i];
    auto* var = scope.FindVar(vo);
    std::shared_ptr<ngraph::runtime::Tensor> to;
    if (var && var->IsType<framework::LoDTensor>()) {
      auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
      auto dd = tensor_pd->dims();
      ngraph::Shape sp = Ddim2Shape(dd);
      auto ng_type = var_type_map_.at(vo);
      if (ng_type == ngraph::element::f32) {
        auto pd_arr = tensor_pd->mutable_data<float>(place);
        to = backend_->create_tensor(ng_type, sp, pd_arr);
      } else if (ng_type == ngraph::element::i64) {
        auto pd_arr = tensor_pd->mutable_data<int64_t>(place);
        to = backend_->create_tensor(ng_type, sp, pd_arr);
      } else if (ng_type == ngraph::element::i32) {
        auto pd_arr = tensor_pd->mutable_data<int>(place);
        to = backend_->create_tensor(ng_type, sp, pd_arr);
      } else if (ng_type == ngraph::element::f64) {
        auto pd_arr = tensor_pd->mutable_data<double>(place);
        to = backend_->create_tensor(ng_type, sp, pd_arr);
      } else if (ng_type == ngraph::element::boolean) {
        auto pd_arr = tensor_pd->mutable_data<bool>(place);
        to = backend_->create_tensor(ng_type, sp, pd_arr);
      } else {
        PADDLE_THROW("Data type not handled for var %s", vo);
      }
      t_out.push_back(to);
    } else {
      PADDLE_THROW("Cannot find var or tensor with var name %s", vo);
    }
  }
  backend_->call(backend_->compile(ngraph_function_), t_out, t_in);
} // NgraphEngine::Run
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "ngraph/ngraph.hpp"
namespace paddle {
namespace operators {
enum class OpState { /* nGraph support state on ops */
FULL_TRAIN, /* Support full ops for train */
PARTIAL_TRAIN, /* Support partial ops for train */
FULL_TEST, /* Support full list of ops for test */
PARTIAL_TEST, /* Support partial list of ops for test */
FULL, /* All ops supported from feed to fetch */
UNKNOWN /* Output all for debug purpose */
};
// perform graph build through bridge and execute computation
class NgraphEngine {
public:
explicit NgraphEngine(const framework::Scope& scope,
const platform::Place& place,
const std::string& serialized_graph,
const std::vector<int>& interval);
void Run(const framework::Scope& scope, const platform::Place& place) const;
static void EnableNgraph(const framework::ProgramDesc& program);
private:
static std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
func_cache_;
const framework::Scope& scope_;
const platform::Place& place_;
std::vector<std::shared_ptr<framework::OperatorBase>> fused_ops_;
std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
std::unordered_set<std::string> persistables_;
std::unordered_set<std::string> fetches_;
std::unordered_set<std::string> post_op_inputs_;
OpState ng_op_state_ = OpState::UNKNOWN;
std::string func_cache_key_;
// ngraph backend, e.g. CPU
static std::shared_ptr<ngraph::runtime::Backend> backend_;
// ngraph function to call and execute
std::shared_ptr<ngraph::Function> ngraph_function_;
// var_name of inputs
std::vector<std::string> var_in_;
// var_name of outputs from fetch in order
std::vector<std::string> var_out_;
// map input vars to nodes
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
var_in_node_map_;
// map each var name with a ngraph node
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
var_node_map_;
// prepare info for ngraph engine
void Prepare(const framework::BlockDesc& block,
const std::vector<int>& interval);
// get ngraph input and define ngraph input parameters
void GetNgInputShape(std::shared_ptr<framework::OperatorBase> op);
// Call ngraph bridge to map ops
void BuildNgNodes();
// get the ngraph input and output var list
void BuildNgIO();
// build ngraph function call
void BuildNgFunction();
// Check cache for ngraph function or otherwise build the function
void GetNgFunction();
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/ngraph/ngraph_engine_op.h"
namespace paddle {
namespace operators {
class NgraphEngineOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Xs", "A list of inputs.").AsDispensable();
AddOutput("Ys", "A list of outputs").AsDispensable();
AddAttr<std::string>("graph", "the graph.");
AddAttr<std::vector<int>>("interval", "op interval supported by ngraph");
AddComment("ngraph engine operator.");
}
};
class NgraphEngineInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(ngraph_engine, ops::NgraphEngineOp, ops::NgraphEngineOpMaker,
ops::NgraphEngineOpMaker);
REGISTER_OP_CPU_KERNEL(
ngraph_engine,
ops::NgraphEngineKernel<paddle::platform::CPUDeviceContext, float>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/operators/ngraph/ngraph_engine.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace operators {
class NgraphEngineOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
framework::OpKernelType kt = framework::OpKernelType(
framework::proto::VarType::FP32, ctx.GetPlace());
return kt;
}
};
template <typename DeviceContext, typename T>
class NgraphEngineKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& scope = ctx.scope();
auto place = ctx.GetPlace();
std::string serialized_graph = ctx.Attr<std::string>("graph");
auto interval = ctx.Attr<std::vector<int>>("interval");
NgraphEngine ngraph_engine(scope, place, serialized_graph, interval);
ngraph_engine.Run(scope, place);
}
};
} // namespace operators
} // namespace paddle
...@@ -41,13 +41,19 @@ class CreateCTRReaderOp : public framework::OperatorBase {
    auto* queue_holder =
        queue_holder_var->template GetMutable<LoDTensorBlockingQueueHolder>();
    auto thread_num = Attr<int>("thread_num");
    auto sparse_slots = Attr<std::vector<std::string>>("sparse_slots");
    auto dense_slot_index = Attr<std::vector<int>>("dense_slot_index");
    auto sparse_slot_index = Attr<std::vector<int>>("sparse_slot_index");
    auto batch_size = Attr<int>("batch_size");
    auto file_type = Attr<std::string>("file_type");
    auto file_format = Attr<std::string>("file_format");
    auto file_list = Attr<std::vector<std::string>>("file_list");
    DataDesc data_desc(batch_size, file_list, file_type, file_format,
                       dense_slot_index, sparse_slot_index, sparse_slots);
    VLOG(1) << data_desc;
    out->Reset(std::make_shared<CTRReader>(queue_holder->GetQueue(), thread_num,
                                           data_desc));
  }
};
...@@ -58,10 +64,22 @@ class CreateCTRReaderOpMaker : public FileReaderMakerBase {
             "Name of the `LoDTensorBlockingQueueHolder` variable");
    AddAttr<int>("thread_num", "the thread num to read data");
    AddAttr<int>("batch_size", "the batch size of read data");
    AddAttr<std::string>("file_type", "plain or gzip").SetDefault("plain");
    AddAttr<std::string>("file_format", "svm or csv").SetDefault("csv");
    AddAttr<std::vector<std::string>>("file_list",
                                      "The list of files that need to be read");
    AddAttr<std::vector<int>>(
        "dense_slot_index",
        "the dense slot ids that should be extracted from the file")
        .SetDefault({});
    AddAttr<std::vector<int>>(
        "sparse_slot_index",
        "the sparse slot ids that should be extracted from the file")
        .SetDefault({});
    AddAttr<std::vector<std::string>>("sparse_slots",
                                      "the sparse slot ids that should be "
                                      "extracted from the file, used when the "
                                      "file format is svm");
    AddComment(R"DOC(
      Create CTRReader to support reading ctr data with cpp.
...
...@@ -73,6 +73,9 @@ static inline void parse_line(
  }
}
// label slot1:fea_sign slot2:fea_sign slot1:fea_sign
static inline void parse_svm_line(const std::string& line) {}
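The comment above documents the svm line layout, but parse_svm_line is left as an empty stub in this change. A minimal sketch of what such a parser could look like, assuming the string_split helper used by parse_csv_line later in this file; the name parse_svm_line_sketch and its signature are illustrative only, not part of the change:
static inline void parse_svm_line_sketch(
    const std::string& line, int64_t* label,
    std::unordered_map<std::string, std::vector<int64_t>>* slot_to_data) {
  // Hypothetical sketch. Layout: "label slot1:fea_sign slot2:fea_sign ...",
  // tokens separated by ' ', each token after the label is slot:feasign.
  std::vector<std::string> ret;
  string_split(line, ' ', &ret);
  *label = std::stol(ret[0]);
  for (size_t i = 1; i < ret.size(); ++i) {
    size_t pos = ret[i].find(':');
    if (pos == std::string::npos) continue;  // skip malformed tokens
    const std::string slot = ret[i].substr(0, pos);
    (*slot_to_data)[slot].push_back(std::stol(ret[i].substr(pos + 1)));
  }
}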
class Reader {
 public:
  virtual ~Reader() {}
...@@ -95,11 +98,27 @@ class GzipReader : public Reader {
  igzstream gzstream_;
};
class PlainFileReader : public Reader {
 public:
  explicit PlainFileReader(const std::string& file_name)
      : stream_(file_name.c_str()) {}
  ~PlainFileReader() {}
  bool HasNext() override { return stream_.peek() != EOF; }
  void NextLine(std::string* line) override { std::getline(stream_, *line); }
 private:
  std::ifstream stream_;
};
template <typename SingleFileReader>
class MultiFileReader : public Reader {
 public:
  explicit MultiFileReader(const std::vector<std::string>& file_list) {
    for (auto& file : file_list) {
      readers_.emplace_back(std::make_shared<SingleFileReader>(file));
    }
  }
...@@ -119,46 +138,35 @@ class MultiGzipReader : public Reader {
  }
 private:
  std::vector<std::shared_ptr<SingleFileReader>> readers_;
  size_t current_reader_index_ = 0;
};
void MonitorThread(std::vector<ReaderThreadStatus>* thread_status,
                   std::shared_ptr<LoDTensorBlockingQueue> queue) {
  VLOG(3) << "monitor thread in";
  bool reader_thread_is_running = true;
  while (reader_thread_is_running) {
    VLOG(3) << "reader_thread_is_running";
    reader_thread_is_running = false;
    for (size_t i = 0; i < (*thread_status).size(); ++i) {
      if ((*thread_status)[i] == Running) {
        VLOG(3) << "reader is running!";
        reader_thread_is_running = true;
      }
    }
    std::this_thread::sleep_for(std::chrono::milliseconds(1000));
  }
  VLOG(3) << "all reader threads are stopped, close the queue";
  queue->Close();
  VLOG(3) << "monitor thread exited";
}
void ReadSvmData(const DataDesc& data_desc, std::shared_ptr<Reader> reader,
                 std::shared_ptr<LoDTensorBlockingQueue> queue) {
  std::unordered_map<std::string, size_t> slot_to_index;
  for (size_t i = 0; i < data_desc.sparse_slot_ids_.size(); ++i) {
    slot_to_index[data_desc.sparse_slot_ids_[i]] = i;
  }
  std::string line;
...@@ -166,21 +174,17 @@ void ReadThread(const std::vector<std::string>& file_list,
  std::vector<std::unordered_map<std::string, std::vector<int64_t>>> batch_data;
  std::vector<int64_t> batch_label;
  while (reader->HasNext()) {
    batch_data.clear();
    batch_data.reserve(data_desc.batch_size_);
    batch_label.clear();
    batch_label.reserve(data_desc.batch_size_);
    // read batch_size data
    for (int i = 0; i < data_desc.batch_size_; ++i) {
      if (reader->HasNext()) {
        reader->NextLine(&line);
        std::unordered_map<std::string, std::vector<int64_t>> slot_to_data;
        int64_t label;
        parse_line(line, slot_to_index, &label, &slot_to_data);
...@@ -193,8 +197,8 @@ void ReadThread(const std::vector<std::string>& file_list,
    std::vector<framework::LoDTensor> lod_datas;
    // first insert a tensor for each sparse slot
    for (auto& slot : data_desc.sparse_slot_ids_) {
      std::vector<size_t> lod_data{0};
      std::vector<int64_t> batch_feasign;
...@@ -226,11 +230,167 @@ void ReadThread(const std::vector<std::string>& file_list,
    lod_datas.push_back(label_tensor);
    queue->Push(lod_datas);
    VLOG(4) << "push one data, queue_size=" << queue->Size();
  }
}
// label dense_fea,dense_fea sparse_fea,sparse_fea
static inline void parse_csv_line(
const std::string& line, const DataDesc& data_desc, int64_t* label,
std::vector<std::vector<float>>* dense_datas,
std::vector<std::vector<int64_t>>* sparse_datas) {
std::vector<std::string> ret;
string_split(line, ' ', &ret);
*label = std::stol(ret[0]);
dense_datas->resize(data_desc.dense_slot_index_.size());
for (size_t i = 0; i < data_desc.dense_slot_index_.size(); ++i) {
int slot_idx = data_desc.dense_slot_index_[i];
auto& slot_data = ret[slot_idx];
std::vector<std::string> data_in_slot_str;
string_split(slot_data, ',', &data_in_slot_str);
for (auto& data_str : data_in_slot_str) {
(*dense_datas)[i].push_back(std::stof(data_str));
}
}
sparse_datas->resize(data_desc.sparse_slot_index_.size());
for (size_t i = 0; i < data_desc.sparse_slot_index_.size(); ++i) {
int slot_idx = data_desc.sparse_slot_index_[i];
auto& slot_data = ret[slot_idx];
std::vector<std::string> data_in_slot_str;
string_split(slot_data, ',', &data_in_slot_str);
for (auto& data_str : data_in_slot_str) {
auto id = std::stol(data_str);
(*sparse_datas)[i].push_back(id);
}
}
}
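For concreteness, with dense_slot_index = {1} and sparse_slot_index = {2} (the configuration the csv unit test later in this change uses), a line parses as follows; this trace is illustrative only:
// Illustrative trace of parse_csv_line (not part of the change):
//   line  = "1 1.1,1.1 1,1,1,1"
//   label = 1                            // column 0
//   (*dense_datas)[0]  = {1.1f, 1.1f}    // column 1, split on ','
//   (*sparse_datas)[0] = {1, 1, 1, 1}    // column 2, split on ','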
void ReadCsvData(const DataDesc& data_desc, std::shared_ptr<Reader> reader,
std::shared_ptr<LoDTensorBlockingQueue> queue) {
std::string line;
while (reader->HasNext()) {
std::vector<int64_t> batch_label;
batch_label.reserve(data_desc.batch_size_);
std::vector<std::vector<std::vector<float>>> batch_dense_data;
batch_dense_data.reserve(data_desc.batch_size_);
std::vector<std::vector<std::vector<int64_t>>> batch_sparse_data;
batch_sparse_data.reserve(data_desc.batch_size_);
// read batch_size data
for (int i = 0; i < data_desc.batch_size_; ++i) {
if (reader->HasNext()) {
reader->NextLine(&line);
int64_t label;
std::vector<std::vector<float>> dense_datas;
std::vector<std::vector<int64_t>> sparse_datas;
parse_csv_line(line, data_desc, &label, &dense_datas, &sparse_datas);
batch_label.push_back(label);
if (!batch_dense_data.empty()) {
PADDLE_ENFORCE_EQ(batch_dense_data[0].size(), dense_datas.size(),
"dense data should have the same shape");
}
batch_dense_data.push_back(dense_datas);
batch_sparse_data.push_back(sparse_datas);
} else {
break;
}
}
// the order of output data is label, dense_datas, sparse_datas
std::vector<framework::LoDTensor> lod_datas;
// insert label tensor
framework::LoDTensor label_tensor;
auto* label_tensor_data = label_tensor.mutable_data<int64_t>(
framework::make_ddim({static_cast<int64_t>(batch_label.size()), 1}),
platform::CPUPlace());
memcpy(label_tensor_data, batch_label.data(),
batch_label.size() * sizeof(int64_t));
lod_datas.push_back(label_tensor);
// insert tensor for each dense_slots
for (size_t i = 0; i < data_desc.dense_slot_index_.size(); ++i) {
framework::LoDTensor lod_tensor;
size_t width = batch_dense_data[0][i].size();
auto* tensor_data = lod_tensor.mutable_data<float>(
framework::make_ddim(
{static_cast<int64_t>(batch_dense_data.size()), // batch_size
static_cast<int64_t>(width)}),
platform::CPUPlace());
for (size_t j = 0; j < batch_dense_data.size(); ++j) {
auto& dense_data_row = batch_dense_data[j][i];
memcpy(tensor_data + j * width, dense_data_row.data(),
width * sizeof(float));
}
lod_datas.push_back(lod_tensor);
}
// insert tensor for each sparse_slots
for (size_t i = 0; i < data_desc.sparse_slot_index_.size(); ++i) {
std::vector<size_t> lod_data{0};
std::vector<int64_t> batch_feasign;
for (size_t row_idx = 0; row_idx < batch_sparse_data.size(); ++row_idx) {
auto& sparse_ids = batch_sparse_data[row_idx][i];
lod_data.push_back(lod_data.back() + sparse_ids.size());
batch_feasign.insert(batch_feasign.end(), sparse_ids.begin(),
sparse_ids.end());
}
framework::LoDTensor lod_tensor;
framework::LoD lod{lod_data};
lod_tensor.set_lod(lod);
int64_t* tensor_data = lod_tensor.mutable_data<int64_t>(
framework::make_ddim({static_cast<int64_t>(batch_feasign.size()), 1}),
platform::CPUPlace());
memcpy(tensor_data, batch_feasign.data(),
batch_feasign.size() * sizeof(int64_t));
lod_datas.push_back(lod_tensor);
}
queue->Push(lod_datas);
VLOG(4) << "push one data, queue_size=" << queue->Size();
}
}
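A brief illustration of the LoD layout the sparse-slot loop above produces, assuming a batch of two rows with four ids each (values mirror the csv test data further below):
// Illustration only:
//   batch_sparse_data[0][i] = {1,1,1,1}, batch_sparse_data[1][i] = {2,2,2,2}
//   lod_data      -> {0, 4, 8}           // cumulative row offsets
//   batch_feasign -> {1,1,1,1,2,2,2,2}   // all ids flattened into one column
//   tensor dims   -> {8, 1}, lod -> {{0, 4, 8}}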
void ReadThread(const std::vector<std::string>& file_list,
const DataDesc& data_desc, int thread_id,
std::vector<ReaderThreadStatus>* thread_status,
std::shared_ptr<LoDTensorBlockingQueue> queue) {
VLOG(3) << "[" << thread_id << "]"
<< " reader thread start! thread_id = " << thread_id;
for (auto& file : file_list) {
VLOG(3) << "[" << thread_id << "]"
<< " file " << file;
}
(*thread_status)[thread_id] = Running;
VLOG(3) << "set status to running";
std::shared_ptr<Reader> reader;
if (data_desc.file_type_ == "gzip") {
reader.reset(new MultiFileReader<GzipReader>(file_list));
} else if (data_desc.file_type_ == "plain") {
reader.reset(new MultiFileReader<PlainFileReader>(file_list));
} else {
PADDLE_THROW("do not support file format %s", data_desc.file_type_);
}
VLOG(3) << "reader inited";
if (data_desc.file_format_ == "svm") {
ReadSvmData(data_desc, reader, queue);
} else if (data_desc.file_format_ == "csv") {
ReadCsvData(data_desc, reader, queue);
  }
  (*thread_status)[thread_id] = Stopped;
  VLOG(3) << "set status to stopped, thread " << thread_id << " exited";
}
} // namespace reader
...
...@@ -36,9 +36,63 @@ namespace reader {
enum ReaderThreadStatus { Running, Stopped };
struct DataDesc {
DataDesc(int batch_size, const std::vector<std::string>& file_names,
const std::string& file_type, const std::string& file_format,
const std::vector<int>& dense_slot_index,
const std::vector<int>& sparse_slot_index,
const std::vector<std::string>& sparse_slot_ids)
: batch_size_(batch_size),
file_names_(file_names),
file_type_(file_type),
file_format_(file_format),
dense_slot_index_(dense_slot_index),
sparse_slot_index_(sparse_slot_index),
sparse_slot_ids_(sparse_slot_ids) {}
const int batch_size_;
const std::vector<std::string> file_names_;
const std::string file_type_; // gzip or plain
const std::string file_format_; // csv or svm
// used for csv data format
const std::vector<int> dense_slot_index_;
const std::vector<int> sparse_slot_index_;
// used for svm data format
const std::vector<std::string> sparse_slot_ids_;
};
inline std::ostream& operator<<(std::ostream& os, const DataDesc& data_desc) {
os << "data_desc:\n";
os << "\tbatch_size -> " << data_desc.batch_size_ << "\n";
os << "\tfile_type -> " << data_desc.file_type_ << "\n";
os << "\tfile_format -> " << data_desc.file_format_ << "\n";
os << "\tfile_names -> {";
for (auto& file_name : data_desc.file_names_) {
os << file_name << ",";
}
os << "}\n";
os << "\tdense_slot_index -> {";
for (auto& slot : data_desc.dense_slot_index_) {
os << slot << ",";
}
os << "}\n";
os << "\tsparse_slot_index_ -> {";
for (auto& slot : data_desc.sparse_slot_index_) {
os << slot << ",";
}
os << "}\n";
os << "\tsparse_slot_ids_ -> {";
for (auto& slot : data_desc.sparse_slot_ids_) {
os << slot << ",";
}
os << "}\n";
return os;
}
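A DataDesc is just the bundle of reader settings; a usage sketch mirroring the csv unit test later in this change (the file name is hypothetical):
// Usage sketch only: plain csv file, dense features in column 1,
// sparse ids in column 2, no svm slot ids.
DataDesc desc(/*batch_size=*/3, {"part-000.csv"}, /*file_type=*/"plain",
              /*file_format=*/"csv", /*dense_slot_index=*/{1},
              /*sparse_slot_index=*/{2}, /*sparse_slot_ids=*/{});
VLOG(1) << desc;  // prints via the operator<< defined above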
void ReadThread(const std::vector<std::string>& file_list,
                const DataDesc& data_desc, int thread_id,
                std::vector<ReaderThreadStatus>* thread_status,
                std::shared_ptr<LoDTensorBlockingQueue> queue);
// monitor all running threads; if they are all stopped,
...@@ -48,15 +102,15 @@ void MonitorThread(std::vector<ReaderThreadStatus>* thread_status,
class CTRReader : public framework::FileReader {
 public:
  CTRReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue,
            int thread_num, const DataDesc& data_desc)
      : data_desc_(data_desc) {
    PADDLE_ENFORCE_GT(thread_num, 0, "thread num should be larger than 0!");
    PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null");
    PADDLE_ENFORCE_GT(data_desc_.file_names_.size(), 0,
                      "file list should not be empty");
    thread_num_ = std::min<size_t>(data_desc_.file_names_.size(), thread_num);
    queue_ = queue;
    SplitFiles();
    for (size_t i = 0; i < thread_num_; ++i) {
...@@ -64,7 +118,7 @@ class CTRReader : public framework::FileReader {
    }
  }
  ~CTRReader() { Shutdown(); }
  void ReadNext(std::vector<framework::LoDTensor>* out) override {
    bool success;
...@@ -81,7 +135,10 @@ class CTRReader : public framework::FileReader {
    for (auto& read_thread : read_threads_) {
      read_thread->join();
    }
    if (monitor_thread_) {
      monitor_thread_->join();
    }
    read_threads_.clear();
    monitor_thread_.reset(nullptr);
...@@ -95,9 +152,9 @@ class CTRReader : public framework::FileReader {
    queue_->ReOpen();
    VLOG(3) << "reopen success";
    VLOG(3) << "thread_num " << thread_num_;
    for (int thread_id = 0; thread_id < thread_num_; thread_id++) {
      read_threads_.emplace_back(new std::thread(std::bind(
          &ReadThread, file_groups_[thread_id], data_desc_,
          static_cast<int>(thread_id), &read_thread_status_, queue_)));
    }
    monitor_thread_.reset(new std::thread(
...@@ -108,8 +165,8 @@ class CTRReader : public framework::FileReader {
 private:
  void SplitFiles() {
    file_groups_.resize(thread_num_);
    for (size_t i = 0; i < data_desc_.file_names_.size(); ++i) {
      auto& file_name = data_desc_.file_names_[i];
      std::ifstream f(file_name.c_str());
      PADDLE_ENFORCE(f.good(), "file %s does not exist!", file_name);
      file_groups_[i % thread_num_].push_back(file_name);
...@@ -118,9 +175,7 @@ class CTRReader : public framework::FileReader {
 private:
  size_t thread_num_;
  const DataDesc data_desc_;
  std::shared_ptr<LoDTensorBlockingQueue> queue_;
  std::vector<std::unique_ptr<std::thread>> read_threads_;
  std::unique_ptr<std::thread> monitor_thread_;
...
...@@ -36,6 +36,7 @@ using paddle::framework::LoD;
using paddle::framework::DDim;
using paddle::platform::CPUPlace;
using paddle::framework::make_ddim;
using paddle::operators::reader::DataDesc;
static void generatedata(const std::vector<std::string>& data,
                         const std::string& file_name) {
...@@ -126,30 +127,103 @@ TEST(CTR_READER, read_data) {
  LoDTensorBlockingQueueHolder queue_holder;
  int capacity = 64;
  queue_holder.InitOnce(capacity, false);
  std::shared_ptr<LoDTensorBlockingQueue> queue = queue_holder.GetQueue();
  int batch_size = 3;
  int thread_num = 1;
  std::vector<std::string> sparse_slots = {"6002", "6003"};
  std::vector<std::string> file_list;
  for (int i = 0; i < thread_num; ++i) {
    file_list.push_back(gz_file_name);
  }
  DataDesc data_desc(batch_size, file_list, "gzip", "svm", {}, {},
                     sparse_slots);
  CTRReader reader(queue, thread_num, data_desc);
  reader.Start();
  size_t batch_num =
      std::ceil(static_cast<float>(ctr_data.size()) / batch_size) * thread_num;
  check_all_data(ctr_data, sparse_slots, label_dims, label_value,
                 data_slot_6002, data_slot_6003, batch_num, batch_size, queue,
                 &reader);
  reader.Shutdown();
  reader.Start();
  check_all_data(ctr_data, sparse_slots, label_dims, label_value,
                 data_slot_6002, data_slot_6003, batch_num, batch_size, queue,
                 &reader);
  reader.Shutdown();
}
static void GenerateCsvData(const std::string& file_name,
const std::vector<std::string>& data) {
std::ofstream out(file_name.c_str());
PADDLE_ENFORCE(out.good(), "open file %s failed!", file_name);
for (auto& c : data) {
out << c;
}
out.close();
PADDLE_ENFORCE(out.good(), "save file %s failed!", file_name);
}
static void CheckReadCsvOut(const std::vector<LoDTensor>& out) {
ASSERT_EQ(out.size(), 3);
ASSERT_EQ(out[0].dims()[1], 1);
ASSERT_EQ(out[1].dims()[1], 2);
ASSERT_EQ(out[2].dims()[1], 1);
  for (int64_t i = 0; i < out[0].numel(); ++i) {
int64_t label = out[0].data<int64_t>()[i];
auto& dense_dim = out[1].dims();
    for (int64_t j = 0; j < dense_dim[1]; ++j) {
ASSERT_EQ(out[1].data<float>()[i * dense_dim[1] + j],
static_cast<float>(label + 0.1));
}
auto& sparse_lod = out[2].lod();
for (size_t j = sparse_lod[0][i]; j < sparse_lod[0][i + 1]; ++j) {
ASSERT_EQ(out[2].data<int64_t>()[j], label);
}
}
}
TEST(CTR_READER, read_csv_data) {
std::string file_name = "test_ctr_reader_data.csv";
const std::vector<std::string> csv_data = {
"0 0.1,0.1 0,0,0,0\n", "1 1.1,1.1 1,1,1,1\n", "2 2.1,2.1 2,2,2,2\n",
"3 3.1,3.1 3,3,3,3\n",
};
  GenerateCsvData(file_name, csv_data);
LoDTensorBlockingQueueHolder queue_holder;
int capacity = 64;
queue_holder.InitOnce(capacity, false);
std::shared_ptr<LoDTensorBlockingQueue> queue = queue_holder.GetQueue();
int batch_size = 3;
int thread_num = 1;
std::vector<std::string> file_list;
for (int i = 0; i < thread_num; ++i) {
file_list.push_back(file_name);
}
DataDesc data_desc(batch_size, file_list, "plain", "csv", {1}, {2}, {});
CTRReader reader(queue, thread_num, data_desc);
for (size_t i = 0; i < 2; ++i) {
reader.Start();
std::vector<LoDTensor> out;
while (true) {
reader.ReadNext(&out);
if (out.empty()) {
break;
}
CheckReadCsvOut(out);
}
reader.Shutdown();
}
}
...@@ -32,10 +32,8 @@ class LoDTensorBlockingQueue {
  friend class LoDTensorBlockingQueueHolder;
 private:
  explicit LoDTensorBlockingQueue(size_t capacity, bool speed_test_mode = false)
      : queue_(capacity, speed_test_mode) {}
 public:
  bool Push(const std::vector<framework::LoDTensor>& lod_tensor_vec) {
...@@ -65,17 +63,15 @@ class LoDTensorBlockingQueue {
 private:
  BlockingQueue<std::vector<framework::LoDTensor>> queue_;
};
class LoDTensorBlockingQueueHolder {
 public:
  void InitOnce(size_t capacity, bool speed_test_mode = false) {
    PADDLE_ENFORCE(
        queue_ == nullptr,
        "LoDTensorBlockingQueueHolder::InitOnce() can only be called once");
    queue_.reset(new LoDTensorBlockingQueue(capacity, speed_test_mode));
  }
  inline const std::shared_ptr<LoDTensorBlockingQueue>& GetQueue() const {
...
...@@ -27,13 +27,13 @@ class ReadInferShape : public framework::InferShapeBase {
                   "The ReadOp must take a reader as input.");
    PADDLE_ENFORCE(ctx->HasOutputs("Out"),
                   "The ReadOp should be assigned with output.");
    if (!ctx->IsRuntime() && ctx->Attrs().Get<bool>("infer_out")) {
      std::vector<framework::DDim> reader_dims = ctx->GetReaderDims("Reader");
      std::vector<std::string> out_names = ctx->Outputs("Out");
      PADDLE_ENFORCE_EQ(
          reader_dims.size(), out_names.size(),
          "The reader's dim number doesn't match the output number.");
      ctx->SetOutputsDim("Out", reader_dims);
      auto in_desc =
          boost::get<framework::VarDesc*>(ctx->GetInputVarPtrs("Reader")[0]);
      auto in_lod_levels = in_desc->GetLoDLevels();
...@@ -53,15 +53,18 @@ class ReadInferVarType : public framework::VarTypeInference {
 public:
  void operator()(const framework::OpDesc& op_desc,
                  framework::BlockDesc* block) const override {
    bool infer_out = boost::get<bool>(op_desc.GetAttr("infer_out"));
    if (infer_out) {
      std::string reader_name = op_desc.Input("Reader")[0];
      std::vector<std::string> out_names = op_desc.Output("Out");
      framework::VarDesc* reader = block->FindVarRecursive(reader_name);
      auto dtypes = reader->GetDataTypes();
      PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size());
      for (size_t i = 0; i < dtypes.size(); ++i) {
        framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]);
        out.SetType(framework::proto::VarType::LOD_TENSOR);
        out.SetDataType(dtypes[i]);
      }
    }
  }
};
...@@ -73,6 +76,7 @@ class ReadOp : public framework::OperatorBase {
 private:
  void RunImpl(const framework::Scope& scope,
               const platform::Place& dev_place) const override {
    VLOG(3) << "read op in";
    framework::ReaderHolder* reader =
        detail::Ref(scope.FindVar(Input("Reader")),
                    "Cannot find reader variable %s", Input("Reader"))
...@@ -87,7 +91,9 @@ class ReadOp : public framework::OperatorBase {
    reader->ReadNext(&ins);
    if (ins.empty()) {
      VLOG(3) << "read empty data in";
      if (Attr<bool>("throw_eof_exp")) {
        VLOG(3) << "throw_eof_exp";
        PADDLE_THROW_EOF();
      } else {
        ins.resize(out_arg_names.size());
...@@ -96,6 +102,7 @@ class ReadOp : public framework::OperatorBase {
          tensor.mutable_data<float>(framework::make_ddim({0}), dev_place);
        }
      }
      VLOG(3) << "read empty data out";
    }
    PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size());
    for (size_t i = 0; i < out_arg_names.size(); ++i) {
...@@ -120,6 +127,7 @@ class ReadOpMaker : public framework::OpProtoAndCheckerMaker {
                  " only when the data-balance is enabled in ParallelExecutor"
                  " and it is set by ParallelExecutor instance, not users.")
        .SetDefault(true);
    AddAttr<bool>("infer_out", "").SetDefault(true);
    AddComment(R"DOC(
      Read Operator
...
...@@ -65,6 +65,10 @@ void FileReaderMakerBase::Make() {
           "It means the reader will generate two data each time,"
           "whose shapes are [2,3,4] and [5,6] respectively.");
  AddAttr<std::vector<int>>("lod_levels", "The LoD levels of each data.");
  AddAttr<bool>(
      "use_data_config",
      "Use the config of all data, like shape_concat/ranks/lod_levels")
      .SetDefault(true);
  Apply();
}
...@@ -75,19 +79,23 @@ void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const {
  PADDLE_ENFORCE(ctx->HasOutput("Out"),
                 "The output file reader should not be null.");
  bool use_data_config = ctx->Attrs().Get<bool>("use_data_config");
  if (use_data_config) {
    const auto shape_concat =
        ctx->Attrs().Get<std::vector<int>>("shape_concat");
    const auto ranks = ctx->Attrs().Get<std::vector<int>>("ranks");
    std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
    ctx->SetReaderDims("Out", shapes);
    const auto lod_levels = ctx->Attrs().Get<std::vector<int>>("lod_levels");
    PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(),
                      "The number of 'lod_levels'(%d) doesn't match the number "
                      "of 'shapes'(%d).",
                      lod_levels.size(), shapes.size());
    framework::VarDesc* reader =
        boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
    reader->SetLoDLevels(lod_levels);
  }
}
void FileReaderInferVarType::operator()(const framework::OpDesc& op_desc,
...
/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/shuffle_channel_op.h"
namespace paddle {
namespace operators {
class ShuffleChannelOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of ShuffleChannelOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of ShuffleChannelOp should not be null.");
auto input_dims = ctx->GetInputDim("X");
    PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input must be NCHW.");
ctx->SetOutputDim("Out", input_dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
ctx.device_context());
}
};
class ShuffleChannelOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensor, default Tensor<float>), "
"the input feature data of ShuffleChannelOp, the layout is NCHW.");
AddOutput("Out",
"(Tensor, default Tensor<float>), the output of "
"ShuffleChannelOp. The layout is NCHW.");
AddAttr<int>("group", "the number of groups.")
.SetDefault(1)
.AddCustomChecker([](const int& group) {
PADDLE_ENFORCE_GE(group, 1, "group should be larger than 0.");
});
AddComment(R"DOC(
Shuffle Channel operator
This operator shuffles the channels of input x.
It divides the input channels in each group into several subgroups,
and obtains a new order by selecting elements from every subgroup one by one.
The shuffle channel operation makes it possible to build more powerful
structures with multiple group convolutional layers.
Please refer to the following paper for more information:
https://arxiv.org/pdf/1707.01083.pdf
)DOC");
}
};
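The DOC above describes the shuffle in words; a small worked example of the index mapping implemented by the kernels below, for group = 2 and 6 channels (illustrative only):
// group_row = 2, group_column = 6 / 2 = 3; input channel
// (i * group_column + j) moves to output channel (j * group_row + i):
//   input  channel order: 0 1 2 3 4 5
//   output channel order: 0 3 1 4 2 5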
class ShuffleChannelGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@Grad) should not be null");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
"Output(X@Grad) should not be null");
auto input_dims = ctx->GetInputDim("X");
    PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input must be NCHW.");
ctx->SetOutputDim(framework::GradVarName("X"), input_dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
ctx.device_context());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(shuffle_channel, ops::ShuffleChannelOp,
ops::ShuffleChannelOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(shuffle_channel_grad, ops::ShuffleChannelGradOp);
REGISTER_OP_CPU_KERNEL(
shuffle_channel,
ops::ShuffleChannelOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::ShuffleChannelOpKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
shuffle_channel_grad,
ops::ShuffleChannelGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::ShuffleChannelGradOpKernel<paddle::platform::CPUDeviceContext,
double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/shuffle_channel_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/gpu_info.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaximumNumBlocks = 4096;
static inline int NumBlocks(const int N) {
return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
kNumMaximumNumBlocks);
}
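NumBlocks is the usual ceiling division capped by the block limit; a quick check of the arithmetic (illustrative):
// Example: N = 1 << 20 elements with 512 threads per block gives
// (1048576 + 511) / 512 = 2048 blocks, below the 4096 cap, so 2048 is used.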
template <typename T>
__global__ void ShuffleChannel(const int nthreads, const int feature_map_size,
T* output, const T* input, int group_row,
int group_column, int len) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int offset = blockDim.x * gridDim.x;
  for (int ii = index; ii < nthreads; ii += offset) {
    const int n = ii / group_row / group_column / len;
    const int i = (ii / group_column / len) % group_row;
    const int j = ii / len % group_column;
    const int k = ii - (n * feature_map_size + (i * group_column + j) * len);
    T* p_o = output + n * feature_map_size + (j * group_row + i) * len;
    p_o[k] = input[ii];
}
}
template <typename DeviceContext, typename T>
class ShuffleChannelOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<framework::Tensor>("X");
auto* output = ctx.Output<framework::Tensor>("Out");
int group = ctx.Attr<int>("group");
auto input_dims = input->dims();
auto num = input_dims[0];
auto channel = input_dims[1];
auto height = input_dims[2];
auto weight = input_dims[3];
auto feature_map_size = channel * height * weight;
auto sp_sz = height * weight;
int group_row = group;
int group_column = channel / group_row;
    // count is the product of N*C*H*W, the same as numel()
int count = num * group_column * group_row * sp_sz;
int blocks = NumBlocks(output->numel());
int threads = kNumCUDAThreads;
const T* input_data = input->data<T>();
T* output_data = output->mutable_data<T>(ctx.GetPlace());
ShuffleChannel<
T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
count, feature_map_size, output_data, input_data, group_row,
group_column, sp_sz);
}
};
template <typename DeviceContext, typename T>
class ShuffleChannelGradOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<framework::Tensor>("X");
int group = ctx.Attr<int>("group");
auto input_dims = input->dims();
auto num = input_dims[0];
auto channel = input_dims[1];
auto height = input_dims[2];
auto weight = input_dims[3];
auto feature_map_size = channel * height * weight;
auto sp_sz = height * weight;
int group_row = group;
int group_column = channel / group_row;
auto* output_grad =
ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* input_grad =
ctx.Output<framework::Tensor>(framework::GradVarName("X"));
T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
const T* output_grad_data = output_grad->data<T>();
int blocks = NumBlocks(output_grad->numel());
int threads = kNumCUDAThreads;
int count = num * group_column * group_row * sp_sz;
ShuffleChannel<
T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
count, feature_map_size, input_grad_data, output_grad_data, group_row,
group_column, sp_sz);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
shuffle_channel,
ops::ShuffleChannelOpCUDAKernel<paddle::platform::CUDADeviceContext, float>,
ops::ShuffleChannelOpCUDAKernel<paddle::platform::CUDADeviceContext,
double>);
REGISTER_OP_CUDA_KERNEL(
shuffle_channel_grad,
ops::ShuffleChannelGradOpCUDAKernel<paddle::platform::CUDADeviceContext,
float>,
ops::ShuffleChannelGradOpCUDAKernel<paddle::platform::CUDADeviceContext,
double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ShuffleChannelOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<framework::Tensor>("X");
auto* output = ctx.Output<framework::Tensor>("Out");
int group = ctx.Attr<int>("group");
auto input_dims = input->dims();
auto num = input_dims[0];
auto channel = input_dims[1];
auto height = input_dims[2];
auto weight = input_dims[3];
auto feature_map_size = channel * height * weight;
auto sp_sz = height * weight;
int group_row = group;
int group_column = channel / group_row;
const T* input_data = input->data<T>();
T* output_data = output->mutable_data<T>(ctx.GetPlace());
for (int n = 0; n < num; ++n) {
for (int i = 0; i < group_row; ++i) {
for (int j = 0; j < group_column; ++j) {
const T* p_i = input_data + n * feature_map_size +
(i * group_column + j) * sp_sz;
T* p_o =
output_data + n * feature_map_size + (j * group_row + i) * sp_sz;
          memcpy(p_o, p_i, sizeof(T) * sp_sz);
}
}
}
}
};
template <typename DeviceContext, typename T>
class ShuffleChannelGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<framework::Tensor>("X");
int group = ctx.Attr<int>("group");
auto input_dims = input->dims();
auto num = input_dims[0];
auto channel = input_dims[1];
auto height = input_dims[2];
auto weight = input_dims[3];
auto feature_map_size = channel * height * weight;
auto sp_sz = height * weight;
int group_row = group;
int group_column = channel / group_row;
auto* output_grad =
ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* input_grad =
ctx.Output<framework::Tensor>(framework::GradVarName("X"));
T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
const T* output_grad_data = output_grad->data<T>();
for (int n = 0; n < num; ++n) {
for (int i = 0; i < group_row; ++i) {
for (int j = 0; j < group_column; ++j) {
const T* p_i = output_grad_data + n * feature_map_size +
(i * group_column + j) * sp_sz;
T* p_o = input_grad_data + n * feature_map_size +
(j * group_row + i) * sp_sz;
          memcpy(p_o, p_i, sizeof(T) * sp_sz);
}
}
}
}
};
} // namespace operators
} // namespace paddle
...@@ -29,8 +29,14 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -29,8 +29,14 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Xs", "A list of inputs.").AsDuplicable(); AddInput("Xs", "A list of inputs.").AsDuplicable();
AddOutput("Ys", "A list of outputs").AsDuplicable(); AddOutput("Ys", "A list of outputs").AsDuplicable();
AddAttr<std::string>("subgraph", "the subgraph."); AddAttr<std::string>("subgraph", "the subgraph.");
AddAttr<std::string>("calibration_data", "the calibration data for int8");
AddAttr<std::string>(
"engine_key",
"The engine_key here is used to distinguish different TRT Engines");
AddAttr<int>("max_batch_size", "the maximum batch size."); AddAttr<int>("max_batch_size", "the maximum batch size.");
AddAttr<int>("workspace_size", "the workspace size."); AddAttr<int>("workspace_size", "the workspace size.");
AddAttr<framework::BlockDesc *>("sub_block", "the trt block");
AddAttr<bool>("enable_int8", "whether swith to int8 mode");
AddComment("TensorRT engine operator."); AddComment("TensorRT engine operator.");
} }
}; };
...@@ -47,6 +53,6 @@ class TensorRTEngineInferVarType : public framework::VarTypeInference { ...@@ -47,6 +53,6 @@ class TensorRTEngineInferVarType : public framework::VarTypeInference {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp, REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp,
ops::TensorRTEngineOpMaker); ops::TensorRTEngineOpMaker, ops::TensorRTEngineOpMaker);
#endif // PADDLE_WITH_CUDA #endif // PADDLE_WITH_CUDA
...@@ -17,8 +17,10 @@ ...@@ -17,8 +17,10 @@
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <string> #include <string>
#include <unordered_map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/helper.h"
...@@ -62,6 +64,9 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) { ...@@ -62,6 +64,9 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
using inference::Singleton; using inference::Singleton;
using inference::tensorrt::TensorRTEngine; using inference::tensorrt::TensorRTEngine;
using inference::tensorrt::TRTInt8Calibrator;
using inference::tensorrt::TRTCalibratorEngine;
using inference::tensorrt::TRTCalibratorEngineManager;
class TensorRTEngineOp : public framework::OperatorBase { class TensorRTEngineOp : public framework::OperatorBase {
private: private:
...@@ -70,6 +75,11 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -70,6 +75,11 @@ class TensorRTEngineOp : public framework::OperatorBase {
mutable std::unique_ptr<TensorRTEngine> trt_engine_; mutable std::unique_ptr<TensorRTEngine> trt_engine_;
int max_batch_size_; int max_batch_size_;
int workspace_size_; int workspace_size_;
std::unique_ptr<TRTInt8Calibrator> calibrator_;
bool enable_int8_;
std::string calibration_data_;
std::string engine_key_;
bool calibration_mode_;
public: public:
TensorRTEngineOp(const std::string &type, TensorRTEngineOp(const std::string &type,
...@@ -80,19 +90,96 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -80,19 +90,96 @@ class TensorRTEngineOp : public framework::OperatorBase {
input_names_ = Inputs("Xs"); input_names_ = Inputs("Xs");
max_batch_size_ = Attr<int>("max_batch_size"); max_batch_size_ = Attr<int>("max_batch_size");
workspace_size_ = Attr<int>("workspace_size"); workspace_size_ = Attr<int>("workspace_size");
enable_int8_ = Attr<bool>("enable_int8");
calibration_data_ = Attr<std::string>("calibration_data");
engine_key_ = Attr<std::string>("engine_key");
auto params = Attr<std::vector<std::string>>("parameters"); auto params = Attr<std::vector<std::string>>("parameters");
for (const auto &param : params) { for (const auto &param : params) {
param_names_.insert(param); param_names_.insert(param);
} }
// calibration_mode being true means we need to
// generate the calibration table data.
calibration_mode_ = (enable_int8_ && calibration_data_.size() == 0);
VLOG(4) << "calibration_mode: " << calibration_mode_;
if (enable_int8_ && calibration_data_.size()) {
calibrator_.reset(new TRTInt8Calibrator(calibration_data_));
}
} }
protected: protected:
void RunNativeImpl(const framework::Scope &scope,
const platform::Place &dev_place) const {
framework::Executor executor(dev_place);
auto *block = Attr<framework::BlockDesc *>("sub_block");
auto *program = block->Program();
auto &current_scope = scope.NewScope();
auto ctx = executor.Prepare(*program, block->ID());
executor.RunPreparedContext(ctx.get(), &current_scope, false, true, true);
}
void RunImpl(const framework::Scope &scope, void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override { const platform::Place &dev_place) const override {
if (calibration_mode_) {
RunCalibration(scope, dev_place);
return;
}
RunTrt(scope, dev_place); RunTrt(scope, dev_place);
} }
void RunCalibration(const framework::Scope &scope,
const platform::Place &dev_place) const {
// This process builds a 32-bit trt engine, runs it on the calibration
// set, and records a histogram of the distribution of activation values
// for each tensor.
LOG_FIRST_N(INFO, 1) << "The TRT engine: " << engine_key_
<< " is running TRT int8 calibration... ";
int runtime_batch = 1;
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place);
auto stream =
reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream();
if (!Singleton<TRTCalibratorEngineManager>::Global().Has(engine_key_)) {
TRTCalibratorEngine *calib_res =
Singleton<TRTCalibratorEngineManager>::Global().Create(engine_key_);
std::unordered_map<std::string, size_t> calib_buffers;
for (auto &x : input_names_) {
if (param_names_.count(x)) continue;
auto &t =
inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
calib_buffers[x] = t.memory_size();
auto t_shape = framework::vectorize(t.dims());
runtime_batch = t_shape[0];
}
calib_res->calib_.reset(new TRTInt8Calibrator(
calib_buffers, runtime_batch, engine_key_, dev_place));
calib_res->thr_.reset(new std::thread([&]() {
calib_res->engine_.reset(new TensorRTEngine(
max_batch_size_, workspace_size_, stream,
boost::get<platform::CUDAPlace>(dev_place).device, enable_int8_,
calib_res->calib_.get()));
VLOG(3) << "start the calib trt engine thread";
Prepare(scope, dev_place, calib_res->engine_.get());
}));
}
TRTInt8Calibrator *temp_calibrator =
Singleton<TRTCalibratorEngineManager>::Global()
.Get(engine_key_)
->calib_.get();
std::unordered_map<std::string, void *> calib_data;
for (auto &x : Inputs("Xs")) {
if (param_names_.count(x)) continue;
auto &t =
inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
calib_data.emplace(x, t.data<void>());
}
temp_calibrator->setBatch(calib_data);
RunNativeImpl(scope, dev_place);
}
void RunTrt(const framework::Scope &scope, void RunTrt(const framework::Scope &scope,
const platform::Place &dev_place) const { const platform::Place &dev_place) const {
int runtime_batch = 1; int runtime_batch = 1;
...@@ -101,9 +188,10 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -101,9 +188,10 @@ class TensorRTEngineOp : public framework::OperatorBase {
auto stream = auto stream =
reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream(); reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream();
if (trt_engine_.get() == nullptr) { if (trt_engine_.get() == nullptr) {
trt_engine_.reset(new TensorRTEngine( trt_engine_.reset(
max_batch_size_, workspace_size_, stream, new TensorRTEngine(max_batch_size_, workspace_size_, stream,
boost::get<platform::CUDAPlace>(dev_place).device)); boost::get<platform::CUDAPlace>(dev_place).device,
enable_int8_, calibrator_.get()));
Prepare(scope, dev_place, trt_engine_.get()); Prepare(scope, dev_place, trt_engine_.get());
} }
...@@ -173,7 +261,8 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -173,7 +261,8 @@ class TensorRTEngineOp : public framework::OperatorBase {
void Prepare(const framework::Scope &scope, const platform::Place &dev_place, void Prepare(const framework::Scope &scope, const platform::Place &dev_place,
TensorRTEngine *engine) const { TensorRTEngine *engine) const {
VLOG(4) << "Prepare engine"; LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
"kernel etc). This process may cost a lot of time.";
framework::proto::BlockDesc block_desc; framework::proto::BlockDesc block_desc;
block_desc.ParseFromString(Attr<std::string>("subgraph")); block_desc.ParseFromString(Attr<std::string>("subgraph"));
......
...@@ -96,19 +96,20 @@ TEST(TensorRTEngineOp, manual) { ...@@ -96,19 +96,20 @@ TEST(TensorRTEngineOp, manual) {
engine_op_desc.SetType("tensorrt_engine"); engine_op_desc.SetType("tensorrt_engine");
engine_op_desc.SetInput("Xs", std::vector<std::string>({"x"})); engine_op_desc.SetInput("Xs", std::vector<std::string>({"x"}));
engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"})); engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
block_->SerializeAsString()); engine_op_desc.SetBlockAttr("sub_block", &block_desc);
SetAttr<int>(engine_op_desc.Proto(), "max_batch_size", 2); engine_op_desc.SetAttr("max_batch_size", static_cast<int>(2));
SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 1 << 20); engine_op_desc.SetAttr("workspace_size", static_cast<int>(1 << 20));
SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine"); engine_op_desc.SetAttr("parameters", std::vector<std::string>({}));
SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters", engine_op_desc.SetAttr("engine_key", std::string("a_engine"));
std::vector<std::string>({})); engine_op_desc.SetAttr("calibration_data", std::string(""));
SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), engine_op_desc.SetAttr("enable_int8", static_cast<bool>(false));
"output_name_mapping", engine_op_desc.SetAttr("output_name_mapping",
std::vector<std::string>({"z0"})); std::vector<std::string>({"z0"}));
engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
LOG(INFO) << "create engine op"; LOG(INFO) << "create engine op";
auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto()); auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
LOG(INFO) << "engine_op " << engine_op.get(); LOG(INFO) << "engine_op " << engine_op.get();
framework::Scope scope; framework::Scope scope;
...@@ -190,20 +191,19 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { ...@@ -190,20 +191,19 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
engine_op_desc.SetInput("Xs", std::vector<std::string>({"x0"})); engine_op_desc.SetInput("Xs", std::vector<std::string>({"x0"}));
engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z3"})); engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z3"}));
SetAttr<std::string>(engine_op_desc.Proto(), "subgraph", engine_op_desc.SetBlockAttr("sub_block", &block_desc);
block_->SerializeAsString()); engine_op_desc.SetAttr("max_batch_size", static_cast<int>(batch_size));
SetAttr<int>(engine_op_desc.Proto(), "max_batch_size", batch_size); engine_op_desc.SetAttr("workspace_size", static_cast<int>(1 << 20));
SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 1 << 20); engine_op_desc.SetAttr("parameters",
SetAttr<std::vector<std::string>>( std::vector<std::string>({"y0", "y1", "y2", "y3"}));
engine_op_desc.Proto(), "parameters", engine_op_desc.SetAttr("engine_key", std::string("b_engine"));
std::vector<std::string>({"y0", "y1", "y2", "y3"})); engine_op_desc.SetAttr("calibration_data", std::string(""));
SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "b_engine"); engine_op_desc.SetAttr("enable_int8", static_cast<bool>(false));
engine_op_desc.SetAttr("output_name_mapping",
SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), std::vector<std::string>({"z3"}));
"output_name_mapping", engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
std::vector<std::string>({"z3"}));
auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto());
// Execute them. // Execute them.
engine_op->Run(scope, place); engine_op->Run(scope, place);
......
...@@ -144,19 +144,17 @@ class CudnnCTCKernel : public framework::OpKernel<T> { ...@@ -144,19 +144,17 @@ class CudnnCTCKernel : public framework::OpKernel<T> {
CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, cu_ctcloss_desc, &workspace_size)); CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, cu_ctcloss_desc, &workspace_size));
T* loss_data = loss->mutable_data<T>(loss_dims, ctx.GetPlace()); T* loss_data = loss->mutable_data<T>(loss_dims, ctx.GetPlace());
math::SetConstant<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), loss, static_cast<T>(0)); auto workspace_handle = dev_ctx.cudnn_workspace_handle();
auto cudnn_func = [&](void* cudnn_workspace) {
auto temp_allocation = CUDNN_ENFORCE(platform::dynload::cudnnCTCLoss(
platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( handle, cu_logits_desc, warpctc_logits_data, warpctc_label_data,
workspace_size); warpctc_label_lengths.data(), warpctc_logits_lengths.data(),
void* cudnn_workspace = temp_allocation->ptr(); loss_data, cu_grad_desc, warpctc_grad_data,
CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, cu_ctcloss_desc, cudnn_workspace,
CUDNN_ENFORCE(platform::dynload::cudnnCTCLoss( workspace_size));
handle, cu_logits_desc, warpctc_logits_data, warpctc_label_data, };
warpctc_label_lengths.data(), warpctc_logits_lengths.data(), loss_data, workspace_handle.RunFunc(cudnn_func, workspace_size);
cu_grad_desc, warpctc_grad_data, CUDNN_CTC_LOSS_ALGO_DETERMINISTIC,
cu_ctcloss_desc, cudnn_workspace, workspace_size));
} }
}; };
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <cuda.h> #include <cuda.h>
// NOTE: supports float16-to-half conversion in this header file. // NOTE: supports float16-to-half conversion in this header file.
#define PADDLE_CUDA_FP16 #define PADDLE_CUDA_FP16
...@@ -30,6 +31,34 @@ namespace platform { ...@@ -30,6 +31,34 @@ namespace platform {
mask = __ballot_sync(FULL_WARP_MASK, (predicate)) mask = __ballot_sync(FULL_WARP_MASK, (predicate))
#endif #endif
inline static int RoundToPowerOfTwo(int dim) {
if (dim > 512) {
return 1024;
} else if (dim > 256) {
return 512;
} else if (dim > 128) {
return 256;
} else if (dim > 64) {
return 128;
} else if (dim > 32) {
return 64;
} else {
return 32;
}
}
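// The macros below expand to switch cases that bind the selected power-of-two
// dimension to the constexpr kPowerOfTwoDim, so a kernel templated on its
// block size can be dispatched at runtime. A typical (hypothetical) use:
//   switch (RoundToPowerOfTwo(dim)) {
//     CUDA_LAUNCH_KERNEL_HELPER(
//         MyKernel<T, kPowerOfTwoDim><<<grid, kPowerOfTwoDim, 0, stream>>>(
//             args...));
//   }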
#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \
case (dim): { \
constexpr auto kPowerOfTwoDim = (dim); \
__VA_ARGS__; \
} break
#define CUDA_LAUNCH_KERNEL_HELPER(...) \
CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \
CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \
CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \
CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__);
template <typename T> template <typename T>
__forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val, __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
int delta, int width = 32) { int delta, int width = 32) {
......
...@@ -30,8 +30,9 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) { ...@@ -30,8 +30,9 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
auto it = device_contexts_.find(place); auto it = device_contexts_.find(place);
if (it == device_contexts_.end()) { if (it == device_contexts_.end()) {
PADDLE_THROW( PADDLE_THROW(
"'Place' is not supported, Please re-compile with WITH_GPU " "Place %s is not supported, Please re-compile with WITH_GPU "
"option"); "option",
place);
} }
return it->second.get().get(); return it->second.get().get();
} }
......
...@@ -15,6 +15,8 @@ limitations under the License. */ ...@@ -15,6 +15,8 @@ limitations under the License. */
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#include <algorithm> #include <algorithm>
#include <cstdlib>
#include <string>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -58,7 +60,18 @@ DEFINE_string(selected_gpus, "", ...@@ -58,7 +60,18 @@ DEFINE_string(selected_gpus, "",
namespace paddle { namespace paddle {
namespace platform { namespace platform {
int GetCUDADeviceCount() { static int GetCUDADeviceCountImpl() {
const auto *cuda_visible_devices = std::getenv("CUDA_VISIBLE_DEVICES");
if (cuda_visible_devices != nullptr) {
std::string cuda_visible_devices_str(cuda_visible_devices);
if (std::all_of(cuda_visible_devices_str.begin(),
cuda_visible_devices_str.end(),
[](char ch) { return ch == ' '; })) {
VLOG(2) << "CUDA_VISIBLE_DEVICES is set to be empty. No GPU detected.";
return 0;
}
}
int count; int count;
PADDLE_ENFORCE( PADDLE_ENFORCE(
cudaGetDeviceCount(&count), cudaGetDeviceCount(&count),
...@@ -66,6 +79,11 @@ int GetCUDADeviceCount() { ...@@ -66,6 +79,11 @@ int GetCUDADeviceCount() {
return count; return count;
} }
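// NOTE: the device count is computed once and cached in a function-local
// static, so changes to CUDA_VISIBLE_DEVICES after the first call have no
// effect.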
int GetCUDADeviceCount() {
static auto dev_cnt = GetCUDADeviceCountImpl();
return dev_cnt;
}
int GetCUDAComputeCapability(int id) { int GetCUDAComputeCapability(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
cudaDeviceProp device_prop; cudaDeviceProp device_prop;
...@@ -203,13 +221,17 @@ size_t GpuMaxChunkSize() { ...@@ -203,13 +221,17 @@ size_t GpuMaxChunkSize() {
void GpuMemcpyAsync(void *dst, const void *src, size_t count, void GpuMemcpyAsync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind, cudaStream_t stream) { enum cudaMemcpyKind kind, cudaStream_t stream) {
PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream), PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream),
"cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync"); "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync "
"(%p -> %p, length: %d)",
src, dst, static_cast<int>(count));
} }
void GpuMemcpySync(void *dst, const void *src, size_t count, void GpuMemcpySync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind) { enum cudaMemcpyKind kind) {
PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind), PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind),
"cudaMemcpy failed in paddle::platform::GpuMemcpySync"); "cudaMemcpy failed in paddle::platform::GpuMemcpySync (%p -> "
"%p, length: %d)",
src, dst, static_cast<int>(count));
} }
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
......
...@@ -15,18 +15,38 @@ limitations under the License. */ ...@@ -15,18 +15,38 @@ limitations under the License. */
#include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/imperative/type_defs.h"
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
// Bind Methods // Bind Methods
void BindTracer(pybind11::module *m) { void BindTracer(pybind11::module* m) {
pybind11::class_<imperative::Tracer>(*m, "Tracer", "") pybind11::class_<imperative::Tracer>(*m, "Tracer", "")
.def("__init__", .def("__init__",
[](imperative::Tracer &self, framework::BlockDesc *root_block) { [](imperative::Tracer& self, framework::BlockDesc* root_block) {
new (&self) imperative::Tracer(root_block); new (&self) imperative::Tracer(root_block);
}) })
.def("trace", &imperative::Tracer::Trace) .def("trace",
[](imperative::Tracer& self, imperative::OpBase* op,
const imperative::VarBasePtrMap& inputs,
const imperative::VarBasePtrMap& outputs,
framework::BlockDesc* block,
const platform::CPUPlace expected_place,
const bool stop_gradient = false) {
self.Trace(op, inputs, outputs, block, expected_place,
stop_gradient);
})
.def("trace",
[](imperative::Tracer& self, imperative::OpBase* op,
const imperative::VarBasePtrMap& inputs,
const imperative::VarBasePtrMap& outputs,
framework::BlockDesc* block,
const platform::CUDAPlace expected_place,
const bool stop_gradient = false) {
self.Trace(op, inputs, outputs, block, expected_place,
stop_gradient);
})
.def("py_trace", &imperative::Tracer::PyTrace, .def("py_trace", &imperative::Tracer::PyTrace,
pybind11::return_value_policy::take_ownership); pybind11::return_value_policy::take_ownership);
} }
......
...@@ -180,8 +180,14 @@ void BindNativePredictor(py::module *m) { ...@@ -180,8 +180,14 @@ void BindNativePredictor(py::module *m) {
} }
void BindAnalysisConfig(py::module *m) { void BindAnalysisConfig(py::module *m) {
py::class_<AnalysisConfig>(*m, "AnalysisConfig") py::class_<AnalysisConfig> analysis_config(*m, "AnalysisConfig");
.def(py::init<const AnalysisConfig &>())
py::enum_<AnalysisConfig::Precision>(analysis_config, "Precision")
.value("Float32", AnalysisConfig::Precision::kFloat32)
.value("Int8", AnalysisConfig::Precision::kInt8)
.export_values();
analysis_config.def(py::init<const AnalysisConfig &>())
.def(py::init<const std::string &>()) .def(py::init<const std::string &>())
.def(py::init<const std::string &, const std::string &>()) .def(py::init<const std::string &, const std::string &>())
.def("set_model", (void (AnalysisConfig::*)(const std::string &)) & .def("set_model", (void (AnalysisConfig::*)(const std::string &)) &
...@@ -215,7 +221,8 @@ void BindAnalysisConfig(py::module *m) { ...@@ -215,7 +221,8 @@ void BindAnalysisConfig(py::module *m) {
.def("specify_input_name", &AnalysisConfig::specify_input_name) .def("specify_input_name", &AnalysisConfig::specify_input_name)
.def("enable_tensorrt_engine", &AnalysisConfig::EnableTensorRtEngine, .def("enable_tensorrt_engine", &AnalysisConfig::EnableTensorRtEngine,
py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1, py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1,
py::arg("min_subgraph_size") = 3) py::arg("min_subgraph_size") = 3,
py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32)
.def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled) .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled)
.def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug, .def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug,
py::arg("x") = true) py::arg("x") = true)
......
...@@ -138,6 +138,22 @@ PYBIND11_MODULE(core, m) { ...@@ -138,6 +138,22 @@ PYBIND11_MODULE(core, m) {
.def("_grad_ivar", .def("_grad_ivar",
[](const imperative::VarBase &self) { return self.grads_; }, [](const imperative::VarBase &self) { return self.grads_; },
py::return_value_policy::reference) py::return_value_policy::reference)
.def("_copy_to",
[](const imperative::VarBase &self, const platform::CPUPlace &place,
bool blocking) {
std::unique_ptr<imperative::VarBase> new_var =
self.NewVarBase(place, blocking);
return new_var.release();
},
py::return_value_policy::take_ownership)
.def("_copy_to",
[](const imperative::VarBase &self, const platform::CUDAPlace &place,
bool blocking) {
std::unique_ptr<imperative::VarBase> new_var =
self.NewVarBase(place, blocking);
return new_var.release();
},
py::return_value_policy::take_ownership)
.def("value", [](const imperative::VarBase &self) { return self.var_; }, .def("value", [](const imperative::VarBase &self) { return self.var_; },
py::return_value_policy::reference) py::return_value_policy::reference)
.def_property( .def_property(
...@@ -469,6 +485,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -469,6 +485,7 @@ All parameter, weight, gradient are variables in Paddle.
py::return_value_policy::reference); py::return_value_policy::reference);
py::class_<framework::ReaderHolder>(m, "Reader", "") py::class_<framework::ReaderHolder>(m, "Reader", "")
.def("start", &framework::ReaderHolder::Start)
.def("reset", &framework::ReaderHolder::ResetAll); .def("reset", &framework::ReaderHolder::ResetAll);
using LoDTensorBlockingQueue = using LoDTensorBlockingQueue =
...@@ -489,19 +506,12 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -489,19 +506,12 @@ All parameter, weight, gradient are variables in Paddle.
.def("is_closed", &LoDTensorBlockingQueue::IsClosed); .def("is_closed", &LoDTensorBlockingQueue::IsClosed);
m.def("init_lod_tensor_blocking_queue", m.def("init_lod_tensor_blocking_queue",
[](Variable &var, size_t capacity, [](Variable &var,
const std::vector<std::vector<int64_t>> &shapes) size_t capacity) -> std::shared_ptr<LoDTensorBlockingQueue> {
-> std::shared_ptr<LoDTensorBlockingQueue> { auto *holder = var.GetMutable<LoDTensorBlockingQueueHolder>();
std::vector<DDim> dims(shapes.size()); holder->InitOnce(capacity, FLAGS_reader_queue_speed_test_mode);
std::transform(shapes.begin(), shapes.end(), dims.begin(), return holder->GetQueue();
[](const std::vector<int64_t> &shape) { },
return make_ddim(shape);
});
auto *holder = var.GetMutable<LoDTensorBlockingQueueHolder>();
holder->InitOnce(capacity, dims,
FLAGS_reader_queue_speed_test_mode);
return holder->GetQueue();
},
py::return_value_policy::copy); py::return_value_policy::copy);
py::class_<Scope>(m, "_Scope", R"DOC( py::class_<Scope>(m, "_Scope", R"DOC(
...@@ -626,7 +636,18 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -626,7 +636,18 @@ All parameter, weight, gradient are variables in Paddle.
py::class_<platform::Communicator>(m, "Communicator").def(py::init<>()); py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
#endif #endif
py::class_<platform::CUDAPlace>(m, "CUDAPlace") py::class_<platform::CUDAPlace>(m, "CUDAPlace")
.def(py::init<int>()) .def("__init__",
[](platform::CUDAPlace &self, int dev_id) {
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE(
dev_id >= 0 && dev_id < platform::GetCUDADeviceCount(),
"Invalid CUDAPlace(%d), must inside [0, %d)", dev_id,
platform::GetCUDADeviceCount());
new (&self) platform::CUDAPlace(dev_id);
#else
PADDLE_THROW("Cannot use CUDAPlace in CPU only version");
#endif
})
.def("__str__", string::to_string<const platform::CUDAPlace &>); .def("__str__", string::to_string<const platform::CUDAPlace &>);
py::class_<paddle::platform::CPUPlace>(m, "CPUPlace") py::class_<paddle::platform::CPUPlace>(m, "CPUPlace")
...@@ -634,7 +655,12 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -634,7 +655,12 @@ All parameter, weight, gradient are variables in Paddle.
.def("__str__", string::to_string<const platform::CPUPlace &>); .def("__str__", string::to_string<const platform::CPUPlace &>);
py::class_<paddle::platform::CUDAPinnedPlace>(m, "CUDAPinnedPlace") py::class_<paddle::platform::CUDAPinnedPlace>(m, "CUDAPinnedPlace")
.def(py::init<>()) .def("__init__",
[](platform::CUDAPinnedPlace &) {
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW("Cannot use CUDAPinnedPlace in CPU only version");
#endif
})
.def("__str__", string::to_string<const platform::CUDAPinnedPlace &>); .def("__str__", string::to_string<const platform::CUDAPinnedPlace &>);
py::class_<platform::Place>(m, "Place") py::class_<platform::Place>(m, "Place")
...@@ -1005,7 +1031,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1005,7 +1031,7 @@ All parameter, weight, gradient are variables in Paddle.
PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized."); PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized.");
self.remove_unnecessary_lock_ = b; self.remove_unnecessary_lock_ = b;
}, },
R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. Default False.)DOC") R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. Default True.)DOC")
.def_property( .def_property(
"num_trainers", "num_trainers",
[](const BuildStrategy &self) { return self.num_trainers_; }, [](const BuildStrategy &self) { return self.num_trainers_; },
......
...@@ -173,7 +173,6 @@ function cmake_gen() { ...@@ -173,7 +173,6 @@ function cmake_gen() {
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
${PYTHON_FLAGS} ${PYTHON_FLAGS}
-DWITH_DSO=ON -DWITH_DSO=ON
-DWITH_DOC=${WITH_DOC:-OFF}
-DWITH_GPU=${WITH_GPU:-OFF} -DWITH_GPU=${WITH_GPU:-OFF}
-DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF}
-DWITH_DISTRIBUTE=${distibuted_flag} -DWITH_DISTRIBUTE=${distibuted_flag}
...@@ -208,7 +207,6 @@ EOF ...@@ -208,7 +207,6 @@ EOF
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \ -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \
${PYTHON_FLAGS} \ ${PYTHON_FLAGS} \
-DWITH_DSO=ON \ -DWITH_DSO=ON \
-DWITH_DOC=${WITH_DOC:-OFF} \
-DWITH_GPU=${WITH_GPU:-OFF} \ -DWITH_GPU=${WITH_GPU:-OFF} \
-DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \ -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \
-DWITH_DISTRIBUTE=${distibuted_flag} \ -DWITH_DISTRIBUTE=${distibuted_flag} \
...@@ -328,7 +326,8 @@ function run_brpc_test() { ...@@ -328,7 +326,8 @@ function run_brpc_test() {
======================================== ========================================
EOF EOF
set +x set +x
declare -a other_tests=("test_listen_and_serv_op" "system_allocator_test") declare -a other_tests=("test_listen_and_serv_op" "system_allocator_test" \
"rpc_server_test" "varhandle_test" "collective_server_test" "brpc_serde_test")
all_tests=`ctest -N` all_tests=`ctest -N`
for t in "${other_tests[@]}" for t in "${other_tests[@]}"
...@@ -527,31 +526,6 @@ function bind_test() { ...@@ -527,31 +526,6 @@ function bind_test() {
wait wait
} }
function gen_docs() {
mkdir -p ${PADDLE_ROOT}/build
cd ${PADDLE_ROOT}/build
cat <<EOF
========================================
Building documentation ...
In /paddle/build
========================================
EOF
cmake .. \
-DCMAKE_BUILD_TYPE=Release \
-DWITH_DOC=ON \
-DWITH_GPU=OFF \
-DWITH_MKL=OFF
make -j `nproc` paddle_docs paddle_apis
# check websites for broken links
linkchecker doc/v2/en/html/index.html
linkchecker doc/v2/cn/html/index.html
linkchecker doc/v2/api/en/html/index.html
}
function gen_doc_lib() { function gen_doc_lib() {
mkdir -p ${PADDLE_ROOT}/build mkdir -p ${PADDLE_ROOT}/build
cd ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build
...@@ -563,7 +537,6 @@ function gen_doc_lib() { ...@@ -563,7 +537,6 @@ function gen_doc_lib() {
EOF EOF
cmake .. \ cmake .. \
-DCMAKE_BUILD_TYPE=Release \ -DCMAKE_BUILD_TYPE=Release \
-DWITH_DOC=ON \
-DWITH_GPU=OFF \ -DWITH_GPU=OFF \
-DWITH_MKL=OFF \ -DWITH_MKL=OFF \
-DWITH_FLUID_ONLY=ON -DWITH_FLUID_ONLY=ON
...@@ -802,9 +775,6 @@ function main() { ...@@ -802,9 +775,6 @@ function main() {
bind_test) bind_test)
bind_test bind_test
;; ;;
doc)
gen_docs
;;
gen_doc_lib) gen_doc_lib)
gen_doc_lib $2 gen_doc_lib $2
;; ;;
......
...@@ -22,6 +22,8 @@ from . import op_frequence ...@@ -22,6 +22,8 @@ from . import op_frequence
from .op_frequence import * from .op_frequence import *
from . import quantize from . import quantize
from .quantize import * from .quantize import *
from . import reader
from .reader import *
from . import slim from . import slim
from .slim import * from .slim import *
from . import utils from . import utils
...@@ -32,5 +34,6 @@ __all__ += decoder.__all__ ...@@ -32,5 +34,6 @@ __all__ += decoder.__all__
__all__ += memory_usage_calc.__all__ __all__ += memory_usage_calc.__all__
__all__ += op_frequence.__all__ __all__ += op_frequence.__all__
__all__ += quantize.__all__ __all__ += quantize.__all__
__all__ += reader.__all__
__all__ += slim.__all__ __all__ += slim.__all__
__all__ += utils.__all__ __all__ += utils.__all__
...@@ -32,10 +32,13 @@ class Calibrator(object): ...@@ -32,10 +32,13 @@ class Calibrator(object):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
self.program = kwargs['program'] self.program = kwargs['program']
self.iterations = kwargs['iterations']
self.pretrained_model = kwargs['pretrained_model'] self.pretrained_model = kwargs['pretrained_model']
self.debug = kwargs['debug'] self.debug = kwargs['debug'] if 'debug' in kwargs else False
self.algo = kwargs['algo'] self.algo = kwargs['algo']
self.output = kwargs['output']
self.feed_var_names = kwargs['feed_var_names']
self.fetch_list = kwargs['fetch_list']
self.exe = kwargs['exe']
self._conv_input_var_name = [] self._conv_input_var_name = []
self._conv_output_var_name = [] self._conv_output_var_name = []
...@@ -54,17 +57,38 @@ class Calibrator(object): ...@@ -54,17 +57,38 @@ class Calibrator(object):
self._u8_output_var = [] self._u8_output_var = []
self._s8_output_var = [] self._s8_output_var = []
self._persistable_vars = [] self._persistable_vars = []
self._sampling_data = {}
def generate_sampling_program(self):
self.__init_analysis() self.__init_analysis()
self.__generate_output_program() self.__generate_output_program()
def generate_quantized_data(self, sampling_data): def save_int8_model(self):
self.__sampling(sampling_data) self.__sampling(self._sampling_data)
self.__save_scale() self.__save_scale()
self.__update_program() self.__update_program()
self.__update_output_program_attr() self.__update_output_program_attr()
self.__display_debug() self.__display_debug()
self.__save_offline_model()
def sample_data(self):
'''
Sample the tensor data of the variables.
'''
for i in self.sampling_program.list_vars():
if i.name in self.sampling_vars:
np_data = np.array(fluid.global_scope().find_var(i.name)
.get_tensor())
if i.name not in self._sampling_data:
self._sampling_data[i.name] = []
self._sampling_data[i.name].append(np_data)
def __save_offline_model(self):
'''
Save the quantized model to the disk.
'''
fluid.io.save_inference_model(self.output, self.feed_var_names,
self.fetch_list, self.exe,
self.sampling_program)
def __display_debug(self): def __display_debug(self):
if self.debug: if self.debug:
......
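Taken together, the Calibrator now owns the sampling buffer and the save step. A minimal sketch of the reworked calibration flow (the executor, reader, and path variables are hypothetical placeholders; the method names are those in the diff above):

calibrator = int8_utility.Calibrator(
    program=infer_program, pretrained_model=model_path, algo='direct',
    exe=exe, output=int8_model_dir, feed_var_names=feed_names,
    fetch_list=fetch_targets)
calibrator.generate_sampling_program()
for data in val_reader():
    exe.run(calibrator.sampling_program,
            feed={feed_names[0]: data},
            fetch_list=fetch_targets)
    calibrator.sample_data()      # buffer activations for this batch
calibrator.save_int8_model()      # compute scales and save the INT8 model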
## CTR READER
A multi-threaded C++ reader that has the same interface as py_reader. It
uses C++ threads to read files and is much faster than the Python read
thread in py_reader.
Currently, it supports two types of file:
- gzip
- plain text file
and two types of data format:
- the csv data format is:
* label dense_fea,dense_fea sparse_fea,sparse_fea
- the svm data format is:
* label slot1:fea_sign slot2:fea_sign slot1:fea_sign
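For illustration (hypothetical feature values), a csv line would look like `1 0.5,0.3 1001,1002` and an svm line like `0 1:8362 2:93 1:71`.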
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from . import ctr_reader
__all__ = ctr_reader.__all__
...@@ -20,6 +20,8 @@ from paddle.fluid.framework import default_main_program, \ ...@@ -20,6 +20,8 @@ from paddle.fluid.framework import default_main_program, \
default_startup_program, Variable default_startup_program, Variable
from paddle.fluid.unique_name import generate as unique_name from paddle.fluid.unique_name import generate as unique_name
__all__ = ['ctr_reader']
def monkey_patch_reader_methods(reader): def monkey_patch_reader_methods(reader):
def __get_reader__(): def __get_reader__():
...@@ -30,7 +32,11 @@ def monkey_patch_reader_methods(reader): ...@@ -30,7 +32,11 @@ def monkey_patch_reader_methods(reader):
def reset(): def reset():
return __get_reader__().reset() return __get_reader__().reset()
def start():
return __get_reader__().start()
reader.reset = reset reader.reset = reset
reader.start = start
reader.stop_gradient = True reader.stop_gradient = True
reader.persistable = True reader.persistable = True
return reader return reader
...@@ -44,13 +50,18 @@ def _copy_reader_var_(block, var): ...@@ -44,13 +50,18 @@ def _copy_reader_var_(block, var):
return new_var return new_var
def ctr_reader(feed_data, def ctr_reader(
capacity, feed_dict,
thread_num, file_type, # gzip or plain
batch_size, file_format, # csv or svm
file_list, dense_slot_index,
slots, sparse_slot_index,
name=None): capacity,
thread_num,
batch_size,
file_list,
slots,
name=None):
""" """
Create a CTR reader for data feeding in Python Create a CTR reader for data feeding in Python
...@@ -67,12 +78,21 @@ def ctr_reader(feed_data, ...@@ -67,12 +78,21 @@ def ctr_reader(feed_data,
Note that :code:`Program.clone()` method cannot clone :code:`py_reader`. Note that :code:`Program.clone()` method cannot clone :code:`py_reader`.
Args: Args:
feed_dict(list(variable)): a list of data variable.
file_type('gzip'|'plain'): the type of the data file
file_format('csv'|'svm'): csv data or svm data format.
csv data format is:
label dense_fea,dense_fea sparse_fea,sparse_fea
the svm data format is:
label slot1:fea_sign slot2:fea_sign slot1:fea_sign
dense_slot_index(list(int)): the index of dense slots
sparse_slot_index(list(int)): the index of sparse slots
capacity(int): The buffer capacity maintained by :code:`py_reader`. capacity(int): The buffer capacity maintained by :code:`py_reader`.
thread_num(list|tuple): List of tuples which declaring data shapes. thread_num(int): the number of threads used to read the files.
batch_size(list|tuple): List of strs which declaring data type. batch_size(int): the batch size of the data.
file_list(list|tuple): List of ints which declaring data lod_level. file_list(list(str)): list of the file names to read.
slots(bool): Whether use double buffer or not. slots(list(int64)): list of slot ids.
name(basestring): The prefix Python queue name and Reader name. None will name(string): The prefix Python queue name and Reader name. None will
be generated automatically. be generated automatically.
Returns: Returns:
...@@ -80,7 +100,15 @@ def ctr_reader(feed_data, ...@@ -80,7 +100,15 @@ def ctr_reader(feed_data,
Examples: Examples:
1. The basic usage of :code:`py_reader` is as follows: 1. The basic usage of :code:`ctr_reader` is as follows:
.. code-block:: python
py_reader = fluid.contrib.ctr_reader.ctr_reader(
feed_dict=datas, file_type='plain', file_format='csv',
file_list=file_list, dense_slot_index=[1, 2, 3, 4], sparse_slot_index=[],
capacity=64, thread_num=20, batch_size=1000, slots=[], name='ctr_reader')
""" """
if name is None: if name is None:
queue_name = unique_name('lod_tensor_blocking_queue') queue_name = unique_name('lod_tensor_blocking_queue')
...@@ -90,7 +118,7 @@ def ctr_reader(feed_data, ...@@ -90,7 +118,7 @@ def ctr_reader(feed_data,
reader_name = "_".join([name, "reader"]) reader_name = "_".join([name, "reader"])
var = global_scope().var(queue_name) var = global_scope().var(queue_name)
feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes) feed_queue = core.init_lod_tensor_blocking_queue(var, capacity)
startup_blk = default_startup_program().current_block() startup_blk = default_startup_program().current_block()
reader_var = startup_blk.create_var(name=reader_name) reader_var = startup_blk.create_var(name=reader_name)
...@@ -99,12 +127,22 @@ def ctr_reader(feed_data, ...@@ -99,12 +127,22 @@ def ctr_reader(feed_data,
inputs={'blocking_queue': [queue_name]}, inputs={'blocking_queue': [queue_name]},
outputs={'Out': [reader_var]}, outputs={'Out': [reader_var]},
attrs={ attrs={
'use_data_config': False,
'thread_num': thread_num, 'thread_num': thread_num,
'batch_size': batch_size, 'batch_size': batch_size,
'file_list': file_list, 'file_list': file_list,
'slots': slots, 'file_type': file_type,
'file_format': file_format,
'dense_slot_index': dense_slot_index,
'sparse_slot_index': sparse_slot_index,
'sparse_slots': slots,
'ranks': [],
'lod_levels': [],
'shape_concat': []
}) })
dtypes = [data.dtype for data in feed_dict]
reader_var.desc.set_dtypes(dtypes)
reader_var.persistable = True reader_var.persistable = True
main_prog_reader_var = _copy_reader_var_( main_prog_reader_var = _copy_reader_var_(
...@@ -118,6 +156,9 @@ def ctr_reader(feed_data, ...@@ -118,6 +156,9 @@ def ctr_reader(feed_data,
main_blk = default_main_program().current_block() main_blk = default_main_program().current_block()
main_blk.append_op( main_blk.append_op(
type='read', inputs={'Reader': [reader]}, outputs={'Out': feed_data}) type='read',
inputs={'Reader': [reader]},
attrs={'infer_out': False},
outputs={'Out': feed_dict})
return reader return reader
...@@ -23,10 +23,11 @@ import argparse ...@@ -23,10 +23,11 @@ import argparse
import functools import functools
import contextlib import contextlib
import paddle.fluid.profiler as profiler import paddle.fluid.profiler as profiler
from paddle.dataset.common import download
from PIL import Image, ImageEnhance from PIL import Image, ImageEnhance
import math import math
sys.path.append('..') sys.path.append('..')
import int8_inference.utility as ut import int8_inference.utility as int8_utility
random.seed(0) random.seed(0)
np.random.seed(0) np.random.seed(0)
...@@ -116,27 +117,43 @@ def val(data_dir=DATA_DIR): ...@@ -116,27 +117,43 @@ def val(data_dir=DATA_DIR):
return _reader_creator(file_list, 'val', shuffle=False, data_dir=data_dir) return _reader_creator(file_list, 'val', shuffle=False, data_dir=data_dir)
class TestCalibration(unittest.TestCase): class TestCalibrationForResnet50(unittest.TestCase):
def setUp(self): def setUp(self):
# TODO(guomingz): Put the download process in the cmake. self.int8_download = 'int8/download'
# Download and unzip test data set self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' +
imagenet_dl_url = 'http://paddle-inference-dist.bj.bcebos.com/int8/calibration_test_data.tar.gz' self.int8_download)
zip_file_name = imagenet_dl_url.split('/')[-1]
cmd = 'rm -rf data {} && mkdir data && wget {} && tar xvf {} -C data'.format( data_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/calibration_test_data.tar.gz'
zip_file_name, imagenet_dl_url, zip_file_name) data_md5 = '1b6c1c434172cca1bf9ba1e4d7a3157d'
os.system(cmd) self.data_cache_folder = self.download_data(data_url, data_md5, "data")
# resnet50 fp32 data
resnet50_fp32_model_url = 'http://paddle-inference-dist.bj.bcebos.com/int8/resnet50_int8_model.tar.gz' # reader/decorator.py requires the relative path to the data folder
resnet50_zip_name = resnet50_fp32_model_url.split('/')[-1] cmd = 'rm -rf {0} && ln -s {1} {0}'.format("data",
resnet50_unzip_folder_name = 'resnet50_fp32' self.data_cache_folder)
cmd = 'rm -rf {} {} && mkdir {} && wget {} && tar xvf {} -C {}'.format(
resnet50_unzip_folder_name, resnet50_zip_name,
resnet50_unzip_folder_name, resnet50_fp32_model_url,
resnet50_zip_name, resnet50_unzip_folder_name)
os.system(cmd) os.system(cmd)
self.iterations = 100 self.iterations = 50
self.skip_batch_num = 5
def cache_unzipping(self, target_folder, zip_path):
if not os.path.exists(target_folder):
cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(target_folder,
zip_path)
os.system(cmd)
def download_data(self, data_url, data_md5, folder_name):
download(data_url, self.int8_download, data_md5)
data_cache_folder = os.path.join(self.cache_folder, folder_name)
file_name = data_url.split('/')[-1]
zip_path = os.path.join(self.cache_folder, file_name)
self.cache_unzipping(data_cache_folder, zip_path)
return data_cache_folder
def download_resnet50_model(self):
# resnet50 fp32 data
data_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/resnet50_int8_model.tar.gz'
data_md5 = '4a5194524823d9b76da6e738e1367881'
self.model_cache_folder = self.download_data(data_url, data_md5,
"resnet50_fp32")
def run_program(self, model_path, generate_int8=False, algo='direct'): def run_program(self, model_path, generate_int8=False, algo='direct'):
image_shape = [3, 224, 224] image_shape = [3, 224, 224]
...@@ -163,16 +180,15 @@ class TestCalibration(unittest.TestCase): ...@@ -163,16 +180,15 @@ class TestCalibration(unittest.TestCase):
print("Start calibration ...") print("Start calibration ...")
calibrator = ut.Calibrator( calibrator = int8_utility.Calibrator(
program=infer_program, program=infer_program,
pretrained_model=model_path, pretrained_model=model_path,
iterations=100, algo=algo,
debug=False, exe=exe,
algo=algo) output=int8_model,
feed_var_names=feed_dict,
sampling_data = {} fetch_list=fetch_targets)
calibrator.generate_sampling_program()
test_info = [] test_info = []
cnt = 0 cnt = 0
for batch_id, data in enumerate(val_reader()): for batch_id, data in enumerate(val_reader()):
...@@ -192,13 +208,7 @@ class TestCalibration(unittest.TestCase): ...@@ -192,13 +208,7 @@ class TestCalibration(unittest.TestCase):
feed_dict[1]: label}, feed_dict[1]: label},
fetch_list=fetch_targets) fetch_list=fetch_targets)
if generate_int8: if generate_int8:
for i in calibrator.sampling_program.list_vars(): calibrator.sample_data()
if i.name in calibrator.sampling_vars:
np_data = np.array(fluid.global_scope().find_var(i.name)
.get_tensor())
if i.name not in sampling_data:
sampling_data[i.name] = []
sampling_data[i.name].append(np_data)
test_info.append(np.mean(acc1) * len(data)) test_info.append(np.mean(acc1) * len(data))
cnt += len(data) cnt += len(data)
...@@ -209,18 +219,35 @@ class TestCalibration(unittest.TestCase): ...@@ -209,18 +219,35 @@ class TestCalibration(unittest.TestCase):
break break
if generate_int8: if generate_int8:
calibrator.generate_quantized_data(sampling_data) calibrator.save_int8_model()
fluid.io.save_inference_model(int8_model, feed_dict, fetch_targets,
exe, calibrator.sampling_program)
print( print(
"Calibration is done and the corresponding files were generated at {}". "Calibration is done and the corresponding files are generated at {}".
format(os.path.abspath("calibration_out"))) format(os.path.abspath("calibration_out")))
else: else:
return np.sum(test_info) / cnt return np.sum(test_info) / cnt
def test_calibration_for_resnet50(self): def test_calibration(self):
fp32_acc1 = self.run_program("resnet50_fp32/model") self.download_resnet50_model()
self.run_program("resnet50_fp32/model", True) fp32_acc1 = self.run_program(self.model_cache_folder + "/model")
self.run_program(self.model_cache_folder + "/model", True)
int8_acc1 = self.run_program("calibration_out")
delta_value = np.abs(fp32_acc1 - int8_acc1)
self.assertLess(delta_value, 0.01)
class TestCalibrationForMobilenetv1(TestCalibrationForResnet50):
def download_mobilenetv1_model(self):
# mobilenetv1 fp32 data
data_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/mobilenetv1_int8_model.tar.gz'
data_md5 = '13892b0716d26443a8cdea15b3c6438b'
self.model_cache_folder = self.download_data(data_url, data_md5,
"mobilenetv1_fp32")
def test_calibration(self):
self.download_mobilenetv1_model()
fp32_acc1 = self.run_program(self.model_cache_folder + "/model")
self.run_program(self.model_cache_folder + "/model", True, algo='KL')
int8_acc1 = self.run_program("calibration_out") int8_acc1 = self.run_program("calibration_out")
delta_value = np.abs(fp32_acc1 - int8_acc1) delta_value = np.abs(fp32_acc1 - int8_acc1)
self.assertLess(delta_value, 0.01) self.assertLess(delta_value, 0.01)
......
...@@ -70,6 +70,7 @@ ZERO_VAR_SUFFIX = core.kZeroVarSuffix() ...@@ -70,6 +70,7 @@ ZERO_VAR_SUFFIX = core.kZeroVarSuffix()
CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName() CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName()
_imperative_tracer_ = None _imperative_tracer_ = None
_imperative_current_expected_place_ = None
def _in_imperative_mode(): def _in_imperative_mode():
...@@ -80,6 +81,10 @@ def _imperative_tracer(): ...@@ -80,6 +81,10 @@ def _imperative_tracer():
return _imperative_tracer_ return _imperative_tracer_
def _current_expected_place():
return _imperative_current_expected_place_
class NameScope(object): class NameScope(object):
def __init__(self, name="", parent=None): def __init__(self, name="", parent=None):
self._children = dict() self._children = dict()
...@@ -383,8 +388,8 @@ class Variable(object): ...@@ -383,8 +388,8 @@ class Variable(object):
self._ivar.stop_gradient = stop_gradient self._ivar.stop_gradient = stop_gradient
def _numpy(self): def _numpy(self):
tensor = self._ivar.value().get_tensor() new_ivar = self._ivar._copy_to(core.CPUPlace(), True)
return np.array(tensor) return np.array(new_ivar.value().get_tensor())
def _backward(self): def _backward(self):
self._ivar._run_backward() self._ivar._run_backward()
...@@ -1311,6 +1316,7 @@ class Block(object): ...@@ -1311,6 +1316,7 @@ class Block(object):
def _trace_op(self, op, stop_gradient=False): def _trace_op(self, op, stop_gradient=False):
if _in_imperative_mode(): if _in_imperative_mode():
_imperative_tracer().trace(op.iop, op.inputs, op.outputs, self.desc, _imperative_tracer().trace(op.iop, op.inputs, op.outputs, self.desc,
_imperative_current_expected_place_,
stop_gradient) stop_gradient)
def _insert_op(self, index, *args, **kwargs): def _insert_op(self, index, *args, **kwargs):
...@@ -2502,5 +2508,18 @@ def _imperative_guard(tracer): ...@@ -2502,5 +2508,18 @@ def _imperative_guard(tracer):
global _imperative_tracer_ global _imperative_tracer_
tmp_trace = _imperative_tracer_ tmp_trace = _imperative_tracer_
_imperative_tracer_ = tracer _imperative_tracer_ = tracer
yield yield
_imperative_tracer_ = tmp_trace _imperative_tracer_ = tmp_trace
@contextlib.contextmanager
def _imperative_place_guard(place):
global _imperative_current_expected_place_
tmp_place = _imperative_current_expected_place_
_imperative_current_expected_place_ = place
yield
_imperative_current_expected_place_ = tmp_place
...@@ -25,18 +25,28 @@ def enabled(): ...@@ -25,18 +25,28 @@ def enabled():
@contextlib.contextmanager @contextlib.contextmanager
def guard(): def guard(place=None):
train = framework.Program() train = framework.Program()
startup = framework.Program() startup = framework.Program()
tracer = core.Tracer(train.current_block().desc) tracer = core.Tracer(train.current_block().desc)
if place is None:
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
else:
place = core.CPUPlace()
with framework.program_guard(train, startup): with framework.program_guard(train, startup):
with framework.unique_name.guard(): with framework.unique_name.guard():
with framework._imperative_guard(tracer): with framework._imperative_guard(tracer):
yield with framework._imperative_place_guard(place):
yield
def to_variable(value, block=None): def to_variable(value, block=None):
if isinstance(value, np.ndarray): if isinstance(value, np.ndarray):
assert enabled(), "to_variable could only be called in imperative mode"
if not block: if not block:
block = framework.default_main_program().current_block() block = framework.default_main_program().current_block()
py_var = framework.Variable( py_var = framework.Variable(
...@@ -47,9 +57,7 @@ def to_variable(value, block=None): ...@@ -47,9 +57,7 @@ def to_variable(value, block=None):
dtype=value.dtype) dtype=value.dtype)
var = py_var._ivar.value() var = py_var._ivar.value()
tensor = var.get_tensor() tensor = var.get_tensor()
tensor.set(value, core.CPUPlace()) tensor.set(value, framework._current_expected_place())
return py_var return py_var
elif isinstance(value, framework.Variable): elif isinstance(value, framework.Variable):
return value return value
else:
raise ValueError("Unsupported type %s" % type(value))
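A minimal usage sketch of the place-aware guard (the model code is hypothetical; with no explicit place, guard() now picks CUDAPlace(0) on a CUDA build and CPUPlace() otherwise, and to_variable() copies the data to that expected place):

import numpy as np
import paddle.fluid as fluid

with fluid.imperative.guard():
    x = fluid.imperative.to_variable(np.ones([2, 2], dtype='float32'))
    print(x._numpy())   # _numpy() now copies back to CPU via _copy_to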
...@@ -27,6 +27,7 @@ __all__ = [ ...@@ -27,6 +27,7 @@ __all__ = [
'Conv2D', 'Conv2D',
'Pool2D', 'Pool2D',
'FC', 'FC',
'BatchNorm',
] ]
...@@ -55,7 +56,8 @@ class Conv2D(layers.Layer): ...@@ -55,7 +56,8 @@ class Conv2D(layers.Layer):
param_attr=param_attr, param_attr=param_attr,
bias_attr=bias_attr, bias_attr=bias_attr,
dtype=dtype, dtype=dtype,
name=name) name=name,
act=act)
self._groups = groups self._groups = groups
self._stride = utils.convert_to_list(stride, 2, 'stride') self._stride = utils.convert_to_list(stride, 2, 'stride')
...@@ -141,6 +143,7 @@ class Conv2D(layers.Layer): ...@@ -141,6 +143,7 @@ class Conv2D(layers.Layer):
outputs={'Out': [pre_act]}, outputs={'Out': [pre_act]},
attrs={'axis': 1}) attrs={'axis': 1})
# Currently, we don't support inplace in imperative mode
return self._helper.append_activation(pre_act) return self._helper.append_activation(pre_act)
...@@ -216,6 +219,7 @@ class FC(layers.Layer): ...@@ -216,6 +219,7 @@ class FC(layers.Layer):
act=None, act=None,
name=None): name=None):
super(FC, self).__init__() super(FC, self).__init__()
self._size = size self._size = size
self._num_flatten_dims = num_flatten_dims self._num_flatten_dims = num_flatten_dims
self._dtype = dtype self._dtype = dtype
...@@ -241,6 +245,16 @@ class FC(layers.Layer): ...@@ -241,6 +245,16 @@ class FC(layers.Layer):
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False)
if self._helper.bias_attr:
size = list([self._size])
self._b = self._helper.create_parameter(
attr=self._helper.bias_attr,
shape=size,
dtype=self._dtype,
is_bias=True)
else:
self._b = None
def forward(self, input): def forward(self, input):
tmp = self._helper.create_variable_for_type_inference(self._dtype) tmp = self._helper.create_variable_for_type_inference(self._dtype)
self._helper.append_op( self._helper.append_op(
...@@ -253,28 +267,155 @@ class FC(layers.Layer): ...@@ -253,28 +267,155 @@ class FC(layers.Layer):
"y_num_col_dims": 1 "y_num_col_dims": 1
}) })
out = self._helper.create_variable_for_type_inference(self._dtype) pre_bias = self._helper.create_variable_for_type_inference(self._dtype)
self._helper.append_op( self._helper.append_op(
type="sum", type="sum",
inputs={"X": [tmp]}, inputs={"X": [tmp]},
outputs={"Out": out}, outputs={"Out": pre_bias},
attrs={"use_mkldnn": False}) attrs={"use_mkldnn": False})
bias_attr = self._helper.bias_attr if self._b:
if bias_attr: pre_activation = self._helper.create_variable_for_type_inference(
# add bias dtype=self._dtype)
size = list(out.shape[1:])
if not self._built:
self._b = self._helper.create_parameter(
attr=bias_attr, shape=size, dtype=out.dtype, is_bias=True)
bias_out = self._helper.create_variable_for_type_inference(
dtype=out.dtype)
self._helper.append_op( self._helper.append_op(
type='elementwise_add', type='elementwise_add',
inputs={'X': [out], inputs={'X': [pre_bias],
'Y': [self._b]}, 'Y': [self._b]},
outputs={'Out': [bias_out]}, outputs={'Out': [pre_activation]},
attrs={'axis': 1}) attrs={'axis': self._num_flatten_dims})
out = bias_out else:
# add activation pre_activation = pre_bias
return self._helper.append_activation(out) # Currently, we don't support inplace in imperative mode
return self._helper.append_activation(pre_activation)
class BatchNorm(layers.Layer):
def __init__(self,
num_channels,
act=None,
is_test=False,
momentum=0.9,
epsilon=1e-05,
param_attr=None,
bias_attr=None,
dtype=core.VarDesc.VarType.FP32,
data_layout='NCHW',
in_place=False,
name=None,
moving_mean_name=None,
moving_variance_name=None,
do_model_average_for_mean_and_var=False,
fuse_with_relu=False,
use_global_stats=False):
super(BatchNorm, self).__init__()
assert bias_attr is not False, "bias_attr should not be False in batch_norm."
from ..layer_helper import LayerHelper
self._helper = LayerHelper(
'batch_norm',
param_attr=param_attr,
bias_attr=bias_attr,
name=name,
act=act)
if dtype == core.VarDesc.VarType.FP16:
self._dtype = core.VarDesc.VarType.FP32
else:
self._dtype = dtype
param_shape = [num_channels]
# create parameter
self._scale = self._helper.create_parameter(
attr=self._helper.param_attr,
shape=param_shape,
dtype=self._dtype,
default_initializer=Constant(1.0))
# TODO(minqiyang): change stop_gradient sign to trainable to align with static graph
# # setting stop_gradient=True to reduce computation
# if use_global_stats and self._helper.param_attr.learning_rate == 0.:
# self._scale.stop_gradient = True
self._bias = self._helper.create_parameter(
attr=self._helper.bias_attr,
shape=param_shape,
dtype=self._dtype,
is_bias=True)
# TODO(minqiyang): change stop_gradient sign to trainable to align with static graph
# # setting stop_gradient=True to reduce computation
# if use_global_stats and self._helper.bias_attr.learning_rate == 0.:
# self._bias.stop_gradient = True
self._mean = self._helper.create_parameter(
attr=ParamAttr(
name=moving_mean_name,
initializer=Constant(0.0),
trainable=False,
do_model_average=do_model_average_for_mean_and_var),
shape=param_shape,
dtype=self._dtype)
self._mean.stop_gradient = True
self._variance = self._helper.create_parameter(
attr=ParamAttr(
name=moving_variance_name,
initializer=Constant(1.0),
trainable=False,
do_model_average=do_model_average_for_mean_and_var),
shape=param_shape,
dtype=self._dtype)
self._variance.stop_gradient = True
self._in_place = in_place
self._momentum = momentum
self._epsilon = epsilon
self._is_test = is_test
self._fuse_with_relu = fuse_with_relu
self._use_global_stats = use_global_stats
def _build_once(self, input):
pass
def forward(self, input):
# create output
# mean and mean_out share the same memory
mean_out = self._mean
# variance and variance out share the same memory
variance_out = self._variance
saved_mean = self._helper.create_variable_for_type_inference(
dtype=self._dtype, stop_gradient=True)
saved_variance = self._helper.create_variable_for_type_inference(
dtype=self._dtype, stop_gradient=True)
batch_norm_out = input if self._in_place else self._helper.create_variable_for_type_inference(
self._dtype)
self._helper.append_op(
type="batch_norm",
inputs={
"X": input,
"Scale": self._scale,
"Bias": self._bias,
"Mean": self._mean,
"Variance": self._variance
},
outputs={
"Y": batch_norm_out,
"MeanOut": mean_out,
"VarianceOut": variance_out,
"SavedMean": saved_mean,
"SavedVariance": saved_variance
},
attrs={
"momentum": self._momentum,
"epsilon": self._epsilon,
"is_test": self._is_test,
"use_mkldnn": False,
"fuse_with_relu": self._fuse_with_relu,
"use_global_stats": self._use_global_stats
})
# Currently, we don't support inplace in imperative mode
return self._helper.append_activation(batch_norm_out)
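A minimal usage sketch of the BatchNorm layer defined above, assuming this branch's imperative API; the input shape is illustrative:

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.imperative.nn import BatchNorm
from paddle.fluid.imperative.base import to_variable

with fluid.imperative.guard():
    bn = BatchNorm(num_channels=4, act='relu')
    x = to_variable(np.random.rand(2, 4, 8, 8).astype('float32'))
    y = bn(x)  # normalizes over N, H, W separately for each of the 4 channels
    print(y._numpy().shape)  # expected (2, 4, 8, 8)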
@@ -435,7 +435,10 @@ class LayerHelper(object):
         act_type = act.pop('type')
         tmp = input_var
         # NOTE(dzhwinter): some activations support inplace computation.
-        if not core.IsInplace(act_type):
+        # NOTE(minqiyang): currently, we don't support inplace in imperative mode
+        if not imperative_base.enabled() and core.IsInplace(act_type):
+            tmp = input_var
+        else:
             tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
         self.append_op(
             type=act_type,
...
@@ -523,7 +523,7 @@ def _py_reader(capacity,
     double_buffer_name = "_".join([name, "double_buffer"])
     var = global_scope().var(queue_name)
-    feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes)
+    feed_queue = core.init_lod_tensor_blocking_queue(var, capacity)
     startup_blk = default_startup_program().current_block()
     startup_var = startup_blk.create_var(name=reader_name)
...
@@ -321,7 +321,7 @@ def append_LARS(params_grads, learning_rate, weight_decay):
         The decayed learning rate
     Examples:
         .. code-block:: python
             learning_rate *= local_gw_ratio * sqrt(sumsq(param))
                 / (sqrt(sumsq(gradient)) + weight_decay * sqrt(sumsq(param)))
     """
...
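The LARS ratio quoted in the docstring above is easy to check numerically; a NumPy sketch, where the local_gw_ratio and weight_decay defaults are illustrative assumptions:

import numpy as np

def lars_local_lr(learning_rate, param, gradient,
                  local_gw_ratio=0.001, weight_decay=0.0005):
    # learning_rate *= local_gw_ratio * ||param|| /
    #                  (||gradient|| + weight_decay * ||param||)
    param_norm = np.sqrt(np.sum(np.square(param)))
    grad_norm = np.sqrt(np.sum(np.square(gradient)))
    return learning_rate * local_gw_ratio * param_norm / (
        grad_norm + weight_decay * param_norm)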
@@ -179,6 +179,7 @@ __all__ = [
     'merge_selected_rows',
     'get_tensor_from_selected_rows',
     'lstm',
+    'shuffle_channel',
     'py_func',
     'psroi_pool',
     'teacher_student_sigmoid_loss',
@@ -2874,7 +2875,7 @@ def batch_norm(input,
         attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
     # setting stop_gradient=True to reduce computation
     if use_global_stats and helper.bias_attr.learning_rate == 0.:
-        scale.stop_gradient = True
+        bias.stop_gradient = True
     mean = helper.create_parameter(
         attr=ParamAttr(
@@ -3875,6 +3876,7 @@ def beam_search(pre_ids,
                 beam_size,
                 end_id,
                 level=0,
+                is_accumulated=True,
                 name=None):
     """
     Beam search is a classical algorithm for selecting candidate words in a
@@ -3887,14 +3889,17 @@ def beam_search(pre_ids,
     selects the top-K candidate word ids of current step from :attr:`ids`
     according to their :attr:`scores` for all source sentences, where K is
     :attr:`beam_size` and :attr:`ids, scores` are predicted results from the
-    computation cell. Additionally, :attr:`pre_ids` and :attr:`pre_scores` are
-    the output of beam_search at previous step, they are needed for special use
-    to handle ended candidate translations.
-    Note that the :attr:`scores` passed in should be accumulated scores, and
-    length penalty should be done with extra operators before calculating the
-    accumulated scores if needed, also suggest finding top-K before it and
-    using the top-K candidates following.
+    computation cell. If :attr:`ids` is not set, it will be derived from
+    :attr:`scores`. Additionally, :attr:`pre_ids` and :attr:`pre_scores` are
+    the output of beam_search at the previous step; they are needed to handle
+    ended candidate translations.
+    Note that if :attr:`is_accumulated` is True, the :attr:`scores` passed in
+    should be accumulated scores. Otherwise, :attr:`scores` are treated as
+    straightforward scores, and will be transformed to log space and
+    accumulated onto :attr:`pre_scores` inside this operator. Length penalty
+    should be applied with extra operators before calculating the accumulated
+    scores if needed.
     Please see the following demo for a full beam search usage example:
@@ -3924,6 +3929,8 @@ def beam_search(pre_ids,
             describes how these candidates belong to the prefix. The paths
             linking prefixes and selected candidates are organized and reserved
             in lod.
+        is_accumulated(bool, default True): Whether the input :attr:`scores`
+            are accumulated scores.
         name(str|None): A name for this layer(optional). If set None, the layer
             will be named automatically.
@@ -3952,8 +3959,12 @@ def beam_search(pre_ids,
            end_id=end_id)
     """
     helper = LayerHelper('beam_search', **locals())
-    score_type = scores.dtype
-    id_type = ids.dtype
+    score_type = pre_scores.dtype
+    id_type = pre_ids.dtype
+    inputs = {"pre_ids": pre_ids, "pre_scores": pre_scores, "scores": scores}
+    if ids is not None:
+        inputs["ids"] = ids
     selected_scores = helper.create_variable_for_type_inference(
         dtype=score_type)
@@ -3961,12 +3972,7 @@ def beam_search(pre_ids,
     helper.append_op(
         type='beam_search',
-        inputs={
-            'pre_ids': pre_ids,
-            'pre_scores': pre_scores,
-            'ids': ids,
-            'scores': scores,
-        },
+        inputs=inputs,
         outputs={
             'selected_ids': selected_ids,
             'selected_scores': selected_scores,
@@ -3976,6 +3982,7 @@ def beam_search(pre_ids,
             'level': level,
             'beam_size': beam_size,
             'end_id': end_id,
+            'is_accumulated': is_accumulated,
         })
     return selected_ids, selected_scores
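With the hunk above applied, a single decoding step might be wired up as below; pre_ids, pre_scores and scores are assumed to be LoD tensors produced by earlier steps, and the hyper-parameters are illustrative:

import paddle.fluid as fluid

selected_ids, selected_scores = fluid.layers.beam_search(
    pre_ids=pre_ids,        # ids selected at the previous step
    pre_scores=pre_scores,  # accumulated scores at the previous step
    ids=None,               # now optional; derived from scores when omitted
    scores=scores,          # raw per-candidate scores of the current step
    beam_size=4,
    end_id=1,
    is_accumulated=False)   # op log-transforms scores and adds pre_scores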
@@ -5146,9 +5153,9 @@ def nce(input,
         littles = []
         for i in range(custom_dist_len):
             normal_prob = custom_dist[i] * custom_dist_len
-            if normal_prob - 1.0 > 1e-4:
+            if normal_prob - 1.0 > 0:
                 bigs.append((i, normal_prob))
-            elif 1.0 - normal_prob > 1e-4:
+            elif 1.0 - normal_prob > 0:
                 littles.append((i, normal_prob))
             else:
                 alias_probs_[i] = normal_prob
@@ -5164,9 +5171,9 @@ def nce(input,
             alias_probs_[little[0]] = little[1]
             alias_[little[0]] = big_idx
             big_left = big[1] + little[1] - 1
-            if big_left - 1.0 > 1e-4:
+            if big_left - 1.0 > 0:
                 bigs.append((big_idx, big_left))
-            elif 1.0 - big_left > 1e-4:
+            elif 1.0 - big_left > 0:
                 littles.append((big_idx, big_left))
             else:
                 alias_probs_[big_idx] = big_left
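The thresholds tightened above belong to what is essentially Vose's alias method, which nce uses to sample negatives from a custom distribution in O(1) per draw. An illustrative standalone construction (not Paddle's internal code):

import random

def build_alias_table(probs):
    # Scale so the average bucket mass is 1, then pair under-full buckets
    # ("small") with over-full ones ("large").
    n = len(probs)
    scaled = [p * n for p in probs]
    accept = [0.0] * n
    alias = [0] * n
    small = [i for i, p in enumerate(scaled) if p < 1.0]
    large = [i for i, p in enumerate(scaled) if p >= 1.0]
    while small and large:
        s, l = small.pop(), large.pop()
        accept[s] = scaled[s]
        alias[s] = l
        scaled[l] += scaled[s] - 1.0
        (small if scaled[l] < 1.0 else large).append(l)
    for i in small + large:
        accept[i] = 1.0
    return accept, alias

def alias_sample(accept, alias):
    # Pick a bucket uniformly, then keep it or jump to its alias.
    i = random.randrange(len(accept))
    return i if random.random() < accept[i] else alias[i]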
@@ -5856,7 +5863,8 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1):
         type='increment',
         inputs={'X': [counter]},
         outputs={'Out': [counter]},
-        attrs={'step': float(step)})
+        attrs={'step': float(step)},
+        stop_gradient=True)
     counter.stop_gradient = True
     return counter
@@ -9475,7 +9483,7 @@ def teacher_student_sigmoid_loss(input,
                                  by the previous operator.
         label (Variable|list): the ground truth which is a 2-D tensor with
                                shape [N x 1], where N is the batch size.
         soft_max_up_bound (float): if input > soft_max_up_bound, will be bound
         soft_max_lower_bound (float): if input < soft_max_lower_bound, will be bound
     Returns:
@@ -9639,6 +9647,79 @@ def get_tensor_from_selected_rows(x, name=None):
     return out
def shuffle_channel(x, group, name=None):
"""
**Shuffle Channel Operator**
This operator shuffles the channels of input x.
    It divides the input channels into :attr:`group` subgroups,
    and obtains a new channel order by picking one element from each
    subgroup in turn.
Please refer to the paper
https://arxiv.org/pdf/1707.01083.pdf
.. code-block:: text
Given a 4-D tensor input with the shape (N, C, H, W):
input.shape = (1, 4, 2, 2)
input.data =[[[[0.1, 0.2],
[0.2, 0.3]],
[[0.3, 0.4],
[0.4, 0.5]],
[[0.5, 0.6],
[0.6, 0.7]],
[[0.7, 0.8],
[0.8, 0.9]]]]
Given group: 2
        then we get a 4-D tensor out with the same shape as the input:
out.shape = (1, 4, 2, 2)
out.data = [[[[0.1, 0.2],
[0.2, 0.3]],
[[0.5, 0.6],
[0.6, 0.7]],
[[0.3, 0.4],
[0.4, 0.5]],
[[0.7, 0.8],
[0.8, 0.9]]]]
Args:
x(Variable): The input tensor variable. It should be a 4-D tensor with shape [N, C, H, W]
        group(int): The number of subgroups; it must evenly divide the number of channels.
Returns:
        out(Variable): The channel-shuffled tensor, with the same shape and
            type as the input.
Raises:
        TypeError: If group is not an int.
Examples:
.. code-block:: python
input = fluid.layers.data(name='input', shape=[4,2,2], dtype='float32')
out = fluid.layers.shuffle_channel(x=input, group=2)
"""
helper = LayerHelper("shuffle_channel", **locals())
out = helper.create_variable_for_type_inference(dtype=x.dtype)
if not isinstance(group, int):
raise TypeError("group must be int type")
helper.append_op(
type="shuffle_channel",
inputs={"X": x},
outputs={"Out": out},
attrs={"group": group})
return out
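The same transform in NumPy, mirroring the reference computation used by the op test later in this diff:

import numpy as np

def shuffle_channel_ref(x, group):
    # x: (N, C, H, W); split C into `group` subgroups, then interleave them.
    n, c, h, w = x.shape
    assert c % group == 0, "group must evenly divide the channel count"
    return x.reshape(n, group, c // group, h, w) \
            .transpose(0, 2, 1, 3, 4) \
            .reshape(n, c, h, w)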
class PyFuncRegistry(object):
    _register_funcs = []
...
@@ -382,7 +382,8 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None):
             'dtype': out.dtype,
             'value': float(value),
             'force_cpu': force_cpu or force_init_on_cpu()
-        })
+        },
+        stop_gradient=True)
     out.stop_gradient = True
     return out
...
@@ -301,10 +301,10 @@ class Optimizer(object):
             no_grad_set (set|None): set of Variables should be ignored.
             callbacks (list|None): list of callables to run when appending backward
                 operator for one parameter.
         Return:
             list: list of (param, grad) pair, grad is the output of backward.
         Examples:
             See examples in `apply_gradients`.
         """
@@ -322,10 +322,10 @@ class Optimizer(object):
         Args:
             params_grads (list): list of (param, grad) pair to do optimization.
         Returns:
             list: A list of operators appended to the current program.
         Examples:
             .. code-block:: python
@@ -364,7 +364,7 @@ class Optimizer(object):
         This method combines interface `backward()` and
         `apply_gradients()` into one.
         Args:
             loss (Variable): loss variable to run optimizations.
             startup_program (Program): startup_program for initializing parameters
@@ -381,18 +381,21 @@ class Optimizer(object):
         optimize_ops = []
         if imperative_base.enabled():
             if parameter_list is not None:
-                params_grads = parameter_list
+                parameters = parameter_list
             else:
                 parameters = program.global_block().all_parameters()
-                params_grads = []
-                for param in parameters:
-                    # create gradient variable
-                    grad_var = Variable(
-                        block=loss.block,
-                        name=param._ivar._grad_name(),
-                        stop_gradient=True,
-                        ivar=param._ivar._grad_ivar())
-                    params_grads.append((param, grad_var))
+            params_grads = []
+            for param in parameters:
+                if param.stop_gradient:
+                    continue
+                # create gradient variable
+                grad_var = Variable(
+                    block=loss.block,
+                    name=param._ivar._grad_name(),
+                    stop_gradient=True,
+                    ivar=param._ivar._grad_ivar())
+                params_grads.append((param, grad_var))
             with program_guard(program, startup_program):
                 optimize_ops = self._create_optimization_pass(params_grads)
         else:
...
@@ -159,7 +159,7 @@ class ParallelExecutor(object):
             trainers_endpoints = main._trainers_endpoints
             if num_trainers > 1 and trainers_endpoints:
                 assert num_trainers == len(
-                    trainers_endpoints), "num_trainers == len(end_points)"
+                    trainers_endpoints), "num_trainers == len(endpoints)"
                 build_strategy.trainers_endpoints = trainers_endpoints
         # step6: get persistable_vars, places. persistable_vars
...
@@ -84,6 +84,7 @@ list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer)
 list(REMOVE_ITEM TEST_OPS test_image_classification_resnet)
 list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op)
 list(REMOVE_ITEM TEST_OPS test_nearest_interp_op)
+list(REMOVE_ITEM TEST_OPS test_imperative_resnet)
 foreach(TEST_OP ${TEST_OPS})
     py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
@@ -91,6 +92,8 @@ py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_
 py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL)
 py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op SERIAL)
 py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op SERIAL)
+py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS
+    FLAGS_cudnn_deterministic=1)
 if(WITH_DISTRIBUTE)
     py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
     set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
...
@@ -124,7 +124,7 @@ class TestDistRunnerBase(object):
         if args.batch_merge_repeat > 1:
             pass_builder = build_stra._finalize_strategy_and_create_passes()
             mypass = pass_builder.insert_pass(
-                len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass")
+                len(pass_builder.all_passes()) - 3, "multi_batch_merge_pass")
             mypass.set("num_repeats", args.batch_merge_repeat)
         if args.update_method == "nccl2":
...
@@ -16,12 +16,17 @@ import os
 import unittest
 os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"
+# FIXME(zjl): It seems that this unittest fails randomly
+# when comparing all reduce last loss and reduce last loss
+# e.g.: AssertionError: 1.0357145 != 1.0673475 within 0.01 delta
+# Disable it temporarily.
+'''
 from test_parallel_executor_mnist import TestMNIST
 class EagerDeletionTestMNIST(TestMNIST):
     pass
+'''
 if __name__ == '__main__':
     unittest.main()
@@ -67,6 +67,18 @@ class MLP(fluid.imperative.Layer):
 class TestImperative(unittest.TestCase):
+    def test_sum_op(self):
+        x = np.ones([2, 2], np.float32)
+        with fluid.imperative.guard():
+            inputs = []
+            for _ in range(10):
+                inputs.append(fluid.imperative.base.to_variable(x))
+            ret = fluid.layers.sums(inputs)
+            loss = fluid.layers.reduce_sum(ret)
+            loss._backward()
+            self.assertTrue(np.allclose(ret._numpy(), x * 10))
+            self.assertTrue(np.allclose(inputs[0]._gradient(), x))
     def test_layer(self):
         with fluid.imperative.guard():
             cl = core.Layer()
@@ -133,7 +145,8 @@ class TestImperative(unittest.TestCase):
         x = fluid.layers.reduce_sum(fluid.layers.tanh(x1))
         param_grads = fluid.backward.append_backward(
             x, parameter_list=[x1.name])[0]
-        exe = fluid.Executor(fluid.CPUPlace())
+        exe = fluid.Executor(fluid.CPUPlace(
+        ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
         static_out, static_grad = exe.run(
             feed={inp.name: np_inp},
@@ -160,7 +173,8 @@ class TestImperative(unittest.TestCase):
         x = l(inp)[0]
         param_grads = fluid.backward.append_backward(
             x, parameter_list=[l._x_for_debug.name])[0]
-        exe = fluid.Executor(fluid.CPUPlace())
+        exe = fluid.Executor(fluid.CPUPlace(
+        ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
         static_out, static_grad = exe.run(
             feed={inp.name: np_inp},
@@ -186,7 +200,8 @@ class TestImperative(unittest.TestCase):
         out = mlp(inp)
         param_grads = fluid.backward.append_backward(
             out, parameter_list=[mlp._fc1._w.name])[0]
-        exe = fluid.Executor(fluid.CPUPlace())
+        exe = fluid.Executor(fluid.CPUPlace(
+        ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
         exe.run(fluid.default_startup_program())
         static_out, static_grad = exe.run(
...
@@ -20,6 +20,7 @@ import sys
 import paddle
 import paddle.fluid as fluid
+import paddle.fluid.core as core
 from paddle.fluid.optimizer import SGDOptimizer
 from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC
 from test_imperative_base import new_program_scope
@@ -58,7 +59,7 @@ class Generator(fluid.imperative.Layer):
 class TestImperativeMnist(unittest.TestCase):
-    def test_mnist_cpu_float32(self):
+    def test_gan_float32(self):
         seed = 90
         startup = fluid.Program()
@@ -115,7 +116,8 @@ class TestImperativeMnist(unittest.TestCase):
         sgd = SGDOptimizer(learning_rate=1e-3)
         sgd.minimize(g_loss)
-        exe = fluid.Executor(fluid.CPUPlace())
+        exe = fluid.Executor(fluid.CPUPlace() if not core.is_compiled_with_cuda(
+        ) else fluid.CUDAPlace(0))
         static_params = dict()
         with fluid.scope_guard(scope):
             img = np.ones([2, 1], np.float32)
...
@@ -145,7 +145,8 @@ class TestImperativeMnist(unittest.TestCase):
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
-            exe = fluid.Executor(fluid.CPUPlace())
+            exe = fluid.Executor(fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
             mnist = MNIST()
             sgd = SGDOptimizer(learning_rate=1e-3)
...
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import unittest
import numpy as np
import six
import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.imperative.nn import Conv2D, Pool2D, BatchNorm, FC
from paddle.fluid.imperative.base import to_variable
from test_imperative_base import new_program_scope
batch_size = 8
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": batch_size,
"epochs": [30, 60, 90],
"steps": [0.1, 0.01, 0.001, 0.0001]
},
"batch_size": batch_size,
"lr": 0.1,
"total_images": 1281164,
}
def optimizer_setting(params):
ls = params["learning_strategy"]
if ls["name"] == "piecewise_decay":
if "total_images" not in params:
total_images = 1281167
else:
total_images = params["total_images"]
batch_size = ls["batch_size"]
step = int(total_images / batch_size + 1)
bd = [step * e for e in ls["epochs"]]
base_lr = params["lr"]
        lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.SGD(learning_rate=0.01)
# TODO(minqiyang): Add learning rate scheduler support to imperative mode
# optimizer = fluid.optimizer.Momentum(
# learning_rate=params["lr"],
# learning_rate=fluid.layers.piecewise_decay(
# boundaries=bd, values=lr),
# momentum=0.9,
# regularization=fluid.regularizer.L2Decay(1e-4))
return optimizer
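For reference, the boundaries that the (currently commented-out) piecewise schedule above would produce with the train_parameters defined earlier in this file:

# step = int(1281164 / 8 + 1) = 160146 batches per epoch
step = int(1281164 / 8 + 1)
bd = [step * e for e in [30, 60, 90]]                 # [4804380, 9608760, 14413140]
lr = [0.1 * (0.1 ** i) for i in range(len(bd) + 1)]   # ~[0.1, 0.01, 0.001, 0.0001]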
class ConvBNLayer(fluid.imperative.Layer):
def __init__(self,
num_channels,
num_filters,
filter_size,
stride=1,
groups=1,
act=None):
super(ConvBNLayer, self).__init__()
self._conv = Conv2D(
num_channels=num_channels,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
bias_attr=None)
self._batch_norm = BatchNorm(num_filters, act=act)
def forward(self, inputs):
y = self._conv(inputs)
y = self._batch_norm(y)
return y
class BottleneckBlock(fluid.imperative.Layer):
def __init__(self, num_channels, num_filters, stride, shortcut=True):
super(BottleneckBlock, self).__init__()
self.conv0 = ConvBNLayer(
num_channels=num_channels,
num_filters=num_filters,
filter_size=1,
act='relu')
self.conv1 = ConvBNLayer(
num_channels=num_filters,
num_filters=num_filters,
filter_size=3,
stride=stride,
act='relu')
self.conv2 = ConvBNLayer(
num_channels=num_filters,
num_filters=num_filters * 4,
filter_size=1,
act=None)
if not shortcut:
self.short = ConvBNLayer(
num_channels=num_channels,
num_filters=num_filters * 4,
filter_size=1,
stride=stride)
self.shortcut = shortcut
self._num_channels_out = num_filters * 4
def forward(self, inputs):
y = self.conv0(inputs)
conv1 = self.conv1(y)
conv2 = self.conv2(conv1)
if self.shortcut:
short = inputs
else:
short = self.short(inputs)
y = fluid.layers.elementwise_add(x=short, y=conv2)
layer_helper = LayerHelper('elementwise_add_activation', act='relu')
return layer_helper.append_activation(y)
class ResNet(fluid.imperative.Layer):
def __init__(self, layers=50, class_dim=102):
super(ResNet, self).__init__()
self.layers = layers
supported_layers = [50, 101, 152]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
if layers == 50:
depth = [3, 4, 6, 3]
elif layers == 101:
depth = [3, 4, 23, 3]
elif layers == 152:
depth = [3, 8, 36, 3]
num_filters = [64, 128, 256, 512]
self.conv = ConvBNLayer(
num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu')
self.pool2d_max = Pool2D(
pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
self.bottleneck_block_list = []
num_channels = 64
for block in range(len(depth)):
shortcut = False
for i in range(depth[block]):
bottleneck_block = BottleneckBlock(
num_channels=num_channels,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
shortcut=shortcut)
num_channels = bottleneck_block._num_channels_out
self.bottleneck_block_list.append(bottleneck_block)
shortcut = True
self.pool2d_avg = Pool2D(
pool_size=7, pool_type='avg', global_pooling=True)
import math
stdv = 1.0 / math.sqrt(2048 * 1.0)
self.out = FC(size=class_dim,
act='softmax',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)))
def forward(self, inputs):
y = self.conv(inputs)
y = self.pool2d_max(y)
for bottleneck_block in self.bottleneck_block_list:
y = bottleneck_block(y)
y = self.pool2d_avg(y)
y = self.out(y)
return y
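A one-batch smoke test of the ResNet defined above, as a hedged sketch with a random input in imperative mode:

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.imperative.base import to_variable

with fluid.imperative.guard():
    model = ResNet(layers=50, class_dim=102)
    img = to_variable(np.random.rand(1, 3, 224, 224).astype('float32'))
    probs = model(img)           # softmax output, shape (1, 102)
    print(probs._numpy().sum())  # should be ~1.0 for the single sample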
class TestImperativeResnet(unittest.TestCase):
def test_resnet_float32(self):
seed = 90
batch_size = train_parameters["batch_size"]
batch_num = 1
with fluid.imperative.guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
resnet = ResNet()
optimizer = optimizer_setting(train_parameters)
np.random.seed(seed)
            import random
            random.seed(seed)
train_reader = paddle.batch(
paddle.dataset.flowers.train(use_xmap=False),
batch_size=batch_size)
dy_param_init_value = {}
for param in fluid.default_main_program().global_block(
).all_parameters():
dy_param_init_value[param.name] = param._numpy()
for batch_id, data in enumerate(train_reader()):
if batch_id >= batch_num:
break
dy_x_data = np.array(
[x[0].reshape(3, 224, 224) for x in data]).astype('float32')
y_data = np.array([x[1] for x in data]).astype('int64').reshape(
batch_size, 1)
img = to_variable(dy_x_data)
label = to_variable(y_data)
label._stop_gradient = True
out = resnet(img)
loss = fluid.layers.cross_entropy(input=out, label=label)
avg_loss = fluid.layers.mean(x=loss)
dy_out = avg_loss._numpy()
if batch_id == 0:
for param in fluid.default_main_program().global_block(
).all_parameters():
if param.name not in dy_param_init_value:
dy_param_init_value[param.name] = param._numpy()
avg_loss._backward()
dy_grad_value = {}
for param in fluid.default_main_program().global_block(
).all_parameters():
if not param.stop_gradient:
np_array = np.array(param._ivar._grad_ivar().value()
.get_tensor())
dy_grad_value[param.name + core.grad_var_suffix(
)] = np_array
optimizer.minimize(avg_loss)
dy_param_value = {}
for param in fluid.default_main_program().global_block(
).all_parameters():
dy_param_value[param.name] = param._numpy()
with new_program_scope():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
resnet = ResNet()
optimizer = optimizer_setting(train_parameters)
np.random.seed(seed)
            import random
            random.seed(seed)
train_reader = paddle.batch(
paddle.dataset.flowers.train(use_xmap=False),
batch_size=batch_size)
img = fluid.layers.data(
name='pixel', shape=[3, 224, 224], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
out = resnet(img)
loss = fluid.layers.cross_entropy(input=out, label=label)
avg_loss = fluid.layers.mean(x=loss)
optimizer.minimize(avg_loss)
# initialize params and fetch them
static_param_init_value = {}
static_param_name_list = []
static_grad_name_list = []
for param in fluid.default_startup_program().global_block(
).all_parameters():
static_param_name_list.append(param.name)
for param in fluid.default_main_program().global_block(
).all_parameters():
if not param.stop_gradient:
static_grad_name_list.append(param.name +
core.grad_var_suffix())
out = exe.run(fluid.default_startup_program(),
fetch_list=static_param_name_list)
for i in range(len(static_param_name_list)):
static_param_init_value[static_param_name_list[i]] = out[i]
for batch_id, data in enumerate(train_reader()):
if batch_id >= batch_num:
break
static_x_data = np.array(
[x[0].reshape(3, 224, 224) for x in data]).astype('float32')
y_data = np.array([x[1] for x in data]).astype('int64').reshape(
[batch_size, 1])
fetch_list = [avg_loss.name]
fetch_list.extend(static_param_name_list)
fetch_list.extend(static_grad_name_list)
out = exe.run(fluid.default_main_program(),
feed={"pixel": static_x_data,
"label": y_data},
fetch_list=fetch_list)
static_param_value = {}
static_grad_value = {}
static_out = out[0]
param_start_pos = 1
grad_start_pos = len(static_param_name_list) + param_start_pos
for i in range(param_start_pos,
len(static_param_name_list) + param_start_pos):
static_param_value[static_param_name_list[
i - param_start_pos]] = out[i]
for i in range(grad_start_pos,
len(static_grad_name_list) + grad_start_pos):
static_grad_value[static_grad_name_list[
i - grad_start_pos]] = out[i]
self.assertTrue(np.allclose(static_out, dy_out))
self.assertEqual(len(dy_param_init_value), len(static_param_init_value))
for key, value in six.iteritems(static_param_init_value):
self.assertTrue(np.allclose(value, dy_param_init_value[key]))
            self.assertTrue(np.isfinite(value).all())
            self.assertFalse(np.isnan(value).any())
self.assertEqual(len(dy_grad_value), len(static_grad_value))
for key, value in six.iteritems(static_grad_value):
self.assertTrue(np.allclose(value, dy_grad_value[key]))
            self.assertTrue(np.isfinite(value).all())
            self.assertFalse(np.isnan(value).any())
self.assertEqual(len(dy_param_value), len(static_param_value))
for key, value in six.iteritems(static_param_value):
self.assertTrue(np.allclose(value, dy_param_value[key]))
            self.assertTrue(np.isfinite(value).all())
            self.assertFalse(np.isnan(value).any())
if __name__ == '__main__':
unittest.main()
@@ -1023,6 +1023,14 @@ class TestBook(unittest.TestCase):
         print(str(program))
+    def test_shuffle_channel(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name="X", shape=[16, 4, 4], dtype="float32")
+            out = layers.shuffle_channel(x, group=4)
+            self.assertIsNotNone(out)
+        print(str(program))
 if __name__ == '__main__':
     unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import sys
import math
from op_test import OpTest
import paddle.fluid.core as core
class TestShuffleChannelOp(OpTest):
def setUp(self):
self.op_type = "shuffle_channel"
self.batch_size = 10
self.input_channels = 16
self.layer_h = 4
self.layer_w = 4
self.group = 4
self.x = np.random.random(
(self.batch_size, self.input_channels, self.layer_h,
self.layer_w)).astype('float32')
self.inputs = {'X': self.x}
self.attrs = {'group': self.group}
n, c, h, w = self.x.shape
input_reshaped = np.reshape(self.x,
(-1, self.group, c // self.group, h, w))
input_transposed = np.transpose(input_reshaped, (0, 2, 1, 3, 4))
self.outputs = {'Out': np.reshape(input_transposed, (-1, c, h, w))}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out')
if __name__ == '__main__':
unittest.main()
@@ -16,6 +16,7 @@ import sys
 import time
 import socket
 from contextlib import closing
+from six import string_types
 def wait_server_ready(endpoints):
@@ -32,6 +33,7 @@ def wait_server_ready(endpoints):
         wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"])
     """
+    assert not isinstance(endpoints, string_types)
     while True:
         all_ok = True
         not_ready_endpoints = []
@@ -45,7 +47,7 @@ def wait_server_ready(endpoints):
                 all_ok = False
                 not_ready_endpoints.append(ep)
         if not all_ok:
-            sys.stderr.write("pserver not ready, wait 3 sec to retry...\n")
+            sys.stderr.write("server not ready, wait 3 sec to retry...\n")
             sys.stderr.write("not ready endpoints:" + str(not_ready_endpoints) +
                              "\n")
             sys.stderr.flush()
...
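The probe that wait_server_ready performs per endpoint boils down to a plain TCP connect; an illustrative standalone version (the real retry loop lives in Paddle):

import socket
from contextlib import closing

def endpoint_ready(ep, timeout=3):
    # ep is "ip:port"; returns True when something is listening there.
    ip, port = ep.split(":")
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
        sock.settimeout(timeout)
        return sock.connect_ex((ip, int(port))) == 0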
@@ -477,13 +477,16 @@ class DistributeTranspiler(object):
                           trainer_id,
                           trainers,
                           current_endpoint,
-                          startup_program=None):
+                          startup_program=None,
+                          wait_port=True):
         if not startup_program:
             startup_program = default_startup_program()
         if trainer_id >= 0:
             worker_endpoints = trainers.split(",")
             # send NCCL_ID to others or recv from trainer 0
             worker_endpoints.remove(current_endpoint)
+            if trainer_id == 0 and wait_port:
+                wait_server_ready(worker_endpoints)
             nccl_id_var = startup_program.global_block().create_var(
                 name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW)
@@ -564,11 +567,13 @@ class DistributeTranspiler(object):
         if self.config.mode == "nccl2":
             assert (isinstance(trainers, str))
+            self.origin_program._trainers_endpoints = trainers.split(",")
             self._transpile_nccl2(
                 trainer_id,
                 trainers,
                 current_endpoint,
-                startup_program=startup_program)
+                startup_program=startup_program,
+                wait_port=self.config.wait_port)
             return
         self.trainer_num = trainers
...
@@ -109,6 +109,7 @@ packages=['paddle',
           'paddle.fluid.contrib',
           'paddle.fluid.contrib.decoder',
           'paddle.fluid.contrib.quantize',
+          'paddle.fluid.contrib.reader',
           'paddle.fluid.contrib.slim',
           'paddle.fluid.contrib.slim.core',
           'paddle.fluid.contrib.slim.graph',
...