Commit d94920ce authored by Dang Qingqing

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into quantize_transpiler_update

......@@ -62,8 +62,26 @@ if(NOT CMAKE_CROSSCOMPILING)
endif()
if(WIN32)
# windows stupid compile option for all targets.
# windows header option for all targets.
add_definitions(-D_XKEYCHECK_H)
# Use symbols instead of absolute paths to reduce the cmake link command length.
SET(CMAKE_C_USE_RESPONSE_FILE_FOR_LIBRARIES 1)
SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_LIBRARIES 1)
SET(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS 1)
SET(CMAKE_C_USE_RESPONSE_FILE_FOR_INCLUDES 1)
SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_INCLUDES 1)
SET(CMAKE_C_RESPONSE_FILE_LINK_FLAG "@")
SET(CMAKE_CXX_RESPONSE_FILE_LINK_FLAG "@")
# Specify the program to use when building static libraries
SET(CMAKE_C_CREATE_STATIC_LIBRARY "<CMAKE_AR> lib <TARGET> <LINK_FLAGS> <OBJECTS>")
SET(CMAKE_CXX_CREATE_STATIC_LIBRARY "<CMAKE_AR> lib <TARGET> <LINK_FLAGS> <OBJECTS>")
# set definition for the dll export
if (NOT MSVC)
message(FATAL_ERROR "Windows builds only support MSVC, which is the host compiler required by NVIDIA's nvcc.")
endif(NOT MSVC)
endif(WIN32)
if(NOT WITH_GOLANG)
......
......@@ -41,7 +41,7 @@ paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id',
paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.DistributeTranspilerConfig.__init__
paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords='kwargs', defaults=(None, None, None, None, None, 1, 0, None))
paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None))
paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True))
paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None
paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None
......@@ -162,14 +162,14 @@ paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, key
paddle.fluid.layers.sequence_enumerate ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.expand ArgSpec(args=['x', 'expand_times', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sequence_concat ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.scale ArgSpec(args=['x', 'scale', 'bias', 'bias_after_scale', 'act', 'name'], varargs=None, keywords=None, defaults=(1.0, 0.0, True, None, None))
paddle.fluid.layers.elementwise_add ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None))
paddle.fluid.layers.elementwise_div ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None))
paddle.fluid.layers.elementwise_sub ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None))
paddle.fluid.layers.elementwise_mul ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None))
paddle.fluid.layers.elementwise_max ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None))
paddle.fluid.layers.elementwise_min ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None))
paddle.fluid.layers.elementwise_pow ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None))
paddle.fluid.layers.scale ArgSpec(args=['x', 'scale', 'bias', 'bias_after_scale', 'out', 'act', 'name'], varargs=None, keywords=None, defaults=(1.0, 0.0, True, None, None, None))
paddle.fluid.layers.elementwise_add ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None))
paddle.fluid.layers.elementwise_div ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None))
paddle.fluid.layers.elementwise_sub ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None))
paddle.fluid.layers.elementwise_mul ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None))
paddle.fluid.layers.elementwise_max ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None))
paddle.fluid.layers.elementwise_min ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None))
paddle.fluid.layers.elementwise_pow ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
......@@ -378,7 +378,7 @@ paddle.fluid.CPUPlace.__init__ __init__(self: paddle.fluid.core.CPUPlace) -> Non
paddle.fluid.CUDAPlace.__init__ __init__(self: paddle.fluid.core.CUDAPlace, arg0: int) -> None
paddle.fluid.CUDAPinnedPlace.__init__ __init__(self: paddle.fluid.core.CUDAPinnedPlace) -> None
paddle.fluid.ParamAttr.__init__ ArgSpec(args=['self', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, 1.0, None, True, None, False))
paddle.fluid.WeightNormParamAttr.__init__ ArgSpec(args=['self', 'dim'], varargs=None, keywords='kwargs', defaults=(None,))
paddle.fluid.WeightNormParamAttr.__init__ ArgSpec(args=['self', 'dim', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, None, 1.0, None, True, None, False))
paddle.fluid.DataFeeder.__init__ ArgSpec(args=['self', 'feed_list', 'place', 'program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.DataFeeder.decorate_reader ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True))
paddle.fluid.DataFeeder.feed ArgSpec(args=['self', 'iterable'], varargs=None, keywords=None, defaults=None)
......
......@@ -13,3 +13,5 @@ if(WITH_INFERENCE)
# NOTE: please add the inference subdirectory last.
add_subdirectory(inference)
endif()
add_subdirectory(train)
......@@ -26,8 +26,6 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
PADDLE_ENFORCE(graph.get());
FusePassBase::Init("conv_relu_mkldnn_fuse", graph.get());
std::unordered_set<Node*> nodes2delete;
GraphPatternDetector gpd;
auto* conv_input = gpd.mutable_pattern()
->NewNode("conv_relu_mkldnn_fuse/conv_input")
......@@ -42,36 +40,20 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
Graph* g) {
VLOG(4) << "handle ConvReLU fuse";
GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight,
conv_relu_pattern); // Filter
GET_IR_NODE_FROM_SUBGRAPH(conv_bias, conv_bias, conv_relu_pattern); // Bias
GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern); // tmp
conv_relu_pattern); // Filter
GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern); // tmp
GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_relu_pattern); // CONV op
GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, conv_relu_pattern); // Out
GET_IR_NODE_FROM_SUBGRAPH(relu, relu, conv_relu_pattern); // ReLU op
// Create a ConvReLU Node.
OpDesc desc;
std::string conv_relu_i_in = subgraph.at(conv_input)->Name();
std::string conv_relu_w_in = conv_weight->Name();
std::string conv_relu_b_in = conv_bias->Name();
std::string conv_relu_out = relu_out->Name();
desc.SetInput("Input", std::vector<std::string>({conv_relu_i_in}));
desc.SetInput("Filter", std::vector<std::string>({conv_relu_w_in}));
desc.SetInput("Bias", std::vector<std::string>({conv_relu_b_in}));
desc.SetOutput("Output", std::vector<std::string>({conv_relu_out}));
desc.SetType("conv2d");
for (auto& attr : conv->Op()->GetAttrMap()) {
desc.SetAttr(attr.first, attr.second);
}
desc.SetAttr("fuse_relu", true);
auto conv_relu_node = g->CreateOpNode(&desc); // OpDesc will be copied.
GraphSafeRemoveNodes(graph.get(), {conv, relu, conv_out});
// Transform Conv node into ConvReLU node.
OpDesc* desc = conv->Op();
desc->SetOutput("Output", std::vector<std::string>({relu_out->Name()}));
desc->SetAttr("fuse_relu", true);
GraphSafeRemoveNodes(graph.get(), {relu, conv_out});
PADDLE_ENFORCE(subgraph.count(conv_input));
IR_NODE_LINK_TO(subgraph.at(conv_input), conv_relu_node);
IR_NODE_LINK_TO(conv_weight, conv_relu_node);
IR_NODE_LINK_TO(conv_bias, conv_relu_node);
IR_NODE_LINK_TO(conv_relu_node, relu_out);
IR_NODE_LINK_TO(conv, relu_out);
found_conv_relu_count++;
};
......
......@@ -85,16 +85,13 @@ TEST(ConvReLUFusePass, basic) {
for (auto* node : graph->Nodes()) {
if (node->IsOp() && node->Op()->Type() == "conv2d") {
if (node->Op()->HasAttr("use_mkldnn")) {
bool use_mkldnn = boost::get<bool>(node->Op()->GetAttr("use_mkldnn"));
if (use_mkldnn) {
if (node->Op()->HasAttr("fuse_relu")) {
bool fuse_relu = boost::get<bool>(node->Op()->GetAttr("fuse_relu"));
if (fuse_relu) {
++conv_relu_count;
}
}
}
auto* op = node->Op();
ASSERT_TRUE(op->HasAttr("use_mkldnn"));
EXPECT_TRUE(boost::get<bool>(op->GetAttr("use_mkldnn")));
ASSERT_TRUE(op->HasAttr("fuse_relu"));
bool fuse_relu = boost::get<bool>(op->GetAttr("fuse_relu"));
if (fuse_relu) {
++conv_relu_count;
}
}
}
......
......@@ -638,11 +638,6 @@ PDNode *patterns::ConvReLU::operator()(
->AsInput()
->assert_is_persistable_var()
->assert_is_op_input("conv2d", "Filter");
// Bias
auto *conv_bias_var = pattern->NewNode(conv_bias_repr())
->AsInput()
->assert_is_persistable_var()
->assert_is_op_input("conv2d", "Bias");
// intermediate variable, will be removed in the IR after fuse.
auto *conv_out_var = pattern->NewNode(conv_out_repr())
->AsIntermediate()
......@@ -653,8 +648,7 @@ PDNode *patterns::ConvReLU::operator()(
->AsOutput()
->assert_is_op_output("relu");
conv_op->LinksFrom({conv_input, conv_weight_var, conv_bias_var})
.LinksTo({conv_out_var});
conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var});
relu_op->LinksFrom({conv_out_var}).LinksTo({relu_out_var});
return relu_out_var;
}
......
......@@ -379,7 +379,7 @@ struct PatternBase {
// op: conv + relu
// named nodes:
// conv_input, conv_weight,
// conv_bias, conv_out, conv,
// conv_out, conv,
// relu_out, relu
struct ConvReLU : public PatternBase {
ConvReLU(PDPattern* pattern, const std::string& name_scope)
......@@ -392,7 +392,6 @@ struct ConvReLU : public PatternBase {
PATTERN_DECL_NODE(relu);
// declare variable node's name
PATTERN_DECL_NODE(conv_weight);
PATTERN_DECL_NODE(conv_bias);
PATTERN_DECL_NODE(conv_out);
PATTERN_DECL_NODE(relu_out);
};
......
......@@ -38,27 +38,31 @@ struct OpInfo {
OpAttrChecker* checker_{nullptr};
InferVarTypeFN infer_var_type_;
InferShapeFN infer_shape_;
std::string op_type_;
bool HasOpProtoAndChecker() const {
return proto_ != nullptr && checker_ != nullptr;
}
const proto::OpProto& Proto() const {
PADDLE_ENFORCE_NOT_NULL(proto_, "Operator Proto has not been registered");
PADDLE_ENFORCE_NOT_NULL(proto_, "Operator %s Proto has not been registered",
op_type_);
PADDLE_ENFORCE(proto_->IsInitialized(),
"Operator Proto must be initialized in op info");
"Operator %s Proto must be initialized in op info",
op_type_);
return *proto_;
}
const OpCreator& Creator() const {
PADDLE_ENFORCE_NOT_NULL(creator_,
"Operator Creator has not been registered");
PADDLE_ENFORCE_NOT_NULL(
creator_, "Operator %s Creator has not been registered", op_type_);
return creator_;
}
const GradOpMakerFN& GradOpMaker() const {
PADDLE_ENFORCE_NOT_NULL(grad_op_maker_,
"Operator GradOpMaker has not been registered.");
"Operator %s GradOpMaker has not been registered.",
op_type_);
return grad_op_maker_;
}
......@@ -73,8 +77,9 @@ class OpInfoMap {
return map_.find(op_type) != map_.end();
}
void Insert(const std::string& type, const OpInfo& info) {
void Insert(const std::string& type, OpInfo info) {
PADDLE_ENFORCE(!Has(type), "Operator %s has been registered", type);
info.op_type_ = type;
map_.insert({type, info});
}
......
......@@ -132,7 +132,9 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
AddAttr<std::string>(OpNamescopeAttrName(), "Operator name with namescope.")
.SetDefault("");
AddAttr<std::vector<std::string>>(OpCreationCallstackAttrName(),
"Callstack for Op Creatation.")
.SetDefault({});
Validate();
}
......
......@@ -46,6 +46,7 @@ class OpProtoAndCheckerMaker {
static const char *OpRoleAttrName() { return "op_role"; }
static const char *OpRoleVarAttrName() { return "op_role_var"; }
static const char *OpNamescopeAttrName() { return "op_namescope"; }
static const char *OpCreationCallstackAttrName() { return "op_callstack"; }
void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker);
......
......@@ -14,15 +14,17 @@ limitations under the License. */
#define GLOG_NO_ABBREVIATED_SEVERITIES
#define GOOGLE_GLOG_DLL_DECL
#include "paddle/fluid/framework/operator.h"
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <algorithm>
#include <sstream>
#include <string>
#include <vector>
#include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -140,19 +142,48 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
}
void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
VLOG(4) << place << " " << DebugStringEx(&scope);
if (platform::is_gpu_place(place)) {
try {
if (VLOG_IS_ON(4)) {
VLOG(4) << place << " " << DebugStringEx(&scope);
}
if (platform::is_gpu_place(place)) {
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW("Cannot run operator on place %s", place);
PADDLE_THROW("Cannot run operator on place %s", place);
#else
auto dev_id = boost::get<platform::CUDAPlace>(place).device;
platform::SetDeviceId(dev_id);
auto dev_id = boost::get<platform::CUDAPlace>(place).device;
platform::SetDeviceId(dev_id);
#endif
}
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place);
if (VLOG_IS_ON(3)) {
VLOG(3) << place << " " << DebugStringEx(&scope);
}
} catch (platform::EnforceNotMet exception) {
if (Attrs().count("sub_block") != 0) {
throw exception;
}
auto& callstack = Attr<std::vector<std::string>>(
OpProtoAndCheckerMaker::OpCreationCallstackAttrName());
if (callstack.empty()) {
throw exception;
}
std::ostringstream sout;
sout << "Invoke operator " << Type() << " error.\n";
sout << "Python Callstacks: \n";
for (auto& line : callstack) {
sout << line;
}
sout << "C++ Callstacks: \n";
sout << exception.err_str_;
exception.err_str_ = sout.str();
throw exception;
} catch (...) {
std::rethrow_exception(std::current_exception());
}
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place);
VLOG(3) << place << " " << DebugStringEx(&scope);
}
bool OperatorBase::HasInputs(const std::string& name) const {
......@@ -180,7 +211,7 @@ const std::vector<std::string>& OperatorBase::Inputs(
}
bool OperatorBase::HasOutputs(const std::string& name) const {
if (outputs_.find(name) != outputs_.end()) {
if (outputs_.end() != outputs_.find(name)) {
return true;
} else {
return false;
......
......@@ -76,10 +76,10 @@ bool AnalysisPredictor::Init(
}
OptimizeInferenceProgram();
ctx_ = executor_->Prepare(*inference_program_, 0);
if (config_._use_mkldnn) {
executor_->EnableMKLDNN(*inference_program_);
}
ctx_ = executor_->Prepare(*inference_program_, 0);
VLOG(5) << "to create variables";
PADDLE_ENFORCE(scope_.get());
......
......@@ -22,6 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/timer.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -215,57 +216,20 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
template <typename T>
void NativePaddlePredictor::GetFetchOne(const framework::LoDTensor &fetch,
PaddleTensor *output) {
std::vector<int> shape;
auto dims_i = fetch.dims();
auto lod = fetch.lod();
const T *output_ptr = fetch.data<T>();
auto num = fetch.numel();
std::vector<T> data;
if (0 == lod.size()) {
std::copy(output_ptr, output_ptr + num, std::back_inserter(data));
for (int j = 0; j < dims_i.size(); ++j) {
shape.push_back(dims_i[j]);
}
} else {
// for batch detection
// image[0] -> output[0] shape {145, 6}
// image[1] -> output[1] shape {176, 6}
// then,
// the batch output shape {321, 6}
// the lod {{0, 145, 321}}
// so we should append output[0] to {176, 6}
size_t max_dim = 0;
for (size_t j = 1; j < lod[0].size(); j++) {
max_dim = std::max(max_dim, lod[0][j] - lod[0][j - 1]);
}
size_t common_dim = lod[0].back() == 0 ? 0 : num / lod[0].back();
if (max_dim > 0) {
data.resize((lod[0].size() - 1) * max_dim * common_dim, 0);
}
for (size_t j = 1; j < lod[0].size(); j++) {
size_t start = lod[0][j - 1] * common_dim;
size_t end = lod[0][j] * common_dim;
if (end > start) {
std::copy(output_ptr + start, output_ptr + end,
data.begin() + (j - 1) * max_dim * common_dim);
}
}
shape.push_back(lod[0].size() - 1);
shape.push_back(max_dim);
for (int j = 1; j < dims_i.size(); ++j) {
shape.push_back(dims_i[j]);
}
}
output->shape = shape;
auto &buffer = output->data;
if (buffer.empty() || buffer.length() < sizeof(T) * data.size()) {
buffer.Resize(sizeof(T) * data.size());
}
std::memcpy(buffer.data(), data.data(), sizeof(T) * data.size());
// copy LoD
for (const auto &level : fetch.lod()) {
output->lod.emplace_back(level);
// set shape.
auto shape = framework::vectorize(fetch.dims());
output->shape.assign(shape.begin(), shape.end());
// set data.
const T *data = fetch.data<T>();
int num_elems = inference::VecReduceToInt(shape);
output->data.Resize(num_elems * sizeof(T));
// The fetched tensor is produced by the fetch op and should always be in CPU
// memory, so just copy it.
memcpy(output->data.data(), data, num_elems * sizeof(T));
// set lod
output->lod.clear();
for (auto &level : fetch.lod()) {
output->lod.emplace_back(level.begin(), level.end());
}
}
......
......@@ -74,13 +74,17 @@ template <>
std::string to_string<std::vector<std::vector<float>>>(
const std::vector<std::vector<std::vector<float>>> &vec);
template <typename T>
int VecReduceToInt(const std::vector<T> &v) {
return std::accumulate(v.begin(), v.end(), 1, [](T a, T b) { return a * b; });
}
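As a quick illustration (the shape values are made up, not part of the change), the helper above simply multiplies every entry of a shape vector, i.e. it returns the element count implied by the shape:
  std::vector<int> shape = {2, 3, 4};
  int num_elems = VecReduceToInt(shape);  // 2 * 3 * 4 == 24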
template <typename T>
static void TensorAssignData(PaddleTensor *tensor,
const std::vector<std::vector<T>> &data) {
// Assign buffer
int dim = std::accumulate(tensor->shape.begin(), tensor->shape.end(), 1,
[](int a, int b) { return a * b; });
tensor->data.Resize(sizeof(T) * dim);
int num_elems = VecReduceToInt(tensor->shape);
tensor->data.Resize(sizeof(T) * num_elems);
int c = 0;
for (const auto &f : data) {
for (T v : f) {
......@@ -89,7 +93,7 @@ static void TensorAssignData(PaddleTensor *tensor,
}
}
std::string DescribeTensor(const PaddleTensor &tensor) {
static std::string DescribeTensor(const PaddleTensor &tensor) {
std::stringstream os;
os << "Tensor [" << tensor.name << "]\n";
os << " - type: ";
......@@ -113,8 +117,7 @@ std::string DescribeTensor(const PaddleTensor &tensor) {
os << "\n";
os << " - data: ";
int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1,
[](int a, int b) { return a * b; });
int dim = VecReduceToInt(tensor.shape);
for (int i = 0; i < dim; i++) {
os << static_cast<float *>(tensor.data.data())[i] << " ";
}
......@@ -122,8 +125,8 @@ std::string DescribeTensor(const PaddleTensor &tensor) {
return os.str();
}
void PrintTime(int batch_size, int repeat, int num_threads, int tid,
double latency, int epoch = 1) {
static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
double latency, int epoch = 1) {
LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat
<< ", threads: " << num_threads << ", thread id: " << tid
<< ", latency: " << latency << "ms ======";
......
......@@ -58,6 +58,11 @@ set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classifi
download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_text_classification ${TEXT_CLASSIFICATION_INSTALL_DIR} analyzer_text_classification_tester.cc)
# seq_conv1
set(SEQ_CONV1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_conv1")
download_model_and_data(${SEQ_CONV1_INSTALL_DIR} "seq_conv1_model.tar.gz" "seq_conv1_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} analyzer_seq_conv1_tester.cc)
# ocr
set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
if (NOT EXISTS ${OCR_INSTALL_DIR})
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
struct DataRecord {
std::vector<std::vector<int64_t>> title1_all, title2_all, title3_all, l1_all;
std::vector<std::vector<int64_t>> title1, title2, title3, l1;
std::vector<size_t> title1_lod, title2_lod, title3_lod, l1_lod;
size_t batch_iter{0};
size_t batch_size{1};
size_t num_samples; // total number of samples
DataRecord() = default;
explicit DataRecord(const std::string &path, int batch_size = 1)
: batch_size(batch_size) {
Load(path);
}
DataRecord NextBatch() {
DataRecord data;
size_t batch_end = batch_iter + batch_size;
// NOTE: skip the final batch if not enough data is provided.
if (batch_end <= title1_all.size()) {
data.title1_all.assign(title1_all.begin() + batch_iter,
title1_all.begin() + batch_end);
data.title2_all.assign(title2_all.begin() + batch_iter,
title2_all.begin() + batch_end);
data.title3_all.assign(title3_all.begin() + batch_iter,
title3_all.begin() + batch_end);
data.l1_all.assign(l1_all.begin() + batch_iter,
l1_all.begin() + batch_end);
// Prepare LoDs
data.title1_lod.push_back(0);
data.title2_lod.push_back(0);
data.title3_lod.push_back(0);
data.l1_lod.push_back(0);
CHECK(!data.title1_all.empty());
CHECK(!data.title2_all.empty());
CHECK(!data.title3_all.empty());
CHECK(!data.l1_all.empty());
CHECK_EQ(data.title1_all.size(), data.title2_all.size());
CHECK_EQ(data.title1_all.size(), data.title3_all.size());
CHECK_EQ(data.title1_all.size(), data.l1_all.size());
for (size_t j = 0; j < data.title1_all.size(); j++) {
data.title1.push_back(data.title1_all[j]);
data.title2.push_back(data.title2_all[j]);
data.title3.push_back(data.title3_all[j]);
data.l1.push_back(data.l1_all[j]);
// calculate lod
data.title1_lod.push_back(data.title1_lod.back() +
data.title1_all[j].size());
data.title2_lod.push_back(data.title2_lod.back() +
data.title2_all[j].size());
data.title3_lod.push_back(data.title3_lod.back() +
data.title3_all[j].size());
data.l1_lod.push_back(data.l1_lod.back() + data.l1_all[j].size());
}
}
batch_iter += batch_size;
return data;
}
void Load(const std::string &path) {
std::ifstream file(path);
std::string line;
int num_lines = 0;
while (std::getline(file, line)) {
num_lines++;
std::vector<std::string> data;
split(line, '\t', &data);
// load title1 data
std::vector<int64_t> title1_data;
split_to_int64(data[0], ' ', &title1_data);
// load title2 data
std::vector<int64_t> title2_data;
split_to_int64(data[1], ' ', &title2_data);
// load title3 data
std::vector<int64_t> title3_data;
split_to_int64(data[2], ' ', &title3_data);
// load l1 data
std::vector<int64_t> l1_data;
split_to_int64(data[3], ' ', &l1_data);
title1_all.push_back(std::move(title1_data));
title2_all.push_back(std::move(title2_data));
title3_all.push_back(std::move(title3_data));
l1_all.push_back(std::move(l1_data));
}
num_samples = num_lines;
}
};
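For illustration only (the sequence lengths 3 and 5 are made up), the cumulative LoD built in NextBatch turns per-sequence lengths into offsets, and the matching tensor stores both sequences flattened along the first dimension:
  std::vector<size_t> lod = {0};
  for (size_t len : {3, 5}) {
    lod.push_back(lod.back() + len);
  }
  // lod is now {0, 3, 8}, and the flattened input tensor has shape {8, 1}.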
void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
int batch_size) {
PaddleTensor title1_tensor, title2_tensor, title3_tensor, l1_tensor;
title1_tensor.name = "title1";
title2_tensor.name = "title2";
title3_tensor.name = "title3";
l1_tensor.name = "l1";
auto one_batch = data->NextBatch();
int title1_size = one_batch.title1_lod[one_batch.title1_lod.size() - 1];
title1_tensor.shape.assign({title1_size, 1});
title1_tensor.lod.assign({one_batch.title1_lod});
int title2_size = one_batch.title2_lod[one_batch.title2_lod.size() - 1];
title2_tensor.shape.assign({title2_size, 1});
title2_tensor.lod.assign({one_batch.title2_lod});
int title3_size = one_batch.title3_lod[one_batch.title3_lod.size() - 1];
title3_tensor.shape.assign({title3_size, 1});
title3_tensor.lod.assign({one_batch.title3_lod});
int l1_size = one_batch.l1_lod[one_batch.l1_lod.size() - 1];
l1_tensor.shape.assign({l1_size, 1});
l1_tensor.lod.assign({one_batch.l1_lod});
// assign data
TensorAssignData<int64_t>(&title1_tensor, one_batch.title1);
TensorAssignData<int64_t>(&title2_tensor, one_batch.title2);
TensorAssignData<int64_t>(&title3_tensor, one_batch.title3);
TensorAssignData<int64_t>(&l1_tensor, one_batch.l1);
// Set inputs.
input_slots->assign({title1_tensor, title2_tensor, title3_tensor, l1_tensor});
for (auto &tensor : *input_slots) {
tensor.dtype = PaddleDType::INT64;
}
}
void SetConfig(AnalysisConfig *cfg) {
cfg->model_dir = FLAGS_infer_model;
cfg->use_gpu = false;
cfg->device = 0;
cfg->specify_input_name = true;
cfg->enable_ir_optim = true;
}
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
std::vector<PaddleTensor> input_slots;
int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size;
for (int bid = 0; bid < epoch; ++bid) {
PrepareInputs(&input_slots, &data, FLAGS_batch_size);
(*inputs).emplace_back(input_slots);
}
}
// Easy for profiling independently.
TEST(Analyzer_seq_conv1, profile) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
// the first inference result
PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
size_t size = GetSize(outputs[0]);
PADDLE_ENFORCE_GT(size, 0);
float *result = static_cast<float *>(outputs[0].data.data());
// output is probability, which is in (0, 1).
for (size_t i = 0; i < size; i++) {
EXPECT_GT(result[i], 0);
EXPECT_LT(result[i], 1);
}
}
}
// Check the fuse status
TEST(Analyzer_seq_conv1, fuse_statis) {
AnalysisConfig cfg;
SetConfig(&cfg);
int num_ops;
auto fuse_statis = GetFuseStatis(cfg, &num_ops);
}
// Compare result of NativeConfig and AnalysisConfig
TEST(Analyzer_seq_conv1, compare) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all);
}
} // namespace inference
} // namespace paddle
......@@ -47,11 +47,8 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
for (size_t i = 0; i < outputs.size(); i++) {
auto &out = outputs[i];
auto &ref_out = ref_outputs[i];
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
[](int a, int b) { return a * b; });
size_t ref_size =
std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
[](int a, int b) { return a * b; });
size_t size = VecReduceToInt(out.shape);
size_t ref_size = VecReduceToInt(ref_out.shape);
EXPECT_GT(size, 0);
EXPECT_EQ(size, ref_size);
EXPECT_EQ(out.dtype, ref_out.dtype);
......@@ -87,10 +84,7 @@ std::unique_ptr<PaddlePredictor> CreateTestPredictor(
}
}
size_t GetSize(const PaddleTensor &out) {
return std::accumulate(out.shape.begin(), out.shape.end(), 1,
[](int a, int b) { return a * b; });
}
size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); }
std::unordered_map<std::string, int> GetFuseStatis(AnalysisConfig config,
int *num_ops) {
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/operators/activation_op.h"
#include <string>
#include "paddle/fluid/operators/mkldnn_activation_op.h"
#include "paddle/fluid/platform/port.h"
namespace paddle {
namespace operators {
......@@ -105,105 +106,105 @@ class ActivationOpGrad : public framework::OperatorWithKernel {
}
};
__attribute__((unused)) constexpr char SigmoidDoc[] = R"DOC(
UNUSED constexpr char SigmoidDoc[] = R"DOC(
Sigmoid Activation Operator
$$out = \frac{1}{1 + e^{-x}}$$
)DOC";
__attribute__((unused)) constexpr char LogSigmoidDoc[] = R"DOC(
UNUSED constexpr char LogSigmoidDoc[] = R"DOC(
Logsigmoid Activation Operator
$$out = \\log \\frac{1}{1 + e^{-x}}$$
)DOC";
__attribute__((unused)) constexpr char ExpDoc[] = R"DOC(
UNUSED constexpr char ExpDoc[] = R"DOC(
Exp Activation Operator.
$out = e^x$
)DOC";
__attribute__((unused)) constexpr char ReluDoc[] = R"DOC(
UNUSED constexpr char ReluDoc[] = R"DOC(
Relu Activation Operator.
$out = \max(x, 0)$
)DOC";
__attribute__((unused)) constexpr char TanhDoc[] = R"DOC(
UNUSED constexpr char TanhDoc[] = R"DOC(
Tanh Activation Operator.
$$out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
)DOC";
__attribute__((unused)) constexpr char TanhShrinkDoc[] = R"DOC(
UNUSED constexpr char TanhShrinkDoc[] = R"DOC(
TanhShrink Activation Operator.
$$out = x - \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
)DOC";
__attribute__((unused)) constexpr char SqrtDoc[] = R"DOC(
UNUSED constexpr char SqrtDoc[] = R"DOC(
Sqrt Activation Operator.
$out = \sqrt{x}$
)DOC";
__attribute__((unused)) constexpr char AbsDoc[] = R"DOC(
UNUSED constexpr char AbsDoc[] = R"DOC(
Abs Activation Operator.
$out = |x|$
)DOC";
__attribute__((unused)) constexpr char CeilDoc[] = R"DOC(
UNUSED constexpr char CeilDoc[] = R"DOC(
Ceil Activation Operator.
$out = ceil(x)$
)DOC";
__attribute__((unused)) constexpr char FloorDoc[] = R"DOC(
UNUSED constexpr char FloorDoc[] = R"DOC(
Floor Activation Operator.
$out = floor(x)$
)DOC";
__attribute__((unused)) constexpr char CosDoc[] = R"DOC(
UNUSED constexpr char CosDoc[] = R"DOC(
Cosine Activation Operator.
$out = cos(x)$
)DOC";
__attribute__((unused)) constexpr char SinDoc[] = R"DOC(
UNUSED constexpr char SinDoc[] = R"DOC(
Sine Activation Operator.
$out = sin(x)$
)DOC";
__attribute__((unused)) constexpr char RoundDoc[] = R"DOC(
UNUSED constexpr char RoundDoc[] = R"DOC(
Round Activation Operator.
$out = [x]$
)DOC";
__attribute__((unused)) constexpr char ReciprocalDoc[] = R"DOC(
UNUSED constexpr char ReciprocalDoc[] = R"DOC(
Reciprocal Activation Operator.
$$out = \\frac{1}{x}$$
)DOC";
__attribute__((unused)) constexpr char LogDoc[] = R"DOC(
UNUSED constexpr char LogDoc[] = R"DOC(
Log Activation Operator.
$out = \ln(x)$
......@@ -212,21 +213,21 @@ Natural logarithm of x.
)DOC";
__attribute__((unused)) constexpr char SquareDoc[] = R"DOC(
UNUSED constexpr char SquareDoc[] = R"DOC(
Square Activation Operator.
$out = x^2$
)DOC";
__attribute__((unused)) constexpr char SoftplusDoc[] = R"DOC(
UNUSED constexpr char SoftplusDoc[] = R"DOC(
Softplus Activation Operator.
$out = \ln(1 + e^{x})$
)DOC";
__attribute__((unused)) constexpr char SoftsignDoc[] = R"DOC(
UNUSED constexpr char SoftsignDoc[] = R"DOC(
Softsign Activation Operator.
$$out = \frac{x}{1 + |x|}$$
......
......@@ -46,6 +46,25 @@ static std::string gethash(const memory::dims& input_dims,
dims2str(paddings) + pooling_type + suffix;
}
static inline int ComputeCeiledOutput(int input_size, int kernel_size,
int padding, int stride) {
return (input_size - kernel_size + 2 * padding) / stride + 1;
}
static inline void CorrectOutputSize(
const std::vector<int>& src_tz, const std::vector<int>& dst_tz,
const std::vector<int>& kernel_size, const std::vector<int>& paddings,
const std::vector<int>& strides,
std::vector<int>& right_bot_padding) { // NOLINT
for (size_t i = 0; i < right_bot_padding.size(); i++) {
int desired_size = ComputeCeiledOutput(src_tz[i + 2], kernel_size[i],
paddings[i], strides[i]);
if (desired_size != dst_tz[i + 2]) {
right_bot_padding[i] += strides[i];
}
}
}
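A worked example of the ceil-mode correction above (the sizes are illustrative): with input size 6, kernel 3, padding 0 and stride 2, ceil mode expects an output size of 3, while the floor-style formula gives 2, so the right/bottom padding is bumped by the stride until the sizes agree.
  // Illustrative numbers only: input 6, kernel 3, padding 0, stride 2.
  int floor_size = ComputeCeiledOutput(6, 3, 0, 2);  // (6 - 3 + 0) / 2 + 1 == 2
  // Ceil mode wants 3, so the right/bottom padding grows by the stride (2),
  // and (6 - 3 + 0 + 2) / 2 + 1 == 3 then matches the desired output size.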
template <typename T>
class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
public:
......@@ -103,6 +122,13 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto pool_p =
std::static_pointer_cast<pooling_forward>(dev_ctx.GetBlob(key_pool_p));
if (pool_p == nullptr) {
const std::vector<int>& padding_left_top(paddings);
std::vector<int> padding_right_bottom(paddings);
bool ceil_mode = ctx.Attr<bool>("ceil_mode");
if (ceil_mode) {
CorrectOutputSize(src_tz, dst_tz, ksize, paddings, strides,
padding_right_bottom);
}
auto src_md = platform::MKLDNNMemDesc(
src_tz, platform::MKLDNNGetDataType<T>(), input_format);
......@@ -114,8 +140,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
mkldnn::memory::format::any);
std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd =
CreatePrimitiveDesc(src_md, dst_md, strides, paddings, ksize,
pooling_type, mkldnn_engine);
CreatePrimitiveDesc(src_md, dst_md, strides, padding_left_top,
padding_right_bottom, ksize, pooling_type,
mkldnn_engine, ceil_mode);
// save pool_pd into global device context to be referred in backward path
dev_ctx.SetBlob(key_pool_pd, pool_pd);
......@@ -171,14 +198,16 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
private:
std::unique_ptr<mkldnn::pooling_forward::primitive_desc> CreatePrimitiveDesc(
const mkldnn::memory::desc& src, const mkldnn::memory::desc& dst,
const std::vector<int>& stride, const std::vector<int>& padding,
const std::vector<int>& kernel, const std::string& pooling_type,
const mkldnn::engine& engine) const {
const std::vector<int>& stride, const std::vector<int>& padding_left_top,
const std::vector<int>& padding_right_bot, const std::vector<int>& kernel,
const std::string& pooling_type, const mkldnn::engine& engine,
bool ceil_mode) const {
auto pool_desc = mkldnn::pooling_forward::desc(
mkldnn::prop_kind::forward,
pooling_type == "max" ? mkldnn::algorithm::pooling_max
: mkldnn::algorithm::pooling_avg,
src, dst, stride, kernel, padding, padding, mkldnn::padding_kind::zero);
src, dst, stride, kernel, padding_left_top, padding_right_bot,
mkldnn::padding_kind::zero);
auto p_pool_pd =
new mkldnn::pooling_forward::primitive_desc(pool_desc, engine);
......
......@@ -45,10 +45,12 @@ class ReadInferVarType : public framework::VarTypeInference {
framework::VarDesc* reader = block->FindVarRecursive(reader_name);
auto dtypes = reader->GetDataTypes();
PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size());
auto lod_levels = reader->GetLoDLevels();
for (size_t i = 0; i < dtypes.size(); ++i) {
framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]);
out.SetType(framework::proto::VarType::LOD_TENSOR);
out.SetDataType(dtypes[i]);
out.SetLoDLevel(lod_levels[i]);
}
}
};
......
......@@ -75,11 +75,11 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> {
}
for (size_t i = 0; i < n; ++i) {
PADDLE_ENFORCE_LT(0, offset_data[i],
PADDLE_ENFORCE_LE(0, offset_data[i],
"The offset[%d] must greater than zero.", i);
PADDLE_ENFORCE_LT(0, length_data[i],
"The length[%d] must greater than zero.", i);
PADDLE_ENFORCE_LT(lod[0][i] + offset_data[i] + length_data[i],
PADDLE_ENFORCE_LE(lod[0][i] + offset_data[i] + length_data[i],
lod[0][i + 1], "The target tensor's length overflows.");
}
......
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include <algorithm>
#include "paddle/fluid/operators/sgd_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
......@@ -33,22 +33,21 @@ __global__ void SGDKernel(const T* g, const T* p, const T* learning_rate,
}
}
template <typename T, int block_size>
template <typename T>
__global__ void SparseSGDFunctorKernel(const T* selected_rows,
const int64_t* rows,
const T* learning_rate, T* tensor_out,
int64_t row_numel) {
const int ty = blockIdx.y;
int tid = threadIdx.x;
selected_rows += ty * row_numel;
tensor_out += rows[ty] * row_numel;
for (int index = tid; index < row_numel; index += block_size) {
// Since index in rows of SelectedRows can be duplicate, we have to use
// Atomic Operation to avoid concurrent write error.
paddle::platform::CudaAtomicAdd(
tensor_out + index, -1.0 * learning_rate[0] * selected_rows[index]);
int64_t row_numel, int64_t limit) {
for (int64_t i = blockIdx.x; i < limit; i += gridDim.x) {
const T* selected_rows_ptr = selected_rows + i * row_numel;
T* tensor_out_ptr = tensor_out + rows[i] * row_numel;
for (int64_t index = threadIdx.x; index < row_numel; index += blockDim.x) {
// Since index in rows of SelectedRows can be duplicate, we have to use
// Atomic Operation to avoid concurrent write error.
paddle::platform::CudaAtomicAdd(
tensor_out_ptr + index,
-1.0 * learning_rate[0] * selected_rows_ptr[index]);
}
}
}
} // namespace
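A minimal host-side sketch of what the kernel above computes (the function name and signature are hypothetical, for illustration only): each selected-rows gradient row i is scaled by the learning rate and subtracted from parameter row rows[i]. Because rows may contain duplicates such as {0, 2, 0}, two gradient rows can target the same destination, which is why the device code relies on CudaAtomicAdd.
void SparseSgdReference(const float* grad, const int64_t* rows, int64_t n_rows,
                        int64_t row_numel, const float* lr, float* param) {
  // Sequential equivalent of the scatter-add performed by SparseSGDFunctorKernel.
  for (int64_t i = 0; i < n_rows; ++i) {
    for (int64_t j = 0; j < row_numel; ++j) {
      param[rows[i] * row_numel + j] -= lr[0] * grad[i * row_numel + j];
    }
  }
}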
......@@ -97,13 +96,15 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
auto* in_data = in_value.data<T>();
auto* out_data = param_out->data<T>();
const int block_size = 256;
dim3 threads(block_size, 1);
dim3 grid(1, in_rows.size());
SparseSGDFunctorKernel<
T, 256><<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
const int kThreadsPerBlock = 256;
int thread_x = kThreadsPerBlock;
int max_threads = ctx.cuda_device_context().GetMaxPhysicalThreadCount();
int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
SparseSGDFunctorKernel<<<max_blocks, thread_x, 0,
ctx.cuda_device_context().stream()>>>(
in_data, in_rows.CUDAData(ctx.GetPlace()), learning_rate->data<T>(),
out_data, in_row_numel);
out_data, in_row_numel, in_rows.size());
} else {
PADDLE_THROW("Unsupported Variable Type of Grad");
......
......@@ -52,16 +52,26 @@ class ShrinkRNNMemoryOp : public ArrayOp {
size_t height = dst_num_rows;
// do shrink for the top level LoD
if (x_tensor.lod().size() > 0 &&
x_tensor.lod()[0].size() > static_cast<size_t>(dst_num_rows)) {
auto lod_offset = framework::GetSubLoDAndAbsoluteOffset(x_tensor.lod(), 0,
dst_num_rows, 0);
height = lod_offset.second.second;
auto out_lod = out_tensor.mutable_lod();
framework::AppendLoD(out_lod, lod_offset.first);
if (x_tensor.lod().size() > 1) { // MultiLevel LoD
auto lod_offset = framework::GetSubLoDAndAbsoluteOffset(
x_tensor.lod(), 0, dst_num_rows, 0);
height = lod_offset.second.second;
auto out_lod = out_tensor.mutable_lod();
framework::AppendLoD(out_lod, lod_offset.first);
} else {
// Shrink LoD
auto lod_item = x_tensor.lod()[0];
lod_item.resize(dst_num_rows + 1);
out_tensor.set_lod({lod_item});
const auto &const_lod_item = lod_item;
height = const_lod_item.back();
}
}
if (dst_num_rows != 0) {
if (height != 0) {
out_tensor.mutable_data(place, x_tensor.type());
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
framework::TensorCopy(x_tensor.Slice(0, height), place, *dev_ctx,
......@@ -134,8 +144,11 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
} else {
auto &dout_tensor = dout_var->Get<framework::LoDTensor>();
auto height = dout_tensor.dims()[0];
auto slice = dx_tensor.Slice(0, static_cast<int>(height));
framework::TensorCopy(dout_tensor, dout_tensor.place(), dev_ctx, &slice);
if (height != 0) {
auto slice = dx_tensor.Slice(0, static_cast<int>(height));
framework::TensorCopy(dout_tensor, dout_tensor.place(), dev_ctx,
&slice);
}
if (dx_tensor.dims()[0] > height) {
auto rest_tensor = dx_tensor.Slice(
static_cast<int>(height), static_cast<int>(dx_tensor.dims()[0]));
......
......@@ -36,7 +36,7 @@ namespace operators {
using FluidDT = framework::proto::VarType_Type;
using TRT_DT = nvinfer1::DataType;
namespace {
namespace { // NOLINT
TRT_DT FluidDataType2TRT(FluidDT type) {
switch (type) {
......
......@@ -30,6 +30,8 @@ class TopkOp : public framework::OperatorWithKernel {
"Output(Indices) of TopkOp should not be null.");
auto input_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(input_dims.size(), 2,
"Rank of TopK op's input must be 2.");
const int k = static_cast<int>(ctx->Attrs().Get<int>("k"));
PADDLE_ENFORCE_GE(k, 1, "k must >= 1");
......
......@@ -201,6 +201,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
compute_capability = GetCUDAComputeCapability(place_.device);
multi_process = GetCUDAMultiProcessors(place_.device);
max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
grid_max_dims_ = GpuMaxGridDim(place_.device);
PADDLE_ENFORCE(cudaStreamCreate(&stream_));
eigen_stream_.reset(new EigenCudaStreamDevice());
eigen_stream_->Reinitialize(&stream_, place);
......@@ -239,6 +240,10 @@ int CUDADeviceContext::GetMaxPhysicalThreadCount() const {
return multi_process * max_threads_per_mp;
}
std::tuple<int, int, int> CUDADeviceContext::GetMaxGridDims() const {
return grid_max_dims_;
}
Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
return eigen_device_.get();
}
......
......@@ -13,6 +13,7 @@ limitations under the License. */
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <tuple>
#include <unordered_map>
#include <vector>
......@@ -91,6 +92,8 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Return the max physical thread count in the device context */
int GetMaxPhysicalThreadCount() const;
std::tuple<int, int, int> GetMaxGridDims() const;
/*! \brief Return eigen device in the device context. */
Eigen::GpuDevice* eigen_device() const;
......@@ -135,6 +138,8 @@ class CUDADeviceContext : public DeviceContext {
cudaStream_t stream_;
cublasHandle_t cublas_handle_;
std::tuple<int, int, int> grid_max_dims_;
int compute_capability;
int multi_process;
int max_threads_per_mp;
......
......@@ -21,6 +21,7 @@ limitations under the License. */
#if defined(_WIN32)
#define NOMINMAX // msvc max/min macro conflict with std::min/max
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#define GOOGLE_GLOG_DLL_DECL
#endif
#ifdef PADDLE_WITH_CUDA
......@@ -47,7 +48,7 @@ limitations under the License. */
#include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/dynload/curand.h"
#if !defined(__APPLE__) and !defined(_WIN32)
#if !defined(__APPLE__) && !defined(_WIN32)
#include "paddle/fluid/platform/dynload/nccl.h"
#endif // __APPLE__
#endif // PADDLE_WITH_CUDA
......@@ -216,7 +217,7 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
#endif
}
#if !defined(__APPLE__) and !defined(_WIN32)
#if !defined(__APPLE__) && !defined(_WIN32)
template <typename... Args>
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
ncclResult_t stat, const Args&... args) {
......@@ -260,14 +261,8 @@ inline void throw_on_error(T e) {
} \
} while (false)
#define PADDLE_THROW_EOF() \
do { \
throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \
__LINE__); \
} while (false)
#else
#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__)
#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__);
#endif // REPLACE_ENFORCE_GLOG
#else // !_WIN32
......@@ -281,6 +276,12 @@ inline void throw_on_error(T e) {
#define PADDLE_ENFORCE(x, ...) x
#endif // !_WIN32
#define PADDLE_THROW_EOF() \
do { \
throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \
__LINE__); \
} while (false)
/*
* Some enforce helpers here, usage:
* int a = 1;
......@@ -294,7 +295,7 @@ inline void throw_on_error(T e) {
* extra messages is also supported, for example:
* PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2)
*/
#if !defined(_WIN32)
#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__)
#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \
......@@ -307,6 +308,7 @@ inline void throw_on_error(T e) {
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__)
#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__)
#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \
do { \
if (UNLIKELY(nullptr == (__VAL))) { \
......@@ -326,6 +328,27 @@ inline void throw_on_error(T e) {
paddle::string::Sprintf("" __VA_ARGS__)); \
} \
} while (0)
#else
#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) ((__VAL0) == (__VAL1))
#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) ((__VAL0) != (__VAL1))
#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) ((__VAL0) > (__VAL1))
#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) ((__VAL0) >= (__VAL1))
#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) ((__VAL0) < (__VAL1))
#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) ((__VAL0) <= (__VAL1))
#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \
do { \
if (!((__VAL0)__CMP(__VAL1))) { \
PADDLE_THROW("Windows disable the enforce. Enforce failed."); \
} \
} while (0)
#define PADDLE_ENFORCE_NOT_NULL(__VAL1, ...) \
do { \
if (nullptr == (__VAL1)) { \
PADDLE_THROW("Windows disable the enforce. Enforce failed"); \
} \
} while (0)
#endif // !_WIN32
} // namespace platform
} // namespace paddle
......@@ -48,35 +48,54 @@ __global__ static void ForRangeElemwiseOpGridIsOne(Function func) {
}
template <typename Function>
__global__ static void ForRangeElemwiseOp(Function func, int limit) {
__global__ static void ForRangeElemwiseOp(Function func, size_t limit) {
size_t idx = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x);
if (idx < limit) {
func(idx);
}
}
template <typename Function>
__global__ static void ForRangeElemwiseOpGridLarge(Function func, size_t limit,
int grid_dim) {
size_t idx = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x);
while (idx < limit) {
func(idx);
idx += static_cast<size_t>(grid_dim) * blockDim.x;
}
}
template <>
struct ForRange<CUDADeviceContext> {
ForRange(const CUDADeviceContext& dev_ctx, size_t limit)
: dev_ctx_(dev_ctx), limit_(static_cast<int>(limit)) {}
: dev_ctx_(dev_ctx), limit_(limit) {}
template <typename Function>
inline void operator()(Function func) const {
constexpr int num_threads = 1024;
int block_size = limit_ <= num_threads ? limit_ : num_threads;
int grid_size = (limit_ + num_threads - 1) / num_threads;
if (grid_size == 1) {
ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
func);
size_t grid_size = (limit_ + num_threads - 1) / num_threads;
int max_grid_dim = std::get<0>(dev_ctx_.GetMaxGridDims());
if (grid_size < max_grid_dim) {
int grid_size_int = static_cast<int>(grid_size);
if (grid_size == 1) {
ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
func);
} else {
ForRangeElemwiseOp<<<grid_size_int, block_size, 0, dev_ctx_.stream()>>>(
func, limit_);
}
} else {
ForRangeElemwiseOp<<<grid_size, block_size, 0, dev_ctx_.stream()>>>(
func, limit_);
ForRangeElemwiseOpGridLarge<<<max_grid_dim, block_size, 0,
dev_ctx_.stream()>>>(func, limit_,
max_grid_dim);
}
}
const CUDADeviceContext& dev_ctx_;
int limit_;
size_t limit_;
};
#endif
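A minimal usage sketch of ForRange (the functor, pointers and size are assumptions for illustration): callers pass a limit and a per-index functor, and the CUDA specialization above now falls back to a strided kernel whenever the required grid would exceed the device's maximum grid dimension.
// Hypothetical element-wise functor; x and y are device pointers of length n.
struct ScaleFunctor {
  const float* x;
  float* y;
  HOSTDEVICE void operator()(size_t idx) const { y[idx] = 2.0f * x[idx]; }
};
platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx, n);
for_range(ScaleFunctor{x, y});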
......
......@@ -152,5 +152,22 @@ void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) {
PADDLE_ENFORCE(cudaMemsetAsync(dst, value, count, stream),
"cudaMemsetAsync failed in paddle::platform::GpuMemsetAsync");
}
std::tuple<int, int, int> GpuMaxGridDim(int id) {
std::tuple<int, int, int> result;
PADDLE_ENFORCE(
cudaDeviceGetAttribute(&std::get<0>(result), cudaDevAttrMaxGridDimX, id),
"cudaDeviceGetAttribute failed in "
"cudaDevAttrMaxGridDimX");
PADDLE_ENFORCE(
cudaDeviceGetAttribute(&std::get<1>(result), cudaDevAttrMaxGridDimY, id),
"cudaDeviceGetAttribute failed in "
"cudaDevAttrMaxGridDimY");
PADDLE_ENFORCE(
cudaDeviceGetAttribute(&std::get<2>(result), cudaDevAttrMaxGridDimZ, id),
"cudaDeviceGetAttribute failed in "
"cudaDevAttrMaxGridDimZ");
return result;
}
} // namespace platform
} // namespace paddle
......@@ -19,6 +19,7 @@ limitations under the License. */
#include <cuda_runtime.h>
#include <stddef.h>
#include <string>
#include <tuple>
namespace paddle {
namespace platform {
......@@ -72,6 +73,8 @@ void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
//! Set memory dst with value count size asynchronously
void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream);
std::tuple<int, int, int> GpuMaxGridDim(int id);
} // namespace platform
} // namespace paddle
......
......@@ -48,6 +48,9 @@ void BindConstValue(pybind11::module* m) {
op_proto_and_checker_maker.def(
"kOpNameScopeAttrName",
framework::OpProtoAndCheckerMaker::OpNamescopeAttrName);
op_proto_and_checker_maker.def(
"kOpCreationCallstackAttrName",
framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName);
}
} // namespace pybind
......
function(train_test TARGET_NAME)
set(options "")
set(oneValueArgs "")
set(multiValueArgs ARGS)
cmake_parse_arguments(train_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
set(arg_list "")
if(train_test_ARGS)
foreach(arg ${train_test_ARGS})
list(APPEND arg_list "_${arg}")
endforeach()
else()
list(APPEND arg_list "_")
endif()
foreach(arg ${arg_list})
string(REGEX REPLACE "^_$" "" arg "${arg}")
cc_test(test_train_${TARGET_NAME}${arg}
SRCS test_train_${TARGET_NAME}.cc
DEPS paddle_fluid_origin
ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.train.model/)
set_tests_properties(test_train_${TARGET_NAME}${arg}
PROPERTIES DEPENDS test_${TARGET_NAME})
endforeach()
endfunction(train_test)
if(WITH_TESTING)
train_test(recognize_digits ARGS mlp conv)
endif()
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <time.h>
#include <fstream>
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/place.h"
DEFINE_string(dirname, "", "Directory of the train model.");
namespace paddle {
void Train() {
CHECK(!FLAGS_dirname.empty());
framework::InitDevices(false);
const auto cpu_place = platform::CPUPlace();
framework::Executor executor(cpu_place);
framework::Scope scope;
auto train_program = inference::Load(
&executor, &scope, FLAGS_dirname + "__model_combined__.main_program",
FLAGS_dirname + "__params_combined__");
std::string loss_name = "";
for (auto op_desc : train_program->Block(0).AllOps()) {
if (op_desc->Type() == "mean") {
loss_name = op_desc->Output("Out")[0];
break;
}
}
PADDLE_ENFORCE_NE(loss_name, "", "loss not found");
// prepare data
auto x_var = scope.Var("img");
auto x_tensor = x_var->GetMutable<framework::LoDTensor>();
x_tensor->Resize({64, 1, 28, 28});
auto x_data = x_tensor->mutable_data<float>(cpu_place);
for (int i = 0; i < 64 * 28 * 28; ++i) {
x_data[i] = 1.0;
}
auto y_var = scope.Var("label");
auto y_tensor = y_var->GetMutable<framework::LoDTensor>();
y_tensor->Resize({64, 1});
auto y_data = y_tensor->mutable_data<int64_t>(cpu_place);
for (int i = 0; i < 64 * 1; ++i) {
y_data[i] = static_cast<int64_t>(1);
}
auto loss_var = scope.Var(loss_name);
float first_loss = 0.0;
float last_loss = 0.0;
for (int i = 0; i < 100; ++i) {
executor.Run(*train_program.get(), &scope, 0, false, true);
if (i == 0) {
first_loss = loss_var->Get<framework::LoDTensor>().data<float>()[0];
} else if (i == 99) {
last_loss = loss_var->Get<framework::LoDTensor>().data<float>()[0];
}
}
EXPECT_LT(last_loss, first_loss);
}
TEST(train, recognize_digits) { Train(); }
} // namespace paddle
......@@ -147,6 +147,7 @@ function cmake_gen() {
-DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR}
-DWITH_ANAKIN=${WITH_ANAKIN:-OFF}
-DPY_VERSION=${PY_VERSION:-2.7}
-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build}
========================================
EOF
# Disable UNITTEST_USE_VIRTUALENV in docker because
......@@ -178,7 +179,8 @@ EOF
-DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \
-DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} \
-DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \
-DPY_VERSION=${PY_VERSION:-2.7}
-DPY_VERSION=${PY_VERSION:-2.7} \
-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build}
}
......@@ -361,7 +363,7 @@ EOF
ctest --output-on-failure
# make install should also be test when unittest
make install -j `nproc`
pip install /usr/local/opt/paddle/share/wheels/*.whl
pip install ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
paddle version
fi
......
......@@ -89,7 +89,8 @@ def reader_creator(tar_file, file_name, dict_size):
]
for name in names:
for line in f.extractfile(name):
line_split = line.strip().split(six.b('\t'))
line = cpt.to_text(line)
line_split = line.strip().split('\t')
if len(line_split) != 2:
continue
src_seq = line_split[0] # one source sequence
......
......@@ -64,7 +64,8 @@ def __build_dict(tar_file, dict_size, save_path, lang):
word_dict = defaultdict(int)
with tarfile.open(tar_file, mode="r") as f:
for line in f.extractfile("wmt16/train"):
line_split = line.strip().split(six.b("\t"))
line = cpt.to_text(line)
line_split = line.strip().split("\t")
if len(line_split) != 2: continue
sen = line_split[0] if lang == "en" else line_split[1]
for w in sen.split():
......@@ -123,7 +124,8 @@ def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang):
with tarfile.open(tar_file, mode="r") as f:
for line in f.extractfile(file_name):
line_split = line.strip().split(six.b("\t"))
line = cpt.to_text(line)
line_split = line.strip().split("\t")
if len(line_split) != 2:
continue
src_words = line_split[src_col].split()
......
......@@ -18,6 +18,7 @@ import collections
import contextlib
import re
import six
import traceback
import numpy as np
......@@ -34,6 +35,8 @@ except ImportError as e:
except Exception as e:
raise e
from . import unique_name
import os
PADDLE_ON_MODEL_CE = os.environ.get('PADDLE_ON_MODEL_CE', None) is not None
__all__ = [
'Program',
......@@ -489,7 +492,8 @@ class OpProtoHolder(object):
return {
core.op_proto_and_checker_maker.kOpRoleAttrName(),
core.op_proto_and_checker_maker.kOpRoleVarAttrName(),
core.op_proto_and_checker_maker.kOpNameScopeAttrName()
core.op_proto_and_checker_maker.kOpNameScopeAttrName(),
core.op_proto_and_checker_maker.kOpCreationCallstackAttrName()
}
......@@ -572,6 +576,11 @@ class Operator(object):
if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0:
del op_attrs[role_var_name]
if not PADDLE_ON_MODEL_CE:
callstack_var_name = op_maker.kOpCreationCallstackAttrName()
op_attrs[callstack_var_name] = list(
reversed(traceback.format_stack()))[1:]
if len(self.desc.type()) != 0:
return
if type is None:
......
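The framework.py hunk records the Python call stack at operator-creation time under an op_callstack attribute, and skips that bookkeeping when the PADDLE_ON_MODEL_CE environment variable is set. A rough standalone sketch of the gating logic (the attribute name matches the updated test_operator expectation further below; the helper itself is illustrative):

import os
import traceback

# Mirror of the gate added in framework.py: CE model runs skip callstack recording.
PADDLE_ON_MODEL_CE = os.environ.get('PADDLE_ON_MODEL_CE', None) is not None


def attach_creation_callstack(op_attrs):
    """Store the creation call stack in op_attrs['op_callstack'], most recent frame first."""
    if not PADDLE_ON_MODEL_CE:
        # format_stack() lists frames oldest-first; reverse it and drop this helper's own frame.
        op_attrs['op_callstack'] = list(reversed(traceback.format_stack()))[1:]
    return op_attrs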
......@@ -600,7 +600,7 @@ def save_inference_model(dirname,
"""
if isinstance(feeded_var_names, six.string_types):
feeded_var_names = [feeded_var_names]
else:
elif export_for_deployment:
if len(feeded_var_names) > 0:
# TODO(paddle-dev): polish these code blocks
if not (bool(feeded_var_names) and all(
......@@ -610,61 +610,60 @@ def save_inference_model(dirname,
if isinstance(target_vars, Variable):
target_vars = [target_vars]
else:
elif export_for_deployment:
if not (bool(target_vars) and all(
isinstance(var, Variable) for var in target_vars)):
raise ValueError("'target_vars' should be a list of Variable.")
if main_program is None:
main_program = default_main_program()
copy_program = main_program.clone()
# if there is a lookup table, trainer 0 will notify all pservers to save it.
if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table:
lookup_table_filename = os.path.join(dirname, "__lookup_table__")
_save_lookup_tables_by_notify(executor, lookup_table_filename,
main_program._distributed_lookup_table,
main_program._endpoints)
if not os.path.isdir(dirname):
os.makedirs(dirname)
if model_filename is not None:
model_basename = os.path.basename(model_filename)
else:
model_basename = "__model__"
model_basename = os.path.join(dirname, model_basename)
# When export_for_deployment is True, we modify the program in place so that
# it can only be loaded directly for inference. If it is False, the whole
# original program and related metadata are saved so that future usage can
# be more flexible (a usage sketch follows this hunk).
if export_for_deployment:
global_block = copy_program.global_block()
main_program = main_program.clone()
global_block = main_program.global_block()
for i, op in enumerate(global_block.ops):
op.desc.set_is_target(False)
if op.type == "feed" or op.type == "fetch":
global_block._remove_op(i)
copy_program.desc.flush()
main_program.desc.flush()
pruned_program = copy_program._prune(targets=target_vars)
saved_program = pruned_program._inference_optimize(prune_read_op=True)
main_program = main_program._prune(targets=target_vars)
main_program = main_program._inference_optimize(prune_read_op=True)
fetch_var_names = [v.name for v in target_vars]
prepend_feed_ops(saved_program, feeded_var_names)
append_fetch_ops(saved_program, fetch_var_names)
prepend_feed_ops(main_program, feeded_var_names)
append_fetch_ops(main_program, fetch_var_names)
with open(model_basename, "wb") as f:
f.write(main_program.desc.serialize_to_string())
else:
# TODO(panyx0718): Save more information so that it can also be used
# for training and more flexible post-processing.
saved_program = copy_program
if model_filename is not None:
model_filename = os.path.basename(model_filename)
else:
model_filename = "__model__"
model_filename = os.path.join(dirname, model_filename)
with open(model_basename + ".main_program", "wb") as f:
f.write(main_program.desc.serialize_to_string())
if params_filename is not None:
params_filename = os.path.basename(params_filename)
with open(model_filename, "wb") as f:
f.write(saved_program.desc.serialize_to_string())
save_persistables(executor, dirname, saved_program, params_filename)
# if there is a lookup table, trainer 0 will notify all pservers to save it.
if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table:
lookup_table_filename = os.path.join(dirname, "__lookup_table__")
_save_lookup_tables_by_notify(executor, lookup_table_filename,
main_program._distributed_lookup_table,
main_program._endpoints)
save_persistables(executor, dirname, main_program, params_filename)
def load_inference_model(dirname,
......
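With the rewritten save_inference_model, export_for_deployment=True prunes the program down to an inference-only graph with feed/fetch ops, while export_for_deployment=False serializes the whole original program for later, more flexible use, which is exactly how the recognize_digits test further below calls it. A hedged usage sketch with a toy network (layer sizes and paths are illustrative):

import paddle.fluid as fluid

# A tiny network: an input named 'img' and a 10-way softmax prediction.
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
prediction = fluid.layers.fc(input=img, size=10, act='softmax')

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())

# Inference-only export: the program is pruned and feed/fetch ops are inserted.
fluid.io.save_inference_model(
    "recognize_digits.inference.model", ['img'], [prediction], exe)

# Full export: the original program and related metadata are kept.
fluid.io.save_inference_model(
    "recognize_digits.train.model", [], [], exe, export_for_deployment=False)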
......@@ -311,6 +311,7 @@ def _copy_reader_var_(block, var):
new_var = block.create_var(name=var.name, type=core.VarDesc.VarType.READER)
new_var.desc.set_shapes(var.desc.shapes())
new_var.desc.set_dtypes(var.desc.dtypes())
new_var.desc.set_lod_levels(var.desc.lod_levels())
new_var.persistable = True
return new_var
......@@ -632,6 +633,7 @@ def py_reader(capacity,
})
startup_var.desc.set_dtypes(dtypes)
startup_var.desc.set_lod_levels(lod_levels)
startup_var.persistable = True
main_prog_var = _copy_reader_var_(default_main_program().current_block(),
......
......@@ -6471,12 +6471,14 @@ def _elementwise_op(helper):
assert y is not None, 'y cannot be None in {}'.format(op_type)
axis = helper.kwargs.get('axis', -1)
use_mkldnn = helper.kwargs.get('use_mkldnn', False)
name = helper.kwargs.get('name', None)
if name is None:
out = helper.create_tmp_variable(dtype=x.dtype)
else:
out = helper.create_variable(
name=name, dtype=x.dtype, persistable=False)
out = helper.kwargs.get('out', None)
if out is None:
name = helper.kwargs.get('name', None)
if name is None:
out = helper.create_tmp_variable(dtype=x.dtype)
else:
out = helper.create_variable(
name=name, dtype=x.dtype, persistable=False)
helper.append_op(
type=op_type,
......@@ -6489,7 +6491,13 @@ def _elementwise_op(helper):
@templatedoc()
def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
def scale(x,
scale=1.0,
bias=0.0,
bias_after_scale=True,
out=None,
act=None,
name=None):
"""
${comment}
......@@ -6498,6 +6506,7 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
scale(${scale_type}): ${scale_comment}
bias(${bias_type}): ${bias_comment}
bias_after_scale(${bias_after_scale_type}): ${bias_after_scale_comment}
out(Tensor): The output tensor. If provided, the result is written into it; otherwise a new temporary tensor is created.
act(basestring|None): Activation applied to the output.
name(basestring|None): Name of the output.
......@@ -6506,11 +6515,12 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
"""
helper = LayerHelper('scale', **locals())
if name is None:
out = helper.create_tmp_variable(dtype=x.dtype)
else:
out = helper.create_variable(
name=name, dtype=x.dtype, persistable=False)
if out is None:
if name is None:
out = helper.create_tmp_variable(dtype=x.dtype)
else:
out = helper.create_variable(
name=name, dtype=x.dtype, persistable=False)
helper.append_op(
type='scale',
......@@ -6524,31 +6534,73 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
return helper.append_activation(out)
def elementwise_add(x, y, axis=-1, use_mkldnn=False, act=None, name=None):
def elementwise_add(x,
y,
out=None,
axis=-1,
use_mkldnn=False,
act=None,
name=None):
return _elementwise_op(LayerHelper('elementwise_add', **locals()))
def elementwise_div(x, y, axis=-1, use_mkldnn=False, act=None, name=None):
def elementwise_div(x,
y,
out=None,
axis=-1,
use_mkldnn=False,
act=None,
name=None):
return _elementwise_op(LayerHelper('elementwise_div', **locals()))
def elementwise_sub(x, y, axis=-1, use_mkldnn=False, act=None, name=None):
def elementwise_sub(x,
y,
out=None,
axis=-1,
use_mkldnn=False,
act=None,
name=None):
return _elementwise_op(LayerHelper('elementwise_sub', **locals()))
def elementwise_mul(x, y, axis=-1, use_mkldnn=False, act=None, name=None):
def elementwise_mul(x,
y,
out=None,
axis=-1,
use_mkldnn=False,
act=None,
name=None):
return _elementwise_op(LayerHelper('elementwise_mul', **locals()))
def elementwise_max(x, y, axis=-1, use_mkldnn=False, act=None, name=None):
def elementwise_max(x,
y,
out=None,
axis=-1,
use_mkldnn=False,
act=None,
name=None):
return _elementwise_op(LayerHelper('elementwise_max', **locals()))
def elementwise_min(x, y, axis=-1, use_mkldnn=False, act=None, name=None):
def elementwise_min(x,
y,
out=None,
axis=-1,
use_mkldnn=False,
act=None,
name=None):
return _elementwise_op(LayerHelper('elementwise_min', **locals()))
def elementwise_pow(x, y, axis=-1, use_mkldnn=False, act=None, name=None):
def elementwise_pow(x,
y,
out=None,
axis=-1,
use_mkldnn=False,
act=None,
name=None):
return _elementwise_op(LayerHelper('elementwise_pow', **locals()))
......@@ -6560,6 +6612,7 @@ for func in [
func.__doc__ = _generate_doc_string_(
op_proto,
additional_args_lines=[
"out (Tensor): The output tensor of elementwise op.",
"act (basestring|None): Activation applied to the output.",
"name (basestring|None): Name of the output."
])
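The nn.py changes above add an out= keyword to scale and the elementwise_* wrappers, so callers can write the result into a variable they created themselves instead of always getting a fresh temporary. A hedged sketch of both styles (variable names are illustrative):

import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[32], dtype='float32')
y = fluid.layers.data(name='y', shape=[32], dtype='float32')

# Default behaviour: a temporary output variable is created internally.
z = fluid.layers.elementwise_add(x, y)

# New out= keyword: reuse a variable supplied by the caller.
out = fluid.default_main_program().current_block().create_var(
    name='add_out', dtype='float32')
fluid.layers.elementwise_add(x, y, out=out)

# scale gained the same keyword; omitted here, so a temporary is created.
scaled = fluid.layers.scale(x, scale=2.0, bias=1.0)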
......@@ -74,28 +74,7 @@ class ParallelExecutor(object):
build_strategy=None,
num_trainers=1,
trainer_id=0,
scope=None,
**kwargs):
if len(kwargs) != 0:
err_msg = ""
for key in kwargs:
if key in dir(ExecutionStrategy):
err_msg += \
"Setting {0} by constructor is deprecated. Use " \
"strategy=ExecutionStrategy(); strategy.{0}=xxx; " \
"pe=ParallelExecutor(exec_strategy=strategy) " \
"instead.\n ".format(key)
elif key in dir(BuildStrategy):
err_msg += \
"Setting {0} by constructor is deprecated. Use " \
"strategy=BuildStrategy(); See help(" \
"paddle.fluid.ParallelExecutor.BuildStrategy) \n".format(
key)
else:
err_msg += "Setting {0} by constructor is deprecated. Use strategy.\n".format(
key)
raise ValueError(err_msg)
scope=None):
self._places = []
self._act_places = []
if use_cuda:
......
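With the **kwargs escape hatch removed, ParallelExecutor options have to be passed as ExecutionStrategy / BuildStrategy objects, which is what the deleted deprecation message already recommended. A hedged sketch of the supported style (the tiny network and the num_threads value are only for illustration):

import paddle.fluid as fluid

img = fluid.layers.data(name='img', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
prediction = fluid.layers.fc(input=img, size=10, act='softmax')
loss = fluid.layers.mean(fluid.layers.cross_entropy(input=prediction, label=label))

fluid.Executor(fluid.CUDAPlace(0)).run(fluid.default_startup_program())

exec_strategy = fluid.ExecutionStrategy()
exec_strategy.num_threads = 4          # configure options on the strategy object

build_strategy = fluid.BuildStrategy()

pe = fluid.ParallelExecutor(
    use_cuda=True,
    loss_name=loss.name,               # instead of passing options through **kwargs
    exec_strategy=exec_strategy,
    build_strategy=build_strategy)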
......@@ -185,7 +185,17 @@ class WeightNormParamAttr(ParamAttr):
Args:
dim(int): The dimension over which to compute the weight norm. Default None.
kwargs: Any field in ParamAttr. Default None.
name(str): The parameter's name. Default None.
initializer(Initializer): The method to initial this parameter. Default None.
learning_rate(float): The parameter's learning rate. The effective learning rate
used during optimization is :math:`global\_lr * parameter\_lr * scheduler\_factor`.
Default 1.0.
regularizer(WeightDecayRegularizer): Regularization factor. Default None.
trainable(bool): Whether this parameter is trainable. Default True.
gradient_clip(BaseGradientClipAttr): The method to clip this parameter's
gradient. Default None.
do_model_average(bool): Whether this parameter should do model average.
Default False.
Examples:
.. code-block:: python
......@@ -204,6 +214,21 @@ class WeightNormParamAttr(ParamAttr):
# these parameters for inference.
params_with_weight_norm = []
def __init__(self, dim=None, **kwargs):
super(WeightNormParamAttr, self).__init__(**kwargs)
def __init__(self,
dim=None,
name=None,
initializer=None,
learning_rate=1.0,
regularizer=None,
trainable=True,
gradient_clip=None,
do_model_average=False):
super(WeightNormParamAttr, self).__init__(
name=name,
initializer=initializer,
learning_rate=learning_rate,
regularizer=regularizer,
trainable=trainable,
gradient_clip=gradient_clip,
do_model_average=do_model_average)
self.dim = dim
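Since WeightNormParamAttr now lists the ParamAttr fields explicitly instead of forwarding **kwargs, the accepted keywords match the expanded docstring above. A hedged usage sketch along the lines of the docstring's own example:

import paddle.fluid as fluid

data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
fc = fluid.layers.fc(
    input=data,
    size=1000,
    param_attr=fluid.WeightNormParamAttr(
        dim=None,                      # dimension over which the weight norm is computed
        name='weight_norm_param',      # explicit keyword rather than **kwargs
        learning_rate=1.0,
        trainable=True))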
......@@ -67,6 +67,7 @@ def train(nn_type,
use_cuda,
parallel,
save_dirname=None,
save_full_dirname=None,
model_filename=None,
params_filename=None,
is_local=True):
......@@ -143,6 +144,13 @@ def train(nn_type,
exe,
model_filename=model_filename,
params_filename=params_filename)
if save_full_dirname is not None:
fluid.io.save_inference_model(
save_full_dirname, [], [],
exe,
model_filename=model_filename,
params_filename=params_filename,
export_for_deployment=False)
return
else:
print(
......@@ -214,10 +222,12 @@ def infer(use_cuda,
def main(use_cuda, parallel, nn_type, combine):
save_dirname = None
save_full_dirname = None
model_filename = None
params_filename = None
if not use_cuda and not parallel:
save_dirname = "recognize_digits_" + nn_type + ".inference.model"
save_full_dirname = "recognize_digits_" + nn_type + ".train.model"
if combine:
model_filename = "__model_combined__"
params_filename = "__params_combined__"
......@@ -228,6 +238,7 @@ def main(use_cuda, parallel, nn_type, combine):
use_cuda=use_cuda,
parallel=parallel,
save_dirname=save_dirname,
save_full_dirname=save_full_dirname,
model_filename=model_filename,
params_filename=params_filename)
infer(
......
......@@ -661,22 +661,25 @@ class TestLoadSliceVar(TranspilerTest):
class TestNCCL2Transpile(TranspilerTest):
def test_nccl2_transpile(self):
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
self.net_conf()
config = fluid.DistributeTranspilerConfig()
config.mode = "nccl2"
t = fluid.DistributeTranspiler(config=config)
t.transpile(
0,
trainers="127.0.0.1:6174,127.0.0.1:6175",
current_endpoint="127.0.0.1:6174",
startup_program=startup)
print([op.type for op in startup.global_block().ops])
self.assertEqual(startup.global_block().ops[-1].type, "gen_nccl_id")
self.assertIsNotNone(startup.global_block().vars.get("NCCLID"))
if fluid.core.is_compiled_with_cuda():  # test nccl2 only with CUDA
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
self.net_conf()
config = fluid.DistributeTranspilerConfig()
config.mode = "nccl2"
t = fluid.DistributeTranspiler(config=config)
t.transpile(
0,
trainers="127.0.0.1:6174,127.0.0.1:6175",
current_endpoint="127.0.0.1:6174",
startup_program=startup)
print([op.type for op in startup.global_block().ops])
self.assertEqual(startup.global_block().ops[-1].type, "gen_nccl_id")
self.assertIsNotNone(startup.global_block().vars.get("NCCLID"))
else:
pass
if __name__ == "__main__":
......
......@@ -69,7 +69,7 @@ class TestOperator(unittest.TestCase):
set(mul_op.attr_names),
set([
"x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var",
"op_namescope"
"op_namescope", "op_callstack"
]))
self.assertEqual(mul_op.has_attr("x_num_col_dims"), True)
self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT)
......