Commit b081363b authored by chenweihang

Merge branch 'sequence_enumerate_op' of https://github.com/chenwhql/Paddle into sequence_enumerate_op
...@@ -52,9 +52,8 @@ ExternalProject_Add(
    extern_anakin
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DEPENDS ${MKLML_PROJECT}
-   # Anakin codes error on Intel(R) Xeon(R) Gold 5117 CPU, temporary do not compile avx512 related code.
-   GIT_REPOSITORY "https://github.com/luotao1/Anakin"
-   GIT_TAG "211d1fc5d813d70c0c14072f9083cf25f40940ea"
+   GIT_REPOSITORY "https://github.com/PaddlePaddle/Anakin"
+   GIT_TAG "9424277cf9ae180a14aff09560d3cd60a49c76d2"
    PREFIX ${ANAKIN_SOURCE_DIR}
    UPDATE_COMMAND ""
    CMAKE_ARGS -DUSE_GPU_PLACE=YES
......
...@@ -113,6 +113,7 @@ paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size ...@@ -113,6 +113,7 @@ paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size
paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None)) paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None))
paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None)) paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None))
paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
paddle.fluid.layers.reduce_mean ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) paddle.fluid.layers.reduce_mean ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
...@@ -146,6 +147,7 @@ paddle.fluid.layers.reshape ArgSpec(args=['x', 'shape', 'actual_shape', 'act', ' ...@@ -146,6 +147,7 @@ paddle.fluid.layers.reshape ArgSpec(args=['x', 'shape', 'actual_shape', 'act', '
paddle.fluid.layers.lod_reset ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.lod_reset ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.lrn ArgSpec(args=['input', 'n', 'k', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(5, 1.0, 0.0001, 0.75, None)) paddle.fluid.layers.lrn ArgSpec(args=['input', 'n', 'k', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(5, 1.0, 0.0001, 0.75, None))
paddle.fluid.layers.pad ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)) paddle.fluid.layers.pad ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None))
paddle.fluid.layers.pad_constant_like ArgSpec(args=['x', 'y', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None))
paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None)) paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None))
paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)) paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0))
paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,)) paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,))
...@@ -165,6 +167,7 @@ paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, ke ...@@ -165,6 +167,7 @@ paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, ke
paddle.fluid.layers.sequence_enumerate ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_enumerate ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sequence_mask ArgSpec(args=['x', 'maxlen', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 'int64', None)) paddle.fluid.layers.sequence_mask ArgSpec(args=['x', 'maxlen', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 'int64', None))
paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)) paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True)) paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
...@@ -297,6 +300,7 @@ paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', ' ...@@ -297,6 +300,7 @@ paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', '
paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral')) paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral'))
paddle.fluid.layers.rpn_target_assign ArgSpec(args=['loc', 'scores', 'anchor_box', 'gt_box', 'rpn_batch_size_per_im', 'fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap'], varargs=None, keywords=None, defaults=(256, 0.25, 0.7, 0.3)) paddle.fluid.layers.rpn_target_assign ArgSpec(args=['loc', 'scores', 'anchor_box', 'gt_box', 'rpn_batch_size_per_im', 'fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap'], varargs=None, keywords=None, defaults=(256, 0.25, 0.7, 0.3))
paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None)) paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None))
paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None))
paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
...@@ -379,7 +383,7 @@ paddle.fluid.LoDTensor.__init__ 1. __init__(self: paddle.fluid.core.LoDTensor, a ...@@ -379,7 +383,7 @@ paddle.fluid.LoDTensor.__init__ 1. __init__(self: paddle.fluid.core.LoDTensor, a
paddle.fluid.LoDTensor.has_valid_recursive_sequence_lengths has_valid_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> bool paddle.fluid.LoDTensor.has_valid_recursive_sequence_lengths has_valid_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> bool
paddle.fluid.LoDTensor.lod lod(self: paddle.fluid.core.LoDTensor) -> List[List[int]] paddle.fluid.LoDTensor.lod lod(self: paddle.fluid.core.LoDTensor) -> List[List[int]]
paddle.fluid.LoDTensor.recursive_sequence_lengths recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> List[List[int]] paddle.fluid.LoDTensor.recursive_sequence_lengths recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> List[List[int]]
paddle.fluid.LoDTensor.set 1. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CPUPlace) -> None 2. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CPUPlace) -> None 3. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CPUPlace) -> None 4. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CPUPlace) -> None 5. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CPUPlace) -> None 6. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CPUPlace) -> None 7. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CPUPlace) -> None 8. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPlace) -> None 9. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPlace) -> None 10. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPlace) -> None 11. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPlace) -> None 12. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPlace) -> None 13. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPlace) -> None 14. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPlace) -> None 15. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPinnedPlace) -> None 16. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPinnedPlace) -> None 17. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPinnedPlace) -> None 18. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPinnedPlace) -> None 19. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPinnedPlace) -> None 20. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPinnedPlace) -> None 21. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPinnedPlace) -> None paddle.fluid.LoDTensor.set 1. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CPUPlace) -> None 2. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CPUPlace) -> None 3. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CPUPlace) -> None 4. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CPUPlace) -> None 5. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CPUPlace) -> None 6. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CPUPlace) -> None 7. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CPUPlace) -> None 8. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CPUPlace) -> None 9. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPlace) -> None 10. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPlace) -> None 11. 
set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPlace) -> None 12. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPlace) -> None 13. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPlace) -> None 14. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPlace) -> None 15. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPlace) -> None 16. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CUDAPlace) -> None 17. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPinnedPlace) -> None 18. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPinnedPlace) -> None 19. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPinnedPlace) -> None 20. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPinnedPlace) -> None 21. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPinnedPlace) -> None 22. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPinnedPlace) -> None 23. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPinnedPlace) -> None 24. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CUDAPinnedPlace) -> None
paddle.fluid.LoDTensor.set_lod set_lod(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None paddle.fluid.LoDTensor.set_lod set_lod(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None
paddle.fluid.LoDTensor.set_recursive_sequence_lengths set_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None paddle.fluid.LoDTensor.set_recursive_sequence_lengths set_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None
paddle.fluid.LoDTensor.shape shape(self: paddle.fluid.core.Tensor) -> List[int] paddle.fluid.LoDTensor.shape shape(self: paddle.fluid.core.Tensor) -> List[int]
......
...@@ -107,11 +107,11 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
if(WITH_DISTRIBUTE)
-  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr)
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass)
  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
  set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
else()
-  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method)
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
endif()
if (NOT WIN32)
......
...@@ -64,6 +64,7 @@ static DataTypeMap* InitDataTypeMap() {
  RegType(size_t, proto::VarType::SIZE_T);
  RegType(int16_t, proto::VarType::INT16);
  RegType(uint8_t, proto::VarType::UINT8);
+ RegType(int8_t, proto::VarType::INT8);
#undef RegType
  return retv;
......
...@@ -54,6 +54,9 @@ inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
    case proto::VarType::INT16:
      visitor.template operator()<int16_t>();
      break;
+   case proto::VarType::INT8:
+     visitor.template operator()<int8_t>();
+     break;
    default:
      PADDLE_THROW("Not supported %d", type);
  }
......
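The hunk above extends VisitDataType's switch with an int8_t branch. A minimal, standalone sketch of the same visitor-dispatch pattern (the enum and visitor below are made-up stand-ins for illustration, not Paddle code) shows how a templated functor gets instantiated for a runtime type tag:

```cpp
#include <cstdint>
#include <cstdio>

// Made-up stand-ins for proto::VarType::Type and a Paddle visitor; they only
// illustrate the dispatch that VisitDataType performs in the hunk above.
enum class VarType { INT16, UINT8, INT8 };

struct PrintSizeVisitor {
  template <typename T>
  void operator()() const {
    std::printf("element size = %zu bytes\n", sizeof(T));
  }
};

template <typename Visitor>
void VisitDataType(VarType type, Visitor visitor) {
  switch (type) {
    case VarType::INT16: visitor.template operator()<int16_t>(); break;
    case VarType::UINT8: visitor.template operator()<uint8_t>(); break;
    case VarType::INT8:  visitor.template operator()<int8_t>();  break;  // newly added branch
  }
}

int main() {
  VisitDataType(VarType::INT8, PrintSizeVisitor{});  // prints "element size = 1 bytes"
  return 0;
}
```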
...@@ -754,17 +754,26 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
                 node->Op()->Type());
  CreateComputationalOp(result, node, op_dev_id);
-  if (node->Op()->Type() == "concat") {
-    ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(),
-              "fetch_barrier");
+}
+
+void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) {
+  auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
+  for (ir::Node *input : node->inputs) {
+    VarHandle *var = nullptr;
+    for (int place_offset = 0; place_offset < num_places; ++place_offset) {
+      auto &var_holders = result->Get<GraphVars>(kGraphVars)[place_offset];
+      auto &var_holder = var_holders[input->Name()];
+      if (!var_holder.empty()) {
+        var = var_holder.rbegin()->get();
+        op_handle->AddInput(var);
+      }
+    }
  }
}

// Create RPC related op handles that connects its in ops and out ops.
void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
                                          ir::Node *node) const {
-  // FIXME(typhoonzero): Cleanup this deps for both sync mode and async mode
-  // put them into transpiler.
  int op_dev_id = -1;
  if (node->Op()->Type() == "send") {
    // TODO(paddle-dev): getting the first var is not safe.
...@@ -799,8 +808,6 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
    }
    auto recv_param_grad = boost::get<std::vector<std::string>>(
        node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
-    // FIXME(typhoonzero): assume each recv op output one param
-    // Use the same place as send.
    if (recv_param_grad.size() == 2U) {
      op_dev_id = GetVarDeviceID(*result, recv_param_grad[1]);
      VLOG(10) << "recv param " << recv_param_grad[0]
...@@ -814,34 +821,44 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
          .emplace(varname, op_dev_id);
    }
  } else {
-    // send_barrier and fetch_barrier op can be scheduled on device 0
+    // send_barrier, fetch_barrier will run on place 0;
    op_dev_id = 0;
  }

  PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s",
                 node->Op()->Type());

  result->Get<GraphOps>(kGraphOps).emplace_back(new RPCOpHandle(
      result->CreateOpNode(node->Op()), *node->Op(), local_scopes_[op_dev_id],
      node->Op()->Type(), places_[op_dev_id]));

-  // TODO(panyx0718): This might not be needed anymore.
-  if (node->Op()->Type() == "send_barrier") {
-    ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(), "send");
-  } else if (node->Op()->Type() == "recv") {
-    ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(),
-              "send_barrier");
-  } else if (node->Op()->Type() == "fetch_barrier") {
-    ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(), "recv");
-  } else if (node->Op()->Type() == "send") {
-    // do nothing
+  if (node->Op()->Type() == "send") {
+    CreateOpHandleIOs(result, node, op_dev_id);
  } else {
-    PADDLE_THROW(
-        "rpc op should be in ["
-        "send, send_barrier. recv, fetch_barrier]");
-  }
-
-  CreateOpHandleIOs(result, node, op_dev_id);
+    // send_barrier, recv, fetch_barrier's inputs are deps var, get them from
+    // all places
+    auto p = places_[op_dev_id];
+    auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
+    op_handle->SetDeviceContext(p,
+                                platform::DeviceContextPool::Instance().Get(p));
+
+    SetOpInputsAllPlaces(result, node, places_.size());
+    for (ir::Node *output : node->outputs) {
+      int outvar_dev_id = op_dev_id;
+      if (node->Op()->Type() == "fetch_barrier") {
+        outvar_dev_id = GetVarDeviceID(*result, output->Name());
+        PADDLE_ENFORCE_NE(outvar_dev_id, -1);
+      }
+      p = places_[outvar_dev_id];
+      ir::Node *new_node = nullptr;
+      if (output->Var()) {
+        new_node = result->CreateVarNode(output->Var());
+      } else {
+        new_node =
+            result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable);
+      }
+      CreateOpOutput(result, op_handle, new_node, p, outvar_dev_id);
+    }
+  }
}

bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const {
......
...@@ -107,6 +107,7 @@ message VarType {
    // Tensor<size_t> is used in C++.
    SIZE_T = 19;
    UINT8 = 20;
+   INT8 = 21;

    // Other types that may need additional descriptions
    LOD_TENSOR = 7;
......
...@@ -3,14 +3,18 @@ cc_library(graph SRCS graph.cc DEPS node)
cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper)
+cc_library(graph_to_program_pass SRCS graph_to_program_pass.cc DEPS graph pass graph_helper)
cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
-cc_library(graph_pattern_detecter SRCS graph_pattern_detecter.cc DEPS graph graph_helper graph_traits)
-cc_library(fc_fuse_pass SRCS fc_fuse_pass.cc DEPS graph graph_pattern_detecter)
+cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits)
+cc_library(fc_fuse_pass SRCS fc_fuse_pass.cc DEPS graph graph_pattern_detector)
+cc_library(attention_lstm_fuse_pass SRCS attention_lstm_fuse_pass.cc DEPS graph graph_pattern_detector)
cc_library(infer_clean_graph_pass SRCS infer_clean_graph_pass.cc DEPS graph pass)
+cc_library(fc_lstm_fuse_pass SRCS fc_lstm_fuse_pass.cc DEPS graph graph_pattern_detector)
+cc_library(seq_concat_fc_fuse_pass SRCS seq_concat_fc_fuse_pass.cc DEPS graph graph_pattern_detector)

cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
-cc_test(test_graph_pattern_detecter SRCS graph_pattern_detecter_tester.cc DEPS graph_pattern_detecter)
-cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass graph_pattern_detecter graph pass graph_traits framework_proto)
+cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
+cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
+cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass graph_pattern_detector graph pass graph_traits framework_proto)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/attention_lstm_fuse_pass.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/api/helper.h"
namespace paddle {
namespace framework {
namespace ir {
struct Param {
std::string X = "concat_0.tmp_0";
std::string C0 = "cell_init";
std::string H0 = "hidden_init";
std::string AttentionWeight = "attention_fc.w_0";
std::string AttentionBias = "attention_fc.b_0";
std::string AttentionScalar = "attention_output.w_0";
std::string AttentionScalarBias = "attention_output.b_0";
std::string LSTMWeight = "attention_w.new";
std::string LSTMBias = "attention_b.new";
std::string Hidden = "array_to_lod_tensor_0.tmp_0";
std::string Cell = "at.cell.new";
std::string AttentionedX = "at.x.new";
std::string AttentionFCOut = "at.fc.new";
std::string LSTMX = "at.lstmx.new";
std::string LSTMOUT = "at.lstmout.new";
};
void PrepareParameters(Graph* graph, const Param& param);
void FindWhileOp(Graph* graph) {
GraphPatternDetector gpd;
std::unordered_set<int> fused_external_ops(
{35, 36, 37, 38, 43, 44, 49, 45, 46, 47, 41, 42, 53, 54, 48,
57, 55, 56, 52, 74, 80, 77, 78, 79, 50, 77, 39, 40, 51});
gpd.mutable_pattern()->NewNode(
[&](Node* n) { return fused_external_ops.count(n->id()); }, "while");
if (!graph->Has(kGraphvizMarkedNodeAttr)) {
graph->Set(kGraphvizMarkedNodeAttr, new GraphVizPass::marked_nodes_t);
}
auto& marked_nodes =
graph->Get<GraphVizPass::marked_nodes_t>(kGraphvizMarkedNodeAttr);
auto handle = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
auto* while_pat_node = gpd.pattern().RetriveNode("while");
auto* while_node = subgraph.at(while_pat_node);
marked_nodes.insert(while_node);
};
gpd(graph, handle);
Param param;
// Add AttentionLSTM node
OpDesc op_desc;
op_desc.SetType("attention_lstm");
#define OP_SET_IN(x) op_desc.SetInput(#x, {param.x});
#define OP_SET_OUT(x) op_desc.SetOutput(#x, {param.x});
OP_SET_IN(X);
OP_SET_IN(C0);
OP_SET_IN(H0);
OP_SET_IN(AttentionWeight);
OP_SET_IN(AttentionBias);
OP_SET_IN(AttentionScalar);
OP_SET_IN(AttentionScalarBias);
OP_SET_IN(LSTMWeight);
OP_SET_IN(LSTMBias);
OP_SET_OUT(Hidden);
OP_SET_OUT(Cell);
OP_SET_OUT(AttentionedX);
OP_SET_OUT(AttentionFCOut);
OP_SET_OUT(LSTMX);
OP_SET_OUT(LSTMOUT);
#undef OP_SET_IN
#undef OP_SET_OUT
auto* X = graph->RetriveNode(34);
auto* LSTMOUT = graph->RetriveNode(81);
auto* cell_init = graph->RetriveNode(6);
auto* hidden_init = graph->RetriveNode(8);
#define LINK_TO(node0, node1) \
node0->outputs.push_back(node1); \
node1->inputs.push_back(node0);
auto* lstm_op = graph->CreateOpNode(&op_desc);
PrepareParameters(graph, param);
LINK_TO(X, lstm_op);
LINK_TO(cell_init, lstm_op);
LINK_TO(hidden_init, lstm_op);
LINK_TO(lstm_op, LSTMOUT);
GraphSafeRemoveNodes(graph, marked_nodes);
}
#define CHECK_P1(x) PADDLE_ENFORCE_NOT_NULL(x);
#define CHECK_P2(x0, x1) \
CHECK_P1(x0); \
CHECK_P1(x1);
#define CHECK_P3(x0, x1, x2) \
CHECK_P2(x0, x1); \
CHECK_P1(x2);
#define CHECK_P4(x0, x1, x2, x3) \
CHECK_P3(x0, x1, x2); \
CHECK_P1(x3);
#define CHECK_P5(x0, x1, x2, x3, x4) \
CHECK_P4(x0, x1, x2, x3); \
CHECK_P1(x4);
void PrepareLSTMWeight(const LoDTensor& W_forget_w0,
const LoDTensor& W_forget_w1,
const LoDTensor& W_input_w0, const LoDTensor& W_input_w1,
const LoDTensor& W_output_w0,
const LoDTensor& W_output_w1, const LoDTensor& W_cell_w0,
const LoDTensor& W_cell_w1, LoDTensor* out);
void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input,
const LoDTensor& B_output, const LoDTensor& B_cell,
LoDTensor* out);
void PrepareParameters(Graph* graph, const Param& param) {
// Check parameters
PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
auto* scope = graph->Get<Scope*>(kParamScopeAttr);
// Create new parameters.
scope->Var(param.LSTMWeight)->GetMutable<LoDTensor>();
scope->Var(param.LSTMBias)->GetMutable<LoDTensor>();
scope->Var(param.Hidden)->GetMutable<LoDTensor>();
scope->Var(param.Cell)->GetMutable<LoDTensor>();
scope->Var(param.AttentionedX)->GetMutable<LoDTensor>();
scope->Var(param.AttentionFCOut)->GetMutable<LoDTensor>();
scope->Var(param.LSTMX)->GetMutable<LoDTensor>();
scope->Var(param.LSTMOUT)->GetMutable<LoDTensor>();
#define GATE_W(name__) \
auto* W_##name__##_w0 = scope->FindVar(#name__ ".w_0"); \
auto* W_##name__##_w1 = scope->FindVar(#name__ ".w_1"); \
auto* W_##name__##_b0 = scope->FindVar(#name__ ".b_0"); \
CHECK_P3(W_##name__##_w0, W_##name__##_w1, W_##name__##_b0); \
VLOG(4) << #name__ "_w0" \
<< " shape: " << W_##name__##_w0->Get<LoDTensor>().dims(); \
VLOG(4) << #name__ "_w1" \
<< " shape: " << W_##name__##_w1->Get<LoDTensor>().dims(); \
VLOG(4) << #name__ "_b0" \
<< " shape: " << W_##name__##_b0->Get<LoDTensor>().dims(); \
auto& W_##name__##_w0_t = W_##name__##_w0->Get<LoDTensor>(); \
auto& W_##name__##_w1_t = W_##name__##_w1->Get<LoDTensor>(); \
auto& W_##name__##_b0_t = W_##name__##_b0->Get<LoDTensor>();
GATE_W(forget);
GATE_W(input);
GATE_W(output);
GATE_W(c);
#undef GATE_W
auto* attention_fc_w = scope->FindVar("attention_fc.w_0");
auto* attention_fc_b = scope->FindVar("attention_fc.b_0");
auto* attention_output_w = scope->FindVar("attention_output.w_0");
auto* attention_output_b = scope->FindVar("attention_output.b_0");
CHECK_P4(attention_fc_w, attention_fc_b, attention_output_w,
attention_output_b);
auto* lstm_weight = scope->Var(param.LSTMWeight);
auto* lstm_weight_t = lstm_weight->GetMutable<LoDTensor>();
auto* lstm_bias = scope->Var(param.LSTMBias);
auto* lstm_bias_t = lstm_bias->GetMutable<LoDTensor>();
// reshape attention_bias
auto* attention_bias_t =
scope->FindVar(param.AttentionBias)->GetMutable<LoDTensor>();
PADDLE_ENFORCE_EQ(attention_bias_t->dims().size(), 1);
attention_bias_t->Resize(make_ddim({1, attention_bias_t->dims()[0]}));
auto* attention_scalar_bias_t =
scope->FindVar(param.AttentionScalarBias)->GetMutable<LoDTensor>();
attention_scalar_bias_t->Resize(
make_ddim({1, attention_scalar_bias_t->dims()[0]}));
PrepareLSTMWeight(W_forget_w0_t, W_forget_w1_t, W_input_w0_t, W_input_w1_t,
W_output_w0_t, W_output_w1_t, W_c_w0_t, W_c_w1_t,
lstm_weight_t);
PrepareLSTMBias(W_forget_b0_t, W_input_b0_t, W_output_b0_t, W_c_b0_t,
lstm_bias_t);
}
// Prepare parameters
void PrepareLSTMWeight(const LoDTensor& W_forget_w0,
const LoDTensor& W_forget_w1,
const LoDTensor& W_input_w0, const LoDTensor& W_input_w1,
const LoDTensor& W_output_w0,
const LoDTensor& W_output_w1, const LoDTensor& W_cell_w0,
const LoDTensor& W_cell_w1, LoDTensor* out) {
int D = W_forget_w0.dims()[0];
int M = W_forget_w1.dims()[0];
out->Resize(make_ddim({D + M, 4 * D}));
VLOG(3) << "LSTMWeight resized to " << out->dims();
float* out_data = out->mutable_data<float>(platform::CPUPlace());
std::array<const float*, 4> tensors(
{W_forget_w0.data<float>(), W_input_w0.data<float>(),
W_output_w0.data<float>(), W_cell_w0.data<float>()});
std::array<const float*, 4> tensors1(
{W_forget_w1.data<float>(), W_input_w1.data<float>(),
W_output_w1.data<float>(), W_cell_w1.data<float>()});
for (int row = 0; row < D; row++) {
for (int col = 0; col < 4; col++) {
float* dst = out_data + 4 * D * row + D * col;
const float* src = tensors[col] + D * row;
memcpy(dst, src, D * sizeof(float));
}
}
for (int row = 0; row < M; row++) {
for (int col = 0; col < 4; col++) {
float* dst = out_data + 4 * D * (D + row) + D * col;
const float* src = tensors1[col] + D * row;
memcpy(dst, src, D * sizeof(float));
}
}
}
void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input,
const LoDTensor& B_output, const LoDTensor& B_cell,
LoDTensor* out) {
std::array<const float*, 4> tensors(
{B_forget.data<float>(), B_input.data<float>(), B_output.data<float>(),
B_cell.data<float>()});
PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1);
int D = B_forget.dims()[0];
out->Resize(make_ddim({1, 4 * D}));
auto* out_data = out->mutable_data<float>(platform::CPUPlace());
for (size_t i = 0; i < tensors.size(); i++) {
memcpy(out_data + D * i, tensors[i], D * sizeof(float));
}
}
// Parameters
std::unique_ptr<ir::Graph> AttentionLSTMFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
PDPattern external_pattern, subblock_pattern;
FindWhileOp(graph.get());
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(attention_lstm_fuse_pass,
paddle::framework::ir::AttentionLSTMFusePass);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
...@@ -12,12 +12,19 @@
// See the License for the specific language governing permissions and
// limitations under the License.

-#include "paddle/fluid/inference/analysis/dot.h"
+#pragma once
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"

namespace paddle {
-namespace inference {
-namespace analysis {
-size_t Dot::counter = 0;
-}  // namespace analysis
-}  // namespace inference
+namespace framework {
+namespace ir {
+
+class AttentionLSTMFusePass : public FusePassBase {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+};
+
+}  // namespace ir
+}  // namespace framework
}  // namespace paddle
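Once registered with REGISTER_PASS, a pass like the one declared above is normally looked up and applied through the pass registry. A hedged sketch of a call site follows; PassRegistry and Pass::Apply are assumed from paddle/fluid/framework/ir/pass.h and are not part of this diff:

```cpp
#include "paddle/fluid/framework/ir/pass.h"

// Hypothetical call site; the registry lookup and Apply() signature are
// assumptions based on pass.h, not code contained in this commit.
std::unique_ptr<paddle::framework::ir::Graph> RunAttentionLSTMFuse(
    std::unique_ptr<paddle::framework::ir::Graph> graph) {
  auto pass = paddle::framework::ir::PassRegistry::Instance().Get(
      "attention_lstm_fuse_pass");
  return pass->Apply(std::move(graph));
}
```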
...@@ -100,12 +100,10 @@ void BuildFCPattern(PDPattern* pattern) {
      },
      "elementwise_add_out");

-  pattern->AddEdge(mul_parameter_var, mul_op);
-  pattern->AddEdge(mul_tmp_input_var, mul_op);
-  pattern->AddEdge(mul_op, mul_out_var);
-  pattern->AddEdge(mul_out_var, elementwise_add_op);
-  pattern->AddEdge(elementwise_add_tmp_var, elementwise_add_op);
-  pattern->AddEdge(elementwise_add_op, elementwise_add_out_var);
+  mul_op->LinksFrom({mul_parameter_var, mul_tmp_input_var})
+      .LinksTo({mul_out_var});
+  elementwise_add_op->LinksFrom({mul_out_var, elementwise_add_tmp_var})
+      .LinksTo({elementwise_add_out_var});
}

// Replace the node `from` in the links to `to`
...@@ -125,7 +123,7 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
  std::unordered_set<Node*> nodes2delete;

-  GraphPatternDetecter gpd;
+  GraphPatternDetector gpd;
  BuildFCPattern(gpd.mutable_pattern());

#define GET_NODE(id) \
...@@ -134,7 +132,7 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
  auto* id = subgraph.at(gpd.pattern().RetriveNode(#id)); \
  PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id);

-  auto handler = [&](const GraphPatternDetecter::subgraph_t& subgraph,
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
    VLOG(4) << "handle FC fuse";
    // Currently, there is no FC op available, so I will just simulate the
......
...@@ -13,7 +13,7 @@
// limitations under the License.

#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detecter.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass.h"

namespace paddle {
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h"
namespace paddle {
namespace framework {
namespace ir {
std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
GraphPatternDetector gpd;
auto* pattern = gpd.mutable_pattern();
std::unordered_set<int> fused_ops({// first lstm
13, 15, 16,
// second lstm
23, 25, 26});
pattern->NewNode([&](Node* x) { return fused_ops.count(x->id()); },
"any_node");
std::unordered_set<Node*> marked_nodes;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
auto* id = subgraph.at(gpd.pattern().RetriveNode("any_node"));
marked_nodes.insert(id);
};
gpd(graph.get(), handler);
// Create New OpDesc
auto lstm_creator = [&](int lstm, int input, int weight_x, int weight_h,
int bias, int hidden, int cell, int xx) {
#define GET_NODE(x) auto* x##_n = graph->RetriveNode(x);
GET_NODE(input);
GET_NODE(weight_x);
GET_NODE(weight_h);
GET_NODE(bias);
GET_NODE(hidden);
GET_NODE(cell);
GET_NODE(xx);
GET_NODE(lstm);
OpDesc op_desc;
op_desc.SetType("fusion_lstm");
#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__##_n->Name()});
SET_IN(X, input);
SET_IN(WeightX, weight_x);
SET_IN(WeightH, weight_h);
SET_IN(Bias, bias);
#undef GET_NODE
#undef SET_IN
LOG(INFO) << "hidden_n: " << hidden_n->Name();
LOG(INFO) << "cell: " << cell_n->Name();
LOG(INFO) << "xx: " << xx_n->Name();
op_desc.SetInput("H0", {});
op_desc.SetInput("C0", {});
op_desc.SetOutput("Hidden", {hidden_n->Name()});
op_desc.SetOutput("Cell", {cell_n->Name()});
op_desc.SetOutput("XX", {xx_n->Name()});
op_desc.SetOutput("BatchedGate", {"blstm_0.tmp_2"});
op_desc.SetOutput("BatchCellPreAct", {"blstm_1.tmp_2"});
op_desc.SetAttr("is_reverse", lstm_n->Op()->GetAttr("is_reverse"));
op_desc.SetAttr("use_peepholes", false);
auto* op = graph->CreateOpNode(&op_desc);
#define LINK_TO(a, b) \
a->outputs.push_back(b); \
b->inputs.push_back(a);
LINK_TO(input_n, op);
LINK_TO(weight_x_n, op);
LINK_TO(weight_h_n, op);
LINK_TO(bias_n, op);
LINK_TO(op, hidden_n);
#undef LINK_TO
return op;
};
lstm_creator(16, 12, 14, 18, 17, 22, 21, 19);
lstm_creator(26, 12, 24, 28, 27, 32, 31, 29);
// remove all the nodes
for (auto* node : marked_nodes) {
graph->RemoveNode(const_cast<Node*>(node));
}
for (auto* node : graph->Nodes()) {
for (auto it = node->inputs.begin(); it != node->inputs.end();) {
if (marked_nodes.count(*it)) {
it = const_cast<Node*>(node)->inputs.erase(it);
} else
it++;
}
for (auto it = node->outputs.begin(); it != node->outputs.end();) {
if (marked_nodes.count(*it)) {
it = const_cast<Node*>(node)->outputs.erase(it);
} else
it++;
}
}
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(fc_lstm_fuse_pass, paddle::framework::ir::FCLstmFusePass);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
class FCLstmFusePass : public Pass {
public:
virtual ~FCLstmFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
namespace ir {
static const char kParamScopeAttr[] = "param_scope";
class FusePassBase : public Pass {
public:
void Init(Graph* graph) const { graph_ = graph; }
Scope* param_scope() const {
PADDLE_ENFORCE(graph_->Has(kParamScopeAttr));
return graph_->Get<framework::Scope*>(kParamScopeAttr);
}
virtual ~FusePassBase() {}
protected:
mutable Graph* graph_;
};
} // namespace ir
} // namespace framework
} // namespace paddle
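FusePassBase mainly stashes the graph (Init) and exposes the parameter Scope that a fuse pass needs when it rewrites weights, as attention_lstm_fuse_pass does above. A hypothetical derived pass, only to illustrate the contract; this pass does not exist in the tree:

```cpp
#include "paddle/fluid/framework/ir/fuse_pass_base.h"

namespace paddle {
namespace framework {
namespace ir {

// Hypothetical no-op pass built on FusePassBase, for illustration only.
class NoopFusePass : public FusePassBase {
 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(
      std::unique_ptr<ir::Graph> graph) const override {
    Init(graph.get());             // remember the graph for the base class
    Scope* scope = param_scope();  // requires kParamScopeAttr to be set on the graph
    VLOG(3) << "graph has " << graph->Nodes().size()
            << " nodes; parameter scope at " << scope;
    return graph;                  // no rewriting in this sketch
  }
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(noop_fuse_pass, paddle::framework::ir::NoopFusePass);
```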
...@@ -132,63 +132,6 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
    }
  }

-  std::vector<ir::Node *> send_ops;
-  ir::Node *send_bar = nullptr;
-  std::vector<ir::Node *> recv_ops;
-  ir::Node *fetch_bar = nullptr;
-  for (ir::Node *node : Nodes()) {
-    if (node->Name() == "send") {
-      send_ops.push_back(node);
-    } else if (node->Name() == "send_barrier") {
-      PADDLE_ENFORCE(!send_bar, "only has one send barrier");
-      send_bar = node;
-    } else if (node->Name() == "recv") {
-      recv_ops.push_back(node);
-    } else if (node->Name() == "fetch_barrier") {
-      PADDLE_ENFORCE(!fetch_bar, "only has one fetch barrier");
-      fetch_bar = node;
-    }
-  }
-
-  if (send_bar) {
-    for (ir::Node *send : send_ops) {
-      ir::Node *dep_var = CreateControlDepVar();
-      send->outputs.push_back(dep_var);
-      dep_var->inputs.push_back(send);
-      send_bar->inputs.push_back(dep_var);
-      dep_var->outputs.push_back(send_bar);
-    }
-    for (ir::Node *recv : recv_ops) {
-      ir::Node *dep_var = CreateControlDepVar();
-      recv->inputs.push_back(dep_var);
-      dep_var->outputs.push_back(recv);
-      send_bar->outputs.push_back(dep_var);
-      dep_var->inputs.push_back(send_bar);
-    }
-  }
-
-  if (fetch_bar) {
-    for (ir::Node *recv : recv_ops) {
-      ir::Node *dep_var = CreateControlDepVar();
-      recv->outputs.push_back(dep_var);
-      dep_var->inputs.push_back(recv);
-      fetch_bar->inputs.push_back(dep_var);
-      dep_var->outputs.push_back(fetch_bar);
-    }
-  }
-
-  std::vector<std::string> send_vars = FindDistTrainSendVars(send_ops);
-  std::vector<std::string> recv_vars = FindDistTrainRecvVars(recv_ops);
-  for (ir::Node *node : Nodes()) {
-    if (IsDistTrainOp(node, send_vars, recv_vars)) {
-      if (fetch_bar && node->Name() == "concat") {
-        ir::Node *dep_var = CreateControlDepVar();
-        fetch_bar->outputs.push_back(dep_var);
-        dep_var->inputs.push_back(fetch_bar);
-        node->inputs.push_back(dep_var);
-        dep_var->outputs.push_back(node);
-      }
-    }
-  }
-
  /**
   * We should handle write after read(WAR) and write after write(WAW) here.
   * Because some of the operators of the program can be executed parallelly.
......
...@@ -99,13 +99,13 @@ class Graph {
  // Create a normal variable with non-null VarDesc.
  ir::Node *CreateVarNode(VarDesc *var_desc) {
    PADDLE_ENFORCE(var_desc);
-    return AddNode(new ir::Node(var_desc));
+    return AddNode(new ir::Node(var_desc, node_count_++));
  }

  // Create a normal runnable operator with OpDesc.
  ir::Node *CreateOpNode(OpDesc *op_desc) {
    PADDLE_ENFORCE(op_desc);
-    return AddNode(new ir::Node(op_desc));
+    return AddNode(new ir::Node(op_desc, node_count_++));
  }

  // Create a control dependency var that connects 2 operations. The
...@@ -115,13 +115,14 @@ class Graph {
    // TODO(panyx0718): control var name should be really unique.
    const std::string name = string::Sprintf(
        "%s@%llu", ir::Node::kControlDepVarName, node_set_.size());
-    return AddNode(new ir::Node(name, ir::Node::Type::kVariable));
+    return AddNode(
+        new ir::Node(name, ir::Node::Type::kVariable, node_count_++));
  }

  // A more free style way of creating a graph node. Mostly use for test
  // or "copy" from another node. Avoid using it if possible.
  ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type) {
-    return AddNode(new ir::Node(name, type));
+    return AddNode(new ir::Node(name, type, node_count_++));
  }

  // Clear all node information of the graph and return the ownership of the
...@@ -142,12 +143,20 @@ class Graph {
    nodes_.erase(node);
  }

+  Node *RetriveNode(int id) {
+    auto it = id2node_.find(id);
+    if (it != id2node_.end()) return it->second;
+    return nullptr;
+  }
+
 private:
  // This method takes ownership of `node`.
  ir::Node *AddNode(ir::Node *node) {
    PADDLE_ENFORCE(node_set_.find(node) == node_set_.end());
    nodes_[node].reset(node);
    node_set_.insert(node);
+    PADDLE_ENFORCE(!id2node_.count(node->id()), "duplicate id %d", node->id());
+    id2node_[node->id()] = node;
    return node;
  }

...@@ -157,6 +166,8 @@ class Graph {
  std::map<std::string, std::function<void(void)>> attr_dels_;
  std::map<ir::Node *, std::unique_ptr<ir::Node>> nodes_;
  std::unordered_set<ir::Node *> node_set_;
+  std::map<int, Node *> id2node_;
+  int node_count_{0};
};

bool IsControlDepVar(const ir::Node &var);
......
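Graph nodes now carry a creation-order id (node_count_) backed by the id2node_ index behind RetriveNode, which is what the fuse passes above rely on when they refer to nodes as graph->RetriveNode(34) and the like. A small sketch of the lookup, assuming an in-tree context; the names here are illustrative:

```cpp
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/program_desc.h"

// Illustrative only: create a node and look it up again by its id.
void NodeIdLookupDemo() {
  paddle::framework::ProgramDesc prog;
  paddle::framework::ir::Graph graph(prog);

  auto* x = graph.CreateEmptyNode(
      "x", paddle::framework::ir::Node::Type::kVariable);

  // RetriveNode (spelled as in the source) maps the id back to the node,
  // and returns nullptr for ids that were never assigned.
  PADDLE_ENFORCE(graph.RetriveNode(x->id()) == x);
  PADDLE_ENFORCE(graph.RetriveNode(1 << 20) == nullptr);
}
```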
...@@ -103,10 +103,10 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
    for (auto &var : n->inputs) {
      for (auto &adj_n : var->inputs) {
        PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
-        adj_list[n].insert(adj_n);
        VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
                << " -> " << n->Name() << reinterpret_cast<void *>(n)
                << " via " << var->Name() << reinterpret_cast<void *>(var);
+        adj_list[n].insert(adj_n);
      }
    }
  }
......
...@@ -17,7 +17,7 @@
#include <vector>

#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detecter.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_traits.h"
#include "paddle/fluid/platform/enforce.h"
...@@ -34,7 +34,7 @@ PDNode* PDPattern::NewNode(PDNode::teller_t&& teller, const std::string& name) {
        name);
  }

-  nodes_.emplace_back(new PDNode(std::move(teller), name));
+  nodes_.emplace_back(new PDNode(std::move(teller), this, name));
  auto* cur = nodes_.back().get();
  node_map_[name] = cur;
  return cur;
...@@ -56,19 +56,22 @@ void PDPattern::AddEdge(PDNode* a, PDNode* b) {
  edges_.emplace_back(a, b);
}

-void GraphPatternDetecter::operator()(Graph* graph,
-                                      GraphPatternDetecter::handle_t handler) {
+void GraphPatternDetector::operator()(Graph* graph,
+                                      GraphPatternDetector::handle_t handler) {
  if (!MarkPDNodesInGraph(*graph)) return;
  auto subgraphs = DetectPatterns();
  UniquePatterns(&subgraphs);
  RemoveOverlappedMatch(&subgraphs);

+  LOG(INFO) << "detect " << subgraphs.size() << " subgraph matches the pattern";
+  int id = 0;
  for (auto& g : subgraphs) {
+    LOG(INFO) << "optimizing #" << id++ << " subgraph";
    handler(g, graph);
  }
}

-bool GraphPatternDetecter::MarkPDNodesInGraph(const ir::Graph& graph) {
+bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) {
  VLOG(4) << "mark pdnodes in graph";
  if (graph.Nodes().empty()) return false;
...@@ -114,13 +117,15 @@ bool IsNodesLink(Node* a, Node* b) {
  return false;
}

-std::vector<GraphPatternDetecter::subgraph_t>
-GraphPatternDetecter::DetectPatterns() {
+std::vector<GraphPatternDetector::subgraph_t>
+GraphPatternDetector::DetectPatterns() {
  // Init empty subgraphs.
-  std::vector<GraphPatternDetecter::subgraph_t> result;
+  std::vector<GraphPatternDetector::subgraph_t> result;
  std::vector<HitGroup> init_groups;
-  PADDLE_ENFORCE(!pattern_.edges().empty(), "At least one edge is needed");
-  auto* first_pnode = pattern_.edges().front().first;
+  std::array<std::vector<HitGroup>, 2> bi_records;
+  // PADDLE_ENFORCE(!pattern_.edges().empty(), "At least one edge is needed");
+  auto* first_pnode = pattern_.edges().empty() ? pattern().nodes().front().get()
+                                               : pattern_.edges().front().first;
  if (!pdnodes2nodes_.count(first_pnode)) return result;
  for (auto* node : pdnodes2nodes_[first_pnode]) {
    HitGroup group;
...@@ -129,7 +134,6 @@ GraphPatternDetecter::DetectPatterns() {
    }
  }

  int step = 0;
-  std::array<std::vector<HitGroup>, 2> bi_records;
  bi_records[0] = std::move(init_groups);

  // Extend a PDNode to subgraphs by deducing the connection relations defined
...@@ -141,6 +145,7 @@ GraphPatternDetecter::DetectPatterns() {
    auto& pre_groups = bi_records[step % 2];
    auto& cur_groups = bi_records[1 - (step++ % 2)];
    cur_groups.clear();
+    if (pre_groups.empty()) break;
    // source -> target
    for (Node* source : pdnodes2nodes_[edge.first]) {
      for (Node* target : pdnodes2nodes_[edge.second]) {
...@@ -163,7 +168,7 @@ GraphPatternDetecter::DetectPatterns() { ...@@ -163,7 +168,7 @@ GraphPatternDetecter::DetectPatterns() {
} }
for (auto& group : bi_records[step % 2]) { for (auto& group : bi_records[step % 2]) {
GraphPatternDetecter::subgraph_t subgraph; GraphPatternDetector::subgraph_t subgraph;
for (auto& role : group.roles) { for (auto& role : group.roles) {
subgraph.emplace(role.first, role.second); subgraph.emplace(role.first, role.second);
} }
...@@ -172,10 +177,10 @@ GraphPatternDetecter::DetectPatterns() { ...@@ -172,10 +177,10 @@ GraphPatternDetecter::DetectPatterns() {
return result; return result;
} }
void GraphPatternDetecter::UniquePatterns( void GraphPatternDetector::UniquePatterns(
std::vector<GraphPatternDetecter::subgraph_t>* subgraphs) { std::vector<GraphPatternDetector::subgraph_t>* subgraphs) {
if (subgraphs->empty()) return; if (subgraphs->empty()) return;
std::vector<GraphPatternDetecter::subgraph_t> result; std::vector<GraphPatternDetector::subgraph_t> result;
std::unordered_set<size_t> set; std::unordered_set<size_t> set;
for (auto& g : *subgraphs) { for (auto& g : *subgraphs) {
...@@ -192,7 +197,7 @@ void GraphPatternDetecter::UniquePatterns( ...@@ -192,7 +197,7 @@ void GraphPatternDetecter::UniquePatterns(
*subgraphs = result; *subgraphs = result;
} }
void GraphPatternDetecter::RemoveOverlappedMatch( void GraphPatternDetector::RemoveOverlappedMatch(
std::vector<subgraph_t>* subgraphs) { std::vector<subgraph_t>* subgraphs) {
std::vector<subgraph_t> result; std::vector<subgraph_t> result;
std::unordered_set<Node*> node_set; std::unordered_set<Node*> node_set;
...@@ -215,6 +220,46 @@ void GraphPatternDetecter::RemoveOverlappedMatch( ...@@ -215,6 +220,46 @@ void GraphPatternDetecter::RemoveOverlappedMatch(
*subgraphs = result; *subgraphs = result;
} }
std::string PDPattern::DotString() const {
using inference::analysis::Dot;
Dot dot;
int id = 0;
// Create Nodes
std::unordered_map<PDNode*, std::string> node2dot;
for (const auto& node : nodes()) {
std::string node_id = "Node" + std::to_string(id++);
dot.AddNode(node_id, {}, node->name());
node2dot[node.get()] = node_id;
}
// Create Edges
for (const auto& edge : edges()) {
if (!node2dot.count(edge.first) || !node2dot.count(edge.second)) {
LOG(ERROR) << "no node " << edge.first << " " << edge.second;
continue;
}
auto& src = node2dot.at(edge.first);
auto& trg = node2dot.at(edge.second);
dot.AddEdge(src, trg, {});
}
return dot.Build();
}
PDNode& PDNode::LinksTo(const std::vector<PDNode*>& others) {
// extend outlinks.
for (PDNode* x : others) {
pattern_->AddEdge(this, x);
}
return *this;
}
PDNode& PDNode::LinksFrom(const std::vector<PDNode*>& others) {
// extend inlinks.
for (PDNode* x : others) {
pattern_->AddEdge(x, this);
}
return *this;
}
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -21,12 +21,14 @@ ...@@ -21,12 +21,14 @@
#include <numeric> #include <numeric>
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/inference/analysis/dot.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
class PDPattern;
// Some basic torminolygies: // Some basic terminologies:
// - PDPattern: a pattern defined as a data flow graph. // - PDPattern: a pattern defined as a data flow graph.
// - PDNode: the node in the pattern, each PDNode represents an `ir::Node` // - PDNode: the node in the pattern, each PDNode represents an `ir::Node`
// that meets some conditions defined in `PDNode.teller`. // that meets some conditions defined in `PDNode.teller`.
...@@ -36,30 +38,43 @@ namespace ir { ...@@ -36,30 +38,43 @@ namespace ir {
struct PDNode { struct PDNode {
// tell whether an ir::Node* is a candidate for a PDNode. // tell whether an ir::Node* is a candidate for a PDNode.
using teller_t = std::function<bool(Node*)>; using teller_t = std::function<bool(Node*)>;
enum class Type { kOp, kVar };
PDNode(teller_t&& teller, const std::string& name = "") // this link to others
: teller_(teller), name_(name) { PDNode& LinksTo(const std::vector<PDNode*>& others);
PADDLE_ENFORCE(teller_ != nullptr, "invalid teller functer is set."); PDNode& LinksFrom(const std::vector<PDNode*>& others);
}
PDNode(PDNode&& other) = default;
std::vector<PDNode*> inlinks;
std::vector<PDNode*> outlinks;
bool Tell(Node* node) const { bool Tell(Node* node) const {
PADDLE_ENFORCE(teller_ != nullptr, "teller should be set for a PDNode"); PADDLE_ENFORCE(teller_ != nullptr, "teller should be set for a PDNode");
return teller_(node); return teller_(node);
} }
bool IsOp() const { return type_ == Type::kOp; }
bool IsVar() const { return type_ == Type::kVar; }
const std::string& name() const { return name_; } const std::string& name() const { return name_; }
PDNode(const PDNode&) = delete; PDNode(const PDNode&) = delete;
PDNode& operator=(const PDNode&) = delete; PDNode& operator=(const PDNode&) = delete;
private: private:
PDNode(teller_t&& teller, PDPattern* pattern, const std::string& name = "",
Type type = Type::kVar)
: teller_(std::move(teller)),
pattern_(pattern),
name_(name),
type_(type) {
PADDLE_ENFORCE(teller_ != nullptr, "invalid teller functer is set.");
}
PDNode(PDNode&& other) = default;
friend class PDPattern;
teller_t teller_; teller_t teller_;
PDPattern* pattern_;
std::string name_; std::string name_;
Type type_;
}; };
/* /*
...@@ -102,6 +117,8 @@ class PDPattern { ...@@ -102,6 +117,8 @@ class PDPattern {
const std::vector<std::unique_ptr<PDNode>>& nodes() const { return nodes_; } const std::vector<std::unique_ptr<PDNode>>& nodes() const { return nodes_; }
const std::vector<edge_t>& edges() const { return edges_; } const std::vector<edge_t>& edges() const { return edges_; }
std::string DotString() const;
private: private:
#ifdef PADDLE_WITH_TESTING #ifdef PADDLE_WITH_TESTING
FRIEND_TEST(PDPattern, AddEdge); FRIEND_TEST(PDPattern, AddEdge);
...@@ -117,7 +134,7 @@ class PDPattern { ...@@ -117,7 +134,7 @@ class PDPattern {
}; };
/* /*
* GraphPatternDetecter helps to detect the specific patterns in the graph. * GraphPatternDetector helps to detect the specific patterns in the graph.
* Input a pattern, output a list of the matched subgraphs/nodes. * Input a pattern, output a list of the matched subgraphs/nodes.
* This helper can be used to support fuse passes, e.g. (conv + batchnorm => batchnorm). * This helper can be used to support fuse passes, e.g. (conv + batchnorm => batchnorm).
* *
...@@ -129,7 +146,7 @@ class PDPattern { ...@@ -129,7 +146,7 @@ class PDPattern {
* *
* Usage: * Usage:
* // Create a detector * // Create a detector
* GraphPatternDetecter detector; * GraphPatternDetector detector;
* // Define the detector's pattern by adding PDNodes and defining the edges. * // Define the detector's pattern by adding PDNodes and defining the edges.
* auto* node0 = detector.mutable_pattern().AddNode(...) * auto* node0 = detector.mutable_pattern().AddNode(...)
* auto* node1 = detector.mutable_pattern().AddNode(...) * auto* node1 = detector.mutable_pattern().AddNode(...)
...@@ -138,11 +155,11 @@ class PDPattern { ...@@ -138,11 +155,11 @@ class PDPattern {
* detector.mutable_pattern().AddEdge(node0, node1); * detector.mutable_pattern().AddEdge(node0, node1);
* // Create a handler to define the behavior of treating the filtered * // Create a handler to define the behavior of treating the filtered
* // subgraphs that comply with the patterns. * // subgraphs that comply with the patterns.
* GraphPatternDetecter::handle_t handler = some lambda * GraphPatternDetector::handle_t handler = some lambda
* // Execute the detector. * // Execute the detector.
* detector(&graph, handler); * detector(&graph, handler);
*/ */
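// A slightly more concrete sketch of the usage above (illustrative only; the
// "mul" op type, the node names and the teller lambdas are assumptions, not
// part of this change):
//
//   GraphPatternDetector detector;
//   auto* pattern = detector.mutable_pattern();
//   auto* mul_op = pattern->NewNode(
//       [](Node* x) { return x && x->IsOp() && x->Op()->Type() == "mul"; },
//       "mul_op");
//   auto* mul_out = pattern->NewNode(
//       [](Node* x) { return x && x->IsVar() && VarLinksFromOp(x, "mul"); },
//       "mul_out");
//   pattern->AddEdge(mul_op, mul_out);
//   GraphPatternDetector::handle_t handler =
//       [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) {
//         LOG(INFO) << "matched " << subgraph.at(mul_op)->Name() << " -> "
//                   << subgraph.at(mul_out)->Name();
//       };
//   // given an already-built ir::Graph named `graph`:
//   detector(&graph, handler);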
class GraphPatternDetecter { class GraphPatternDetector {
public: public:
using subgraph_t = std::unordered_map<PDNode*, Node*>; using subgraph_t = std::unordered_map<PDNode*, Node*>;
...@@ -177,10 +194,62 @@ class GraphPatternDetecter { ...@@ -177,10 +194,62 @@ class GraphPatternDetecter {
using hit_rcd_t = using hit_rcd_t =
std::pair<Node* /*node in graph*/, PDNode* /*node in pattern*/>; std::pair<Node* /*node in graph*/, PDNode* /*node in pattern*/>;
PDPattern pattern_; PDPattern pattern_;
std::vector<hit_rcd_t> marked_records_;
std::unordered_map<const PDNode*, std::unordered_set<Node*>> pdnodes2nodes_; std::unordered_map<const PDNode*, std::unordered_set<Node*>> pdnodes2nodes_;
}; };
// Some helper methods.
// Tell whether a var node is the input of an op with the given type.
static bool VarLinksToOp(Node* node, const std::string& op_type) {
for (auto* out : node->outputs) {
if (out->IsOp() && out->Op()->Type() == op_type) {
return true;
}
}
return false;
}
// Tell whether a var node is the output of an op with the given type.
static bool VarLinksFromOp(Node* node, const std::string& op_type) {
for (auto* in : node->inputs) {
if (in->IsOp() && in->Op()->Type() == op_type) {
return true;
}
}
return false;
}
// Check whether a var node is an op node's nth input.
static bool IsNthInput(Node* var, Node* op, const std::string& argument,
size_t nth) {
PADDLE_ENFORCE(var->IsVar());
PADDLE_ENFORCE(op->IsOp());
if (op->inputs.size() <= nth) return false;
return var->Name() == op->Op()->Input(argument)[nth];
}
static void GraphSafeRemoveNodes(Graph* graph,
const std::unordered_set<const Node*>& nodes) {
for (auto* node : nodes) {
graph->RemoveNode(const_cast<Node*>(node));
}
for (auto* node : graph->Nodes()) {
for (auto it = node->inputs.begin(); it != node->inputs.end();) {
if (nodes.count(*it)) {
it = const_cast<Node*>(node)->inputs.erase(it);
} else
it++;
}
for (auto it = node->outputs.begin(); it != node->outputs.end();) {
if (nodes.count(*it)) {
it = const_cast<Node*>(node)->outputs.erase(it);
} else
it++;
}
}
}
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/ir/graph_pattern_detecter.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
...@@ -82,7 +82,7 @@ TEST(PDPattern, AddEdge) { ...@@ -82,7 +82,7 @@ TEST(PDPattern, AddEdge) {
} }
TEST(GraphPatternDetecter, MarkPDNodesInGraph) { TEST(GraphPatternDetecter, MarkPDNodesInGraph) {
GraphPatternDetecter x; GraphPatternDetector x;
// mark o2, o3, v2 // mark o2, o3, v2
// The pattern is a graph: // The pattern is a graph:
...@@ -131,7 +131,7 @@ TEST(GraphPatternDetecter, MultiSubgraph) { ...@@ -131,7 +131,7 @@ TEST(GraphPatternDetecter, MultiSubgraph) {
Graph graph(program); Graph graph(program);
BuildGraph(&graph); BuildGraph(&graph);
GraphPatternDetecter x; GraphPatternDetector x;
// The pattern is a graph: // The pattern is a graph:
// op -> var // op -> var
...@@ -149,8 +149,8 @@ TEST(GraphPatternDetecter, MultiSubgraph) { ...@@ -149,8 +149,8 @@ TEST(GraphPatternDetecter, MultiSubgraph) {
x.mutable_pattern()->AddEdge(any_var, any_op1); x.mutable_pattern()->AddEdge(any_var, any_op1);
int count = 0; int count = 0;
GraphPatternDetecter::handle_t handle = [&]( GraphPatternDetector::handle_t handle = [&](
const GraphPatternDetecter::subgraph_t& s, Graph* g) { const GraphPatternDetector::subgraph_t& s, Graph* g) {
LOG(INFO) << "Detect " << s.at(any_op)->Name() << " -> " LOG(INFO) << "Detect " << s.at(any_op)->Name() << " -> "
<< s.at(any_var)->Name() << " -> " << s.at(any_op1)->Name(); << s.at(any_var)->Name() << " -> " << s.at(any_op1)->Name();
count++; count++;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/program_desc.h"
namespace paddle {
namespace framework {
namespace ir {
std::unique_ptr<Graph> GraphToProgramPass::ApplyImpl(
std::unique_ptr<Graph> graph) const {
ProgramDesc& program = Get<ProgramDesc>("program");
std::unique_ptr<proto::ProgramDesc> program_pb(
new proto::ProgramDesc(*program.Proto()));
auto block = program_pb->mutable_blocks(kRootBlockIndex);
block->clear_vars();
std::unordered_set<std::string> visited_vars;
for (ir::Node* n : graph->Nodes()) {
if (n->NodeType() == ir::Node::Type::kVariable) {
if (n->Var() && visited_vars.count(n->Var()->Name()) == 0) {
visited_vars.insert(n->Var()->Name());
block->add_vars()->MergeFrom(*n->Var()->Proto());
}
}
}
block->clear_ops();
std::vector<ir::Node*> nodes = TopologySortOperations(*graph);
for (ir::Node* n : nodes) {
if (!n->Op()) {
continue;
}
block->add_ops()->MergeFrom(*n->Op()->Proto());
}
program.CopyFrom(*program_pb);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(graph_to_program_pass, paddle::framework::ir::GraphToProgramPass);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
class GraphToProgramPass : public Pass {
protected:
std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
#include <string>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/program_desc.h"
namespace paddle {
namespace framework {
namespace ir {
void BuildNoCircleGraph(Graph* g) {
OpDesc op1;
op1.SetType("op1");
OpDesc op2;
op2.SetType("op2");
OpDesc op3;
op3.SetType("op3");
OpDesc op4;
op4.SetType("op4");
OpDesc op5;
op5.SetType("op5");
VarDesc var1("var1");
VarDesc var2("var2");
VarDesc var3("var3");
VarDesc var4("var4");
ir::Node* o1 = g->CreateOpNode(&op1);
ir::Node* o2 = g->CreateOpNode(&op2);
ir::Node* o3 = g->CreateOpNode(&op3);
ir::Node* o4 = g->CreateOpNode(&op4);
ir::Node* o5 = g->CreateOpNode(&op5);
ir::Node* v1 = g->CreateVarNode(&var1);
ir::Node* v2 = g->CreateVarNode(&var2);
ir::Node* v3 = g->CreateVarNode(&var3);
ir::Node* v4 = g->CreateVarNode(&var4);
// o1->v1->o2
o1->outputs.push_back(v1);
o2->inputs.push_back(v1);
v1->inputs.push_back(o1);
v1->outputs.push_back(o2);
// o2->v2->o3
// o2->v2->o4
o2->outputs.push_back(v2);
o3->inputs.push_back(v2);
o4->inputs.push_back(v2);
v2->outputs.push_back(o3);
v2->outputs.push_back(o4);
v2->inputs.push_back(o2);
// o2->v3->o5
o2->outputs.push_back(v3);
o5->inputs.push_back(v3);
v3->inputs.push_back(o2);
v3->outputs.push_back(o5);
// o3-v4->o5
o3->outputs.push_back(v4);
o5->inputs.push_back(v4);
v4->inputs.push_back(o3);
v4->outputs.push_back(o5);
}
TEST(GraphToProgramPass, Basic) {
ProgramDesc prog;
std::unique_ptr<Graph> g(new Graph(prog));
BuildNoCircleGraph(g.get());
auto pass = paddle::framework::ir::PassRegistry::Instance().Get(
"graph_to_program_pass");
ProgramDesc compiled_prog;
pass->SetNotOwned<paddle::framework::ProgramDesc>("program", &compiled_prog);
pass->Apply(std::move(g));
std::vector<OpDesc*> ops = compiled_prog.Block(0).AllOps();
EXPECT_EQ(ops[0]->Type(), "op1");
EXPECT_EQ(ops[1]->Type(), "op2");
if (ops[2]->Type() == "op3") {
EXPECT_EQ(ops[3]->Type(), "op4");
} else if (ops[2]->Type() == "op4") {
EXPECT_EQ(ops[3]->Type(), "op3");
}
EXPECT_EQ(ops[4]->Type(), "op5");
std::unordered_set<std::string> vars;
for (VarDesc* v : compiled_prog.Block(0).AllVars()) {
vars.insert(v->Name());
}
EXPECT_TRUE(vars.find("var1") != vars.end());
EXPECT_TRUE(vars.find("var2") != vars.end());
EXPECT_TRUE(vars.find("var3") != vars.end());
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(graph_to_program_pass);
...@@ -16,11 +16,13 @@ limitations under the License. */ ...@@ -16,11 +16,13 @@ limitations under the License. */
#include <unordered_set> #include <unordered_set>
#include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/inference/analysis/dot.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
static const char kGraphVizPath[] = "graph_viz_path"; static const char kGraphVizPath[] = "graph_viz_path";
using inference::analysis::Dot;
std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl( std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const { std::unique_ptr<ir::Graph> graph) const {
...@@ -30,41 +32,65 @@ std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl( ...@@ -30,41 +32,65 @@ std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
PADDLE_ENFORCE(fout->good()); PADDLE_ENFORCE(fout->good());
std::ostream& sout = *fout; std::ostream& sout = *fout;
size_t var_id = 0; std::unordered_map<const ir::Node*, std::string> node2dot;
std::unordered_map<const ir::Node*, size_t> vars;
Dot dot;
sout << "digraph G {\n";
std::vector<Dot::Attr> op_attrs({Dot::Attr("style", "filled"),
for (const ir::Node* n : graph->Nodes()) { Dot::Attr("shape", "box"),
if (n->NodeType() != ir::Node::Type::kVariable) continue; Dot::Attr("fillcolor", "red")});
size_t cur_var_id = var_id++; std::vector<Dot::Attr> var_attrs({Dot::Attr("style", "filled,rounded"),
vars[n] = cur_var_id; // Dot::Attr("shape", "diamond"),
Dot::Attr("fillcolor", "yellow")});
sout << "var_" << cur_var_id << " [label=\"" << n->Name() << "\"]"
<< std::endl; std::vector<Dot::Attr> marked_op_attrs({Dot::Attr("style", "filled"),
} Dot::Attr("shape", "box"),
Dot::Attr("fillcolor", "lightgray")});
size_t op_id = 0; std::vector<Dot::Attr> marked_var_attrs(
for (const ir::Node* n : graph->Nodes()) { {Dot::Attr("style", "filled,rounded"),
if (n->NodeType() != ir::Node::Type::kOperation) continue; // Dot::Attr("shape", "diamond"),
std::string op_name = "op_" + std::to_string(op_id++); Dot::Attr("fillcolor", "lightgray")});
sout << op_name << " [label=\"" << n->Name() << "\", shape=rect]"
<< std::endl; auto marked_nodes = ConsumeMarkedNodes(graph.get());
for (auto in : n->inputs) { // Create nodes
std::string var_name = "var_" + std::to_string(vars[in]); for (const Node* n : graph->Nodes()) {
sout << var_name << " -> " << op_name << std::endl; std::string node_id = n->Name() + "(" + std::to_string(n->id()) + ")";
if (n->IsOp()) {
decltype(op_attrs) attr =
marked_nodes.count(n) ? marked_op_attrs : op_attrs;
dot.AddNode(node_id, attr, node_id);
} else if (n->IsVar()) {
decltype(op_attrs) attr =
marked_nodes.count(n) ? marked_var_attrs : var_attrs;
dot.AddNode(node_id, attr, node_id);
} }
node2dot[n] = node_id;
for (auto out : n->outputs) { }
std::string var_name = "var_" + std::to_string(vars[out]); // Create edges
sout << op_name << " -> " << var_name << std::endl; for (const Node* n : graph->Nodes()) {
const auto& src_id = node2dot.at(n);
for (auto* out : n->outputs) {
const auto& trg_id = node2dot.at(out);
dot.AddEdge(src_id, trg_id, {});
} }
} }
sout << "}\n"; sout << dot.Build();
return graph; return graph;
} }
GraphVizPass::marked_nodes_t GraphVizPass::ConsumeMarkedNodes(
Graph* graph) const {
marked_nodes_t res;
if (graph->Has(kGraphvizMarkedNodeAttr)) {
auto& attr = graph->Get<marked_nodes_t>(kGraphvizMarkedNodeAttr);
res = attr;
attr.clear();
}
return res;
}
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
......
...@@ -27,10 +27,19 @@ namespace paddle { ...@@ -27,10 +27,19 @@ namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
const char kGraphvizMarkedNodeAttr[] = "__graphviz__marked_node__";
class GraphVizPass : public Pass { class GraphVizPass : public Pass {
public:
using marked_nodes_t = std::unordered_set<const Node*>;
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl( std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override; std::unique_ptr<ir::Graph> graph) const override;
// Get the nodes marked for highlighting in the graph, and consume the
// corresponding attribute.
marked_nodes_t ConsumeMarkedNodes(Graph* graph) const;
}; };
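// Illustrative only (assumes Graph::Set transfers ownership of the attribute,
// as other passes in this change do elsewhere; `nodes_to_remove` is a
// hypothetical std::unordered_set<const Node*>): a fuse pass can highlight
// the nodes it touched by attaching them before the next graph_viz_pass runs.
//
//   auto* marked = new GraphVizPass::marked_nodes_t(nodes_to_remove);
//   graph->Set(kGraphvizMarkedNodeAttr, marked);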
} // namespace ir } // namespace ir
......
...@@ -29,20 +29,26 @@ class Node { ...@@ -29,20 +29,26 @@ class Node {
enum class Type { kOperation, kVariable }; enum class Type { kOperation, kVariable };
static constexpr char kControlDepVarName[] = "__control_var"; static constexpr char kControlDepVarName[] = "__control_var";
explicit Node(const std::string& name, Type type) explicit Node(const std::string& name, Type type, int id = -1)
: name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {} : name_(name),
var_desc_(nullptr),
op_desc_(nullptr),
type_(type),
id_(id) {}
explicit Node(VarDesc* var_desc) explicit Node(VarDesc* var_desc, int id = -1)
: name_(var_desc->Name()), : name_(var_desc->Name()),
var_desc_(new VarDesc(*var_desc)), var_desc_(new VarDesc(*var_desc)),
op_desc_(nullptr), op_desc_(nullptr),
type_(Type::kVariable) {} type_(Type::kVariable),
id_(id) {}
explicit Node(OpDesc* op_desc) explicit Node(OpDesc* op_desc, int id = -1)
: name_(op_desc->Type()), : name_(op_desc->Type()),
var_desc_(nullptr), var_desc_(nullptr),
op_desc_(new OpDesc(*op_desc, op_desc->Block())), op_desc_(new OpDesc(*op_desc, op_desc->Block())),
type_(Type::kOperation) {} type_(Type::kOperation),
id_(id) {}
Type NodeType() const { return type_; } Type NodeType() const { return type_; }
...@@ -58,6 +64,8 @@ class Node { ...@@ -58,6 +64,8 @@ class Node {
return op_desc_.get(); return op_desc_.get();
} }
int id() const { return id_; }
bool IsOp() const { return type_ == Type::kOperation; } bool IsOp() const { return type_ == Type::kOperation; }
bool IsVar() const { return type_ == Type::kVariable; } bool IsVar() const { return type_ == Type::kVariable; }
...@@ -69,6 +77,7 @@ class Node { ...@@ -69,6 +77,7 @@ class Node {
std::unique_ptr<VarDesc> var_desc_; std::unique_ptr<VarDesc> var_desc_;
std::unique_ptr<OpDesc> op_desc_; std::unique_ptr<OpDesc> op_desc_;
Type type_; Type type_;
int id_;
private: private:
DISABLE_COPY_AND_ASSIGN(Node); DISABLE_COPY_AND_ASSIGN(Node);
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h"
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/lod_tensor.h"
namespace paddle {
namespace framework {
namespace ir {
struct FuseExpr {};
// sequence expand, concat fuse pattern, return concat's output
PDNode* BuildSeqExpandConcatPattern(PDPattern* pattern) {
// The following operators will be fused:
// concat
// sequence_expand
// sequence_expand
// The following variables will be treated as inputs:
// concat mid input, the 0th input of the fused op
// sequence_expand input, the 1st input of the fused op
// sequence_expand input, the 2nd input of the fused op
// The following variables will be treated as outputs:
// concat output
// So the following variables will be removed:
// sequence-expand output
// sequence-expand output
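// A rough sketch of the matched subgraph (illustrative only; the names match
// the PDNodes defined below):
//
//   sequence_expand0_in -> sequence_expand0 -> sequence_expand0_out --+
//   sequence_expand1_in -> sequence_expand1 -> sequence_expand1_out --+--> concat -> concat_out
//   concat_in0 ---------------------------------------------------->--+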
// Three operators
auto* sequence_expand0 = pattern->NewNode(
[](Node* x) {
return x && x->IsOp() && x->Op()->Type() == "sequence_expand";
},
"sequence_expand0");
auto* sequence_expand1 = pattern->NewNode(
[](Node* x) {
return x && x->IsOp() && x->Op()->Type() == "sequence_expand";
},
"sequence_expand1");
auto* concat = pattern->NewNode(
[](Node* x) {
return x && x->IsOp() && x->Op()->Type() == "concat" && // basic check
x->Op()->Input("X").size() == 3; // Special case
},
"concat");
auto* sequence_expand0_in = pattern->NewNode(
[](Node* x) {
return x && x->IsVar() && VarLinksToOp(x, "sequence_expand");
},
"sequence_expand0_in");
auto* sequence_expand1_in = pattern->NewNode(
[](Node* x) {
return x && x->IsVar() && VarLinksToOp(x, "sequence_expand");
},
"sequence_expand1_in");
// The variables
auto* sequence_expand0_out = pattern->NewNode(
[](Node* x) {
return x && x->IsVar() &&
VarLinksFromOp(x, "sequence_expand") && // basic check
VarLinksToOp(x, "concat") && // is concat's input
IsNthInput(x, x->outputs[0], "X", 1); // X[0]
},
"sequence_expand0_out");
auto* sequence_expand1_out = pattern->NewNode(
[](Node* x) {
return x && x->IsVar() &&
VarLinksFromOp(x, "sequence_expand") && // basic check
VarLinksToOp(x, "concat") && // is concat's input
IsNthInput(x, x->outputs[0], "X", 2); // x[2]
},
"sequence_expand1_out");
auto* concat_in0 = pattern->NewNode(
[](Node* x) { return x && x->IsVar() && VarLinksToOp(x, "concat"); },
"concat_in0");
auto* concat_out = pattern->NewNode(
[](Node* x) { return x && x->IsVar() && VarLinksFromOp(x, "concat"); },
"concat_out");
// Links
sequence_expand0->LinksFrom({sequence_expand0_in})
.LinksTo({sequence_expand0_out});
sequence_expand1->LinksFrom({sequence_expand1_in})
.LinksTo({sequence_expand1_out});
concat->LinksFrom({sequence_expand0_out, sequence_expand1_out, concat_in0})
.LinksTo({concat_out});
return concat_out;
}
PDNode* BuildFCPattern(PDPattern* pattern, PDNode* fc_x) {
PDNode* fc_w = pattern->NewNode(
[](Node* x) {
return x && x->IsVar() && // basic
VarLinksToOp(x, "mul") && // link
x->Var()->Proto()->persistable(); // is a parameter
},
"fc_w");
PDNode* mul_out = pattern->NewNode(
[](Node* x) {
return x && x->IsVar() && // basic
VarLinksFromOp(x, "mul") && // link
VarLinksToOp(x, "elementwise_add") && //
!x->Var()->Proto()->persistable(); // is not a parameter
},
"mul_out");
PDNode* fc_mul = pattern->NewNode(
[](Node* x) {
return x && x->IsOp() && x->Op()->Type() == "mul"; // basic
},
"fc_mul");
PDNode* fc_bias = pattern->NewNode(
[](Node* x) {
return x && x->IsVar() && // basic
VarLinksToOp(x, "elementwise_add") && // link
x->Var()->Proto()->persistable(); // is a parameter
},
"fc_bias");
PDNode* elementwise_add = pattern->NewNode(
[](Node* x) {
return x && x->IsOp() && x->Op()->Type() == "elementwise_add";
},
"elementwise_add");
PDNode* add_out = pattern->NewNode(
[](Node* x) {
return x && x->IsVar() && // basic
VarLinksFromOp(x, "elementwise_add") && // link
!x->Var()->Proto()->persistable(); // is not a parameter
},
"add_out");
std::set<std::string> acts({"sigmoid", "tanh", "relu", "identity"});
PDNode* act = pattern->NewNode(
[=](Node* x) {
return x && x->IsOp() && acts.count(x->Op()->Type());
},
"act");
PDNode* fc_out = pattern->NewNode(
[](Node* x) {
return x && x->IsVar() && // basic
!x->Var()->Proto()->persistable(); // is not a parameter
},
"fc_out");
fc_mul->LinksFrom({fc_w, fc_x}).LinksTo({mul_out});
elementwise_add->LinksFrom({mul_out, fc_bias}).LinksTo({add_out});
act->LinksFrom({add_out}).LinksTo({fc_out});
return fc_out;
}
std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
FusePassBase::Init(graph.get());
GraphPatternDetector detector;
auto* pattern = detector.mutable_pattern();
auto* concat_out = BuildSeqExpandConcatPattern(pattern);
BuildFCPattern(pattern, concat_out);
#define GET_NODE(id, pattern) \
PADDLE_ENFORCE(subgraph.count(pattern.RetriveNode(#id)), \
"pattern has no Node called %s", #id); \
auto* id = subgraph.at(pattern.RetriveNode(#id)); \
PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id);
detector(graph.get(), [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* graph) {
VLOG(4) << "get one concat pattern";
// fc
GET_NODE(fc_w, detector.pattern());
GET_NODE(fc_bias, detector.pattern());
GET_NODE(act, detector.pattern());
GET_NODE(fc_out, detector.pattern());
// concat
GET_NODE(concat_in0, detector.pattern());
GET_NODE(sequence_expand0_in, detector.pattern());
GET_NODE(sequence_expand1_in, detector.pattern());
OpDesc op_desc;
op_desc.SetType("fusion_seqexpand_concat_fc");
op_desc.SetInput("X", {concat_in0->Name(), sequence_expand0_in->Name(),
sequence_expand1_in->Name()});
op_desc.SetInput("FCWeight", {fc_w->Name()});
op_desc.SetInput("FCBias", {fc_bias->Name()});
const std::string fc_out_tmp = fc_out->Name() + ".tmp";
param_scope()->Var(fc_out_tmp)->GetMutable<framework::LoDTensor>();
op_desc.SetOutput("FCOut", {fc_out_tmp});
op_desc.SetOutput("Out", {fc_out->Name()});
op_desc.SetAttr("fc_activation", act->Op()->Type());
auto* op_node = graph->CreateOpNode(&op_desc);
// Add links
#define NODE_LINKS(a, b) \
a->outputs.push_back(b); \
b->inputs.push_back(a);
NODE_LINKS(fc_w, op_node);
NODE_LINKS(fc_bias, op_node);
NODE_LINKS(concat_in0, op_node);
NODE_LINKS(sequence_expand0_in, op_node);
NODE_LINKS(sequence_expand1_in, op_node);
NODE_LINKS(op_node, fc_out);
// Clean nodes.
std::unordered_set<const Node*> marked_nodes;
for (auto& item : subgraph) {
marked_nodes.insert(item.second);
}
marked_nodes.erase(fc_w);
marked_nodes.erase(fc_bias);
marked_nodes.erase(concat_in0);
marked_nodes.erase(sequence_expand0_in);
marked_nodes.erase(sequence_expand1_in);
marked_nodes.erase(fc_out);
GraphSafeRemoveNodes(graph, marked_nodes);
});
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(seq_concat_fc_fuse_pass,
paddle::framework::ir::SeqConcatFcFusePass);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
class SeqConcatFcFusePass : public FusePassBase {
public:
virtual ~SeqConcatFcFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
};
} // namespace ir
} // namespace framework
} // namespace paddle
...@@ -95,6 +95,12 @@ OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs, ...@@ -95,6 +95,12 @@ OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs,
need_update_ = true; need_update_ = true;
} }
OpDesc::OpDesc(const OpDesc &other, BlockDesc *block) {
CopyFrom(other);
block_ = block;
need_update_ = true;
}
void OpDesc::CopyFrom(const OpDesc &op_desc) { void OpDesc::CopyFrom(const OpDesc &op_desc) {
desc_.set_type(op_desc.Type()); desc_.set_type(op_desc.Type());
inputs_ = op_desc.inputs_; inputs_ = op_desc.inputs_;
...@@ -131,8 +137,9 @@ OpDesc::OpDesc(const proto::OpDesc &desc, BlockDesc *block) ...@@ -131,8 +137,9 @@ OpDesc::OpDesc(const proto::OpDesc &desc, BlockDesc *block)
for (const proto::OpDesc::Attr &attr : desc_.attrs()) { for (const proto::OpDesc::Attr &attr : desc_.attrs()) {
std::string attr_name = attr.name(); std::string attr_name = attr.name();
// The sub_block referred to by the BLOCK attr hasn't been added // The sub_block referred to by the BLOCK attr hasn't been added
// to ProgramDesc class yet, we skip setting BLOCK attr here. // to ProgramDesc class yet, we skip setting BLOCK/BLOCKS attr here.
if (attr.type() != proto::AttrType::BLOCK) { if (attr.type() != proto::AttrType::BLOCK &&
attr.type() != proto::AttrType::BLOCKS) {
attrs_[attr_name] = GetAttrValue(attr); attrs_[attr_name] = GetAttrValue(attr);
} }
} }
......
...@@ -37,11 +37,7 @@ class OpDesc { ...@@ -37,11 +37,7 @@ class OpDesc {
explicit OpDesc(BlockDesc *block) : block_(block) {} explicit OpDesc(BlockDesc *block) : block_(block) {}
OpDesc(const OpDesc &other, BlockDesc *block) { OpDesc(const OpDesc &other, BlockDesc *block);
*this = other;
block_ = block;
need_update_ = true;
}
void CopyFrom(const OpDesc &op_desc); void CopyFrom(const OpDesc &op_desc);
......
...@@ -80,6 +80,12 @@ ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) { ...@@ -80,6 +80,12 @@ ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) {
InitFromProto(); InitFromProto();
} }
void ProgramDesc::CopyFrom(const proto::ProgramDesc &desc) {
blocks_.clear();
desc_ = desc;
InitFromProto();
}
ProgramDesc::ProgramDesc(const std::string &binary_str) { ProgramDesc::ProgramDesc(const std::string &binary_str) {
PADDLE_ENFORCE(desc_.ParseFromString(binary_str), PADDLE_ENFORCE(desc_.ParseFromString(binary_str),
"Fail to parse program_desc from binary string."); "Fail to parse program_desc from binary string.");
...@@ -111,10 +117,16 @@ void ProgramDesc::InitFromProto() { ...@@ -111,10 +117,16 @@ void ProgramDesc::InitFromProto() {
const std::vector<std::string> ProgramDesc::GetFeedTargetNames() { const std::vector<std::string> ProgramDesc::GetFeedTargetNames() {
auto &global_block = Block(0); auto &global_block = Block(0);
// The order of feed_target_names must follow the index specified in `col`,
// since the feed operators' order doesn't necessarily follow `col`.
std::vector<std::string> feed_target_names; std::vector<std::string> feed_target_names;
for (auto *op : global_block.AllOps()) { for (auto *op : global_block.AllOps()) {
if (op->Type() == kFeedOpType) { if (op->Type() == kFeedOpType) {
feed_target_names.insert(feed_target_names.begin(), op->Output("Out")[0]); int col = boost::get<int>(op->GetAttr("col"));
if (col >= feed_target_names.size()) {
feed_target_names.resize(col + 1);
}
feed_target_names[col] = op->Output("Out")[0];
} }
} }
return feed_target_names; return feed_target_names;
...@@ -122,10 +134,16 @@ const std::vector<std::string> ProgramDesc::GetFeedTargetNames() { ...@@ -122,10 +134,16 @@ const std::vector<std::string> ProgramDesc::GetFeedTargetNames() {
const std::vector<std::string> ProgramDesc::GetFetchTargetNames() { const std::vector<std::string> ProgramDesc::GetFetchTargetNames() {
auto &global_block = Block(0); auto &global_block = Block(0);
// The order of fetch_target_names must follow the index specified in `col`,
// since the fetch operators' order doesn't necessarily follow `col`.
std::vector<std::string> fetch_target_names; std::vector<std::string> fetch_target_names;
for (auto *op : global_block.AllOps()) { for (auto *op : global_block.AllOps()) {
if (op->Type() == kFetchOpType) { if (op->Type() == kFetchOpType) {
fetch_target_names.push_back(op->Input("X")[0]); int col = boost::get<int>(op->GetAttr("col"));
if (col >= fetch_target_names.size()) {
fetch_target_names.resize(col + 1);
}
fetch_target_names[col] = op->Input("X")[0];
} }
} }
return fetch_target_names; return fetch_target_names;
......
...@@ -53,6 +53,8 @@ class ProgramDesc { ...@@ -53,6 +53,8 @@ class ProgramDesc {
void Flush(); void Flush();
void CopyFrom(const proto::ProgramDesc &desc);
proto::ProgramDesc *Proto(); proto::ProgramDesc *Proto();
// The output variable of feed_op is referenced as feed_target. // The output variable of feed_op is referenced as feed_target.
......
...@@ -40,7 +40,11 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type, ...@@ -40,7 +40,11 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type,
"When calling this method, the Tensor's numel must be " "When calling this method, the Tensor's numel must be "
"equal or larger than zero. " "equal or larger than zero. "
"Please check Tensor::Resize has been called first."); "Please check Tensor::Resize has been called first.");
size_t size = requested_size ? requested_size : numel() * SizeOfType(type); size_t size = numel() * SizeOfType(type);
if (requested_size) {
PADDLE_ENFORCE_GE(requested_size, size);
size = requested_size;
}
/* some versions of boost::variant don't have operator!= */ /* some versions of boost::variant don't have operator!= */
if (holder_ == nullptr || !(holder_->place() == place) || if (holder_ == nullptr || !(holder_->place() == place) ||
holder_->size() < size + offset_) { holder_->size() < size + offset_) {
......
...@@ -26,7 +26,7 @@ namespace paddle { ...@@ -26,7 +26,7 @@ namespace paddle {
namespace framework { namespace framework {
template <typename T> template <typename T>
bool IsType(const std::type_index& type_index) { inline bool IsType(const std::type_index& type_index) {
return type_index == std::type_index(typeid(T)); return type_index == std::type_index(typeid(T));
} }
......
...@@ -10,7 +10,7 @@ set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor) ...@@ -10,7 +10,7 @@ set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor)
# TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal? # TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
cc_library(paddle_fluid_api cc_library(paddle_fluid_api
SRCS io.cc SRCS io.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} graph_to_program_pass)
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
......
cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass) cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass)
cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc set(analysis_deps
framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor)
cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc
analyzer.cc analyzer.cc
helper.cc helper.cc
# passes # passes
...@@ -10,11 +13,11 @@ cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph ...@@ -10,11 +13,11 @@ cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph
tensorrt_subgraph_node_mark_pass.cc tensorrt_subgraph_node_mark_pass.cc
fluid_to_ir_pass.cc fluid_to_ir_pass.cc
model_store_pass.cc model_store_pass.cc
DEPS framework_proto proto_desc ir_pass_manager graph pass) DEPS ${analysis_deps})
cc_test(test_node SRCS node_tester.cc DEPS analysis) cc_test(test_node SRCS node_tester.cc DEPS analysis)
cc_test(test_dot SRCS dot_tester.cc DEPS analysis) cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis) cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis paddle_fluid)
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
...@@ -31,7 +34,7 @@ function (inference_analysis_test TARGET) ...@@ -31,7 +34,7 @@ function (inference_analysis_test TARGET)
endif() endif()
cc_test(${TARGET} cc_test(${TARGET}
SRCS "${analysis_test_SRCS}" SRCS "${analysis_test_SRCS}"
DEPS analysis graph fc_fuse_pass graph_viz_pass infer_clean_graph_pass graph_pattern_detecter pass ${analysis_test_EXTRA_DEPS} DEPS analysis graph fc_fuse_pass graph_viz_pass infer_clean_graph_pass graph_pattern_detector pass ${analysis_test_EXTRA_DEPS}
ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt}) ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt})
set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec) set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
endif(WITH_TESTING) endif(WITH_TESTING)
...@@ -58,20 +61,25 @@ endif() ...@@ -58,20 +61,25 @@ endif()
inference_analysis_test(test_analyzer SRCS analyzer_tester.cc inference_analysis_test(test_analyzer SRCS analyzer_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis
analysis_predictor
# ir # ir
fc_fuse_pass fc_fuse_pass
fc_lstm_fuse_pass
seq_concat_fc_fuse_pass
graph_viz_pass graph_viz_pass
infer_clean_graph_pass infer_clean_graph_pass
graph_pattern_detecter graph_pattern_detector
infer_clean_graph_pass infer_clean_graph_pass
attention_lstm_fuse_pass
paddle_inference_api
pass pass
ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model
--infer_ditu_rnn_model=${DITU_INSTALL_DIR}/model --infer_ditu_rnn_model=${DITU_INSTALL_DIR}/model
--infer_ditu_rnn_data=${DITU_INSTALL_DIR}/data.txt) --infer_ditu_rnn_data=${DITU_INSTALL_DIR}/data.txt)
inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc) inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc)
inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc) inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc EXTRA_DEPS paddle_inference_api)
inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc) inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc EXTRA_DEPS paddle_fluid)
inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc) inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc)
inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc) inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc)
inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc) inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc)
......
...@@ -72,7 +72,7 @@ class DfgPassManagerImpl final : public DfgPassManager { ...@@ -72,7 +72,7 @@ class DfgPassManagerImpl final : public DfgPassManager {
auto trt_teller = [&](const Node* node) { auto trt_teller = [&](const Node* node) {
std::unordered_set<std::string> teller_set( std::unordered_set<std::string> teller_set(
{"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax", {"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax",
"depthwise_conv2d", "batch_norm"}); "depthwise_conv2d", "batch_norm", "concat"});
if (!node->IsFunction()) return false; if (!node->IsFunction()) return false;
const auto* func = static_cast<const Function*>(node); const auto* func = static_cast<const Function*>(node);
...@@ -102,6 +102,19 @@ class DfgPassManagerImpl final : public DfgPassManager { ...@@ -102,6 +102,19 @@ class DfgPassManagerImpl final : public DfgPassManager {
Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); } Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); }
void Analyzer::Run(Argument* argument) { void Analyzer::Run(Argument* argument) {
// Ugly support for fluid-to-ir-pass
argument->Set(kFluidToIrPassesAttr,
new std::vector<std::string>({
// Manually update the passes here.
"graph_viz_pass", //
"infer_clean_graph_pass", "graph_viz_pass", //
"attention_lstm_fuse_pass", "graph_viz_pass", //
"fc_lstm_fuse_pass", "graph_viz_pass", //
"seq_concat_fc_fuse_pass", "graph_viz_pass", //
"fc_fuse_pass", "graph_viz_pass" //
}));
for (auto& x : data_) { for (auto& x : data_) {
PADDLE_ENFORCE(x->Initialize(argument)); PADDLE_ENFORCE(x->Initialize(argument));
x->RunAll(); x->RunAll();
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN"); DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN");
DEFINE_string(infer_ditu_rnn_data, "", "data path for ditu RNN"); DEFINE_string(infer_ditu_rnn_data, "", "data path for ditu RNN");
...@@ -264,39 +265,24 @@ void TestDituRNNPrediction(const std::string &model_path, ...@@ -264,39 +265,24 @@ void TestDituRNNPrediction(const std::string &model_path,
const std::string &data_path, int batch_size, const std::string &data_path, int batch_size,
bool use_analysis, bool activate_ir, bool use_analysis, bool activate_ir,
int num_times = 1) { int num_times = 1) {
FLAGS_IA_enable_ir = activate_ir;
FLAGS_IA_enable_tensorrt_subgraph_engine = false;
FLAGS_IA_output_storage_path = "./analysis.out";
std::string model_out;
if (use_analysis) {
Argument argument(model_path);
argument.model_output_store_path.reset(new std::string("./analysis.out"));
Analyzer analyzer;
analyzer.Run(&argument);
// Should get the transformed model stored to ./analysis.out
model_out = "./analysis.out";
ASSERT_TRUE(PathExists(model_out));
} else {
model_out = FLAGS_infer_ditu_rnn_model;
}
NativeConfig config; NativeConfig config;
config.prog_file = model_out + "/__model__"; config.prog_file = FLAGS_infer_ditu_rnn_model + "/__model__";
config.param_file = model_out + "/param"; config.param_file = FLAGS_infer_ditu_rnn_model + "/param";
config.use_gpu = false; config.use_gpu = false;
config.device = 0; config.device = 0;
config.specify_input_name = true; config.specify_input_name = true;
auto predictor = auto base_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
auto predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kAnalysis>(config);
std::vector<PaddleTensor> input_slots; std::vector<PaddleTensor> input_slots;
DataRecord data(data_path, batch_size); DataRecord data(data_path, batch_size);
// Prepare inputs. // Prepare inputs.
PrepareInputs(&input_slots, &data, batch_size); PrepareInputs(&input_slots, &data, batch_size);
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs, base_outputs;
base_predictor->Run(input_slots, &base_outputs);
Timer timer; Timer timer;
timer.tic(); timer.tic();
...@@ -308,37 +294,25 @@ void TestDituRNNPrediction(const std::string &model_path, ...@@ -308,37 +294,25 @@ void TestDituRNNPrediction(const std::string &model_path,
<< ", latency: " << timer.toc() / num_times << "ms"; << ", latency: " << timer.toc() / num_times << "ms";
LOG(INFO) << "====================================="; LOG(INFO) << "=====================================";
for (auto &out : outputs) { PADDLE_ENFORCE_GT(outputs.size(), 0);
PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
for (size_t i = 0; i < outputs.size(); i++) {
auto &out = outputs[i];
auto &base_out = base_outputs[i];
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
[](int a, int b) { return a * b; }); [](int a, int b) { return a * b; });
size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
1, [](int a, int b) { return a * b; });
PADDLE_ENFORCE_EQ(size, size1);
PADDLE_ENFORCE_GT(size, 0);
float *data = static_cast<float *>(out.data.data()); float *data = static_cast<float *>(out.data.data());
for (size_t i = 0; float *base_data = static_cast<float *>(base_out.data.data());
i < std::min(sizeof(ditu_rnn_target_data) / sizeof(float), size); for (size_t i = 0; i < size; i++) {
i++) { EXPECT_NEAR(data[i], base_data[i], 1e-3);
EXPECT_NEAR(data[i], ditu_rnn_target_data[i], 1e-3);
} }
} }
} }
// Turn on the IR pass supportion, run a real inference and check the result.
TEST(Analyzer, SupportIRPass) {
FLAGS_IA_enable_ir = true;
FLAGS_IA_enable_tensorrt_subgraph_engine = false;
FLAGS_IA_output_storage_path = "./analysis.out";
Argument argument(FLAGS_inference_model_dir);
argument.model_output_store_path.reset(new std::string("./analysis.out"));
Analyzer analyzer;
analyzer.Run(&argument);
// Should get the transformed model stored to ./analysis.out
ASSERT_TRUE(PathExists("./analysis.out"));
// Inference from this path.
TestWord2vecPrediction("./analysis.out");
}
// Directly infer with the original model. // Directly infer with the original model.
TEST(Analyzer, DituRNN_without_analysis) { TEST(Analyzer, DituRNN_without_analysis) {
TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data, TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
...@@ -365,5 +339,8 @@ TEST(Analyzer, DituRNN_with_analysis_with_IR) { ...@@ -365,5 +339,8 @@ TEST(Analyzer, DituRNN_with_analysis_with_IR) {
} // namespace paddle } // namespace paddle
USE_PASS(fc_fuse_pass); USE_PASS(fc_fuse_pass);
USE_PASS(seq_concat_fc_fuse_pass);
USE_PASS(fc_lstm_fuse_pass);
USE_PASS(graph_viz_pass); USE_PASS(graph_viz_pass);
USE_PASS(infer_clean_graph_pass); USE_PASS(infer_clean_graph_pass);
USE_PASS(attention_lstm_fuse_pass);
...@@ -26,6 +26,7 @@ ...@@ -26,6 +26,7 @@
#include <string> #include <string>
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h"
#include "paddle/fluid/platform/variant.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -58,6 +59,46 @@ struct Argument { ...@@ -58,6 +59,46 @@ struct Argument {
// The output storage path of ModelStorePass. // The output storage path of ModelStorePass.
std::unique_ptr<std::string> model_output_store_path; std::unique_ptr<std::string> model_output_store_path;
// Support for any other attributes.
template <typename T>
void Set(const std::string& key, T* data) {
PADDLE_ENFORCE_NOT_NULL(data);
PADDLE_ENFORCE(!attrs_.count(key), "duplicate attr called %s", key);
attrs_[key] = data;
attr_deleters_[key] = [data, key, this]() {
VLOG(3) << "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
VLOG(3) << "argument delete attr: " << key;
delete data;
};
}
bool Has(const std::string& name) const { return attrs_.count(name); }
template <typename T>
T* Release(const std::string& key) {
PADDLE_ENFORCE(attrs_.count(key));
auto* res = boost::any_cast<T*>(attrs_.at(key));
attrs_.erase(key);
attr_deleters_.erase(key);
return res;
}
template <typename T>
T& Get(const std::string& key) {
PADDLE_ENFORCE(Has(key));
return *boost::any_cast<T*>(attrs_.at(key));
}
~Argument() {
for (auto& item : attr_deleters_) {
item.second();
}
}
private:
std::unordered_map<std::string, boost::any> attrs_;
std::unordered_map<std::string, std::function<void()>> attr_deleters_;
}; };
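// Illustrative sketch of the attribute slots above (the "param_scope" key and
// the framework::Scope payload mirror the use in fluid_to_ir_pass later in
// this change):
//
//   argument->Set("param_scope", new framework::Scope);  // Argument owns it
//   if (argument->Has("param_scope")) {
//     auto& scope = argument->Get<framework::Scope>("param_scope");
//     // ... read or modify parameters in `scope` ...
//   }
//   // Release() hands ownership back to the caller and skips the deleter:
//   // std::unique_ptr<framework::Scope> scope(
//   //     argument->Release<framework::Scope>("param_scope"));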
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0) #define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include "paddle/fluid/framework/proto_desc.h" #include "paddle/fluid/framework/proto_desc.h"
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/io.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -65,6 +66,10 @@ void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) { ...@@ -65,6 +66,10 @@ void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) {
} }
} }
if (argument_->Has("param_scope")) {
LOG(WARNING) << "parameter changes in the scope takes effect";
}
PADDLE_ENFORCE(argument_->transformed_program_desc.get()); PADDLE_ENFORCE(argument_->transformed_program_desc.get());
} }
......
...@@ -29,13 +29,13 @@ namespace paddle { ...@@ -29,13 +29,13 @@ namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
static size_t dot_node_counter{0};
/* /*
* A Dot template that helps to build a DOT graph definition. * A Dot template that helps to build a DOT graph definition.
*/ */
class Dot { class Dot {
public: public:
static size_t counter;
struct Attr { struct Attr {
std::string key; std::string key;
std::string value; std::string value;
...@@ -57,7 +57,7 @@ class Dot { ...@@ -57,7 +57,7 @@ class Dot {
Node(const std::string& name, const std::vector<Attr>& attrs) Node(const std::string& name, const std::vector<Attr>& attrs)
: name(name), : name(name),
attrs(attrs), attrs(attrs),
id_("node_" + std::to_string(Dot::counter++)) {} id_("node_" + std::to_string(dot_node_counter++)) {}
std::string id() const { return id_; } std::string id() const { return id_; }
...@@ -65,6 +65,10 @@ class Dot { ...@@ -65,6 +65,10 @@ class Dot {
std::stringstream ss; std::stringstream ss;
CHECK(!name.empty()); CHECK(!name.empty());
ss << id_; ss << id_;
if (attrs.empty()) {
ss << "[label=" << '"' << name << '"' << "]";
return ss.str();
}
for (size_t i = 0; i < attrs.size(); i++) { for (size_t i = 0; i < attrs.size(); i++) {
if (i == 0) { if (i == 0) {
ss << "[label=" << '"' << name << '"' << " "; ss << "[label=" << '"' << name << '"' << " ";
...@@ -108,9 +112,11 @@ class Dot { ...@@ -108,9 +112,11 @@ class Dot {
explicit Dot(const std::vector<Attr>& attrs) : attrs_(attrs) {} explicit Dot(const std::vector<Attr>& attrs) : attrs_(attrs) {}
void AddNode(const std::string& name, const std::vector<Attr>& attrs) { void AddNode(const std::string& id, const std::vector<Attr>& attrs,
CHECK(!nodes_.count(name)) << "duplicate Node '" << name << "'"; std::string label = "") {
nodes_.emplace(name, Node{name, attrs}); CHECK(!nodes_.count(id)) << "duplicate Node '" << id << "'";
if (label.empty()) label = id;
nodes_.emplace(id, Node{label, attrs});
} }
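// Illustrative only: with the new `label` argument a caller can keep a stable
// node id while showing a human-readable label, e.g.
//
//   Dot dot;
//   dot.AddNode("node_0", {Dot::Attr("shape", "box")}, "conv2d");
//   dot.AddNode("node_1", {}, "conv2d.out");
//   dot.AddEdge("node_0", "node_1", {});
//   std::string dot_source = dot.Build();  // DOT text, renderable by graphviz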
void AddEdge(const std::string& source, const std::string& target, void AddEdge(const std::string& source, const std::string& target,
......
...@@ -13,3 +13,47 @@ ...@@ -13,3 +13,47 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace inference {
namespace analysis {
void FluidToIrPass::EnableParamModify(const std::string &model_dir,
const std::string &prog_file,
const std::string &param_file) {
PADDLE_ENFORCE(argument_);
argument_->Set("param_scope", new framework::Scope);
// Load parameters.
VLOG(3) << "Loading parameters from " << model_dir;
LoadParams(&argument_->Get<framework::Scope>("param_scope"), model_dir,
prog_file, param_file);
}
bool FluidToIrPass::LoadParams(framework::Scope *scope, const std::string &dir,
const std::string &prog_file,
const std::string &param_file) {
platform::CPUPlace place;
platform::CPUDeviceContext ctx(place);
framework::Executor executor(place);
PADDLE_ENFORCE(argument_->origin_program_desc.get());
framework::ProgramDesc program(*argument_->origin_program_desc);
if ((!prog_file.empty()) && (!param_file.empty())) {
LOG(INFO) << "load single model file from " << prog_file;
Load(&executor, scope, prog_file, param_file);
} else if (!dir.empty()) {
LOG(INFO) << "load from dir " << dir;
Load(&executor, scope, dir);
} else {
LOG(ERROR) << "failed to load parameters";
return false;
}
return true;
}
} // namespace analysis
} // namespace inference
} // namespace paddle
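A hedged sketch (not part of the patch) of how this loading path is expected to be driven: the caller points the Argument at a model and sets the pass list, and Initialize then calls EnableParamModify, which fills the "param_scope" attribute consumed later by the IR passes. The model directory below is illustrative.
Argument argument("/path/to/inference_model");  // hypothetical model directory
argument.Set(kFluidToIrPassesAttr,
             new std::vector<std::string>({"infer_clean_graph_pass", "fc_fuse_pass"}));
FluidToIrPass pass;
pass.Initialize(&argument);         // loads the weights into "param_scope"
pass.Run(argument.main_dfg.get());  // IR passes may now read the scope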
...@@ -21,12 +21,17 @@ namespace paddle { ...@@ -21,12 +21,17 @@ namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
static const char kFluidToIrPassesAttr[] = "__fluid_to_ir_passes__";
class FluidToIrPass final : public DataFlowGraphPass { class FluidToIrPass final : public DataFlowGraphPass {
public: public:
FluidToIrPass() = default; FluidToIrPass() = default;
bool Initialize(Argument *argument) override { bool Initialize(Argument *argument) override {
ANALYSIS_ARGUMENT_CHECK_FIELD(argument); ANALYSIS_ARGUMENT_CHECK_FIELD(argument);
PADDLE_ENFORCE(argument->Has(kFluidToIrPassesAttr),
"argument need the attr %s", kFluidToIrPassesAttr);
argument_ = argument;
if (argument->origin_program_desc) { if (argument->origin_program_desc) {
LOG(WARNING) << "argument's origin_program_desc is already set, might " LOG(WARNING) << "argument's origin_program_desc is already set, might "
"duplicate called"; "duplicate called";
...@@ -46,12 +51,21 @@ class FluidToIrPass final : public DataFlowGraphPass { ...@@ -46,12 +51,21 @@ class FluidToIrPass final : public DataFlowGraphPass {
if (!argument->main_dfg) { if (!argument->main_dfg) {
argument->main_dfg.reset(new DataFlowGraph); argument->main_dfg.reset(new DataFlowGraph);
} }
// Persist the ProgramDesc in graph's attribute. The IR graph just keep the argument->Set("ir_program_desc", new framework::ProgramDesc(program));
// address, will segfault if the original ProgramDesc destroys.
auto &ir_program_p = argument->main_dfg->Attr("ir_program_desc").Pointer(); LOG(INFO) << "Loading parameters";
ir_program_p = new framework::ProgramDesc(program); // Load parameters to argument if needed.
if (argument->fluid_model_dir || (argument->fluid_model_program_path &&
argument->fluid_model_param_path)) {
#define SAFE_GET(ATTR) std::string ATTR = argument->ATTR ? *argument->ATTR : "";
SAFE_GET(fluid_model_dir);
SAFE_GET(fluid_model_program_path);
SAFE_GET(fluid_model_param_path);
#undef SAFE_GET
EnableParamModify(fluid_model_dir, fluid_model_program_path,
fluid_model_param_path);
}
argument_ = argument;
return true; return true;
} }
...@@ -59,20 +73,36 @@ class FluidToIrPass final : public DataFlowGraphPass { ...@@ -59,20 +73,36 @@ class FluidToIrPass final : public DataFlowGraphPass {
void Run(DataFlowGraph *graph) override { void Run(DataFlowGraph *graph) override {
// Call all the IR Passes // Call all the IR Passes
IRPassManager ir_passes(*static_cast<framework::ProgramDesc *>( IRPassManager ir_passes(
argument_->main_dfg->Attr("ir_program_desc").Pointer())); argument_->Get<framework::ProgramDesc>("ir_program_desc"), nullptr);
ir_passes.Apply(std::vector<std::string>( // Pass the scope from analysis to IR if needed.
{// Manual update the passes here. if (argument_->Has("param_scope")) {
"graph_viz_pass", "infer_clean_graph_pass", "graph_viz_pass", // Here the address is passed, attention that IR doesn't own the scope, so
"fc_fuse_pass", "graph_viz_pass"})); // the real scope in analysis should live during the IR phase.
ir_passes.graph().Set(
"param_scope", new framework::Scope *(
&argument_->Get<framework::Scope>("param_scope")));
}
const auto &ir_passes_to_apply =
argument_->Get<std::vector<std::string>>(kFluidToIrPassesAttr);
ir_passes.Apply(ir_passes_to_apply);
PADDLE_ENFORCE(argument_->main_dfg.get()); PADDLE_ENFORCE(argument_->main_dfg.get());
argument_->main_dfg->Build(ir_passes.graph()); argument_->main_dfg->Build(ir_passes.graph());
// PADDLE_ENFORCE(argument_->main_dfg->IsFullyConnected());
} }
void EnableParamModify(const std::string &model_dir,
const std::string &prog_file,
const std::string &param_file);
std::string repr() const override { return "fluid-to-ir-pass"; } std::string repr() const override { return "fluid-to-ir-pass"; }
private:
// Load parameters from a single file or from a directory.
bool LoadParams(framework::Scope *scope, const std::string &dir,
const std::string &prog_file, const std::string &param_file);
private: private:
Argument *argument_{nullptr}; Argument *argument_{nullptr};
}; };
......
...@@ -24,6 +24,8 @@ namespace analysis { ...@@ -24,6 +24,8 @@ namespace analysis {
TEST(FluidToIrPass, Test) { TEST(FluidToIrPass, Test) {
FluidToIrPass pass; FluidToIrPass pass;
Argument argument(FLAGS_inference_model_dir); Argument argument(FLAGS_inference_model_dir);
argument.Set(kFluidToIrPassesAttr,
new std::vector<std::string>({"infer_clean_graph_pass"}));
pass.Initialize(&argument); pass.Initialize(&argument);
pass.Run(argument.main_dfg.get()); pass.Run(argument.main_dfg.get());
} }
...@@ -32,6 +34,9 @@ TEST(FluidToIrPass, Test) { ...@@ -32,6 +34,9 @@ TEST(FluidToIrPass, Test) {
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
USE_PASS(fc_fuse_pass);
USE_PASS(graph_viz_pass); USE_PASS(graph_viz_pass);
USE_PASS(infer_clean_graph_pass); USE_PASS(infer_clean_graph_pass);
USE_PASS(attention_lstm_fuse_pass);
USE_PASS(fc_lstm_fuse_pass);
USE_PASS(seq_concat_fc_fuse_pass);
USE_PASS(fc_fuse_pass);
...@@ -14,20 +14,24 @@ ...@@ -14,20 +14,24 @@
#include "paddle/fluid/inference/analysis/ir_pass_manager.h" #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
#include <string> #include <string>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
IRPassManager::IRPassManager(const ProgramDesc& program) { IRPassManager::IRPassManager(const ProgramDesc &program,
framework::Scope *scope)
: program_(program) {
graph_.reset(new framework::ir::Graph(program)); graph_.reset(new framework::ir::Graph(program));
if (scope) graph_->Set("param_scope", new framework::Scope *(scope));
} }
void IRPassManager::Apply(const std::vector<std::string>& passes) { void IRPassManager::Apply(const std::vector<std::string> &passes) {
graph_->Set("graph_viz_path", new std::string("./1.dot"));
// Apply all the passes // Apply all the passes
std::string pre_pass; std::string pre_pass;
for (const std::string& pass_name : passes) { for (const std::string &pass_name : passes) {
LOG(WARNING) << "Running IR pass [" << pass_name << "]"; LOG(WARNING) << "Running IR pass [" << pass_name << "]";
auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); auto pass = framework::ir::PassRegistry::Instance().Get(pass_name);
if (pass_name == "graph_viz_pass") { if (pass_name == "graph_viz_pass") {
......
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -31,14 +32,15 @@ using framework::ProgramDesc; ...@@ -31,14 +32,15 @@ using framework::ProgramDesc;
class IRPassManager final { class IRPassManager final {
public: public:
IRPassManager(const ProgramDesc& program); IRPassManager(const ProgramDesc &program, framework::Scope *scope);
void Apply(const std::vector<std::string>& passes); void Apply(const std::vector<std::string> &passes);
framework::ir::Graph& graph() const { return *graph_; } framework::ir::Graph &graph() const { return *graph_; }
private: private:
std::unique_ptr<framework::ir::Graph> graph_; std::unique_ptr<framework::ir::Graph> graph_;
ProgramDesc program_;
}; };
} // namespace analysis } // namespace analysis
......
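A hedged usage sketch of the new two-argument constructor (not part of the patch): the optional scope lets parameter-dependent IR passes such as fc_fuse_pass read the weights; pass nullptr when no parameters are needed. The ProgramDesc is assumed to have been loaded elsewhere.
framework::ProgramDesc program;   // assumed: deserialized via inference::Load elsewhere
framework::Scope param_scope;     // assumed: already filled with the model's parameters
IRPassManager manager(program, &param_scope);
manager.Apply({"infer_clean_graph_pass", "fc_fuse_pass"});
framework::ir::Graph &optimized_graph = manager.graph();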
...@@ -33,9 +33,9 @@ bool PassManager::Initialize(Argument* argument) { ...@@ -33,9 +33,9 @@ bool PassManager::Initialize(Argument* argument) {
void DfgPassManager::RunAll() { void DfgPassManager::RunAll() {
PADDLE_ENFORCE(argument_); PADDLE_ENFORCE(argument_);
LOG(INFO) << "Total " << data_.size() << " passes"; LOG(INFO) << "Total " << data_.size() << " Analysys passes";
for (auto& pass : data_) { for (auto& pass : data_) {
LOG(WARNING) << "Running pass [" << pass->repr() << "]"; LOG(WARNING) << "Running Analysis pass [" << pass->repr() << "]";
pass->Run(argument_->main_dfg.get()); pass->Run(argument_->main_dfg.get());
} }
} }
......
...@@ -20,7 +20,7 @@ endif(APPLE) ...@@ -20,7 +20,7 @@ endif(APPLE)
set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager
graph_viz_pass fc_fuse_pass graph_viz_pass fc_fuse_pass
infer_clean_graph_pass infer_clean_graph_pass
) )
if(WITH_GPU AND TENSORRT_FOUND) if(WITH_GPU AND TENSORRT_FOUND)
...@@ -46,7 +46,8 @@ function(inference_api_test TARGET_NAME) ...@@ -46,7 +46,8 @@ function(inference_api_test TARGET_NAME)
endif(WITH_TESTING) endif(WITH_TESTING)
endfunction(inference_api_test) endfunction(inference_api_test)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc DEPS lod_tensor) cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor)
cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api)
cc_test(test_paddle_inference_api cc_test(test_paddle_inference_api
SRCS api_tester.cc SRCS api_tester.cc
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/utils/singleton.h"
namespace paddle {
using inference::analysis::Argument;
using inference::Singleton;
using inference::analysis::Analyzer;
using framework::proto::ProgramDesc;
/* This predictor is based on the original native predictor with IR and Analysis
* support. It will optimize the IR and parameters at runtime.
* TODO(Superjomn) Replace the Native predictor?
*/
class AnalysisPredictor : public NativePaddlePredictor {
public:
explicit AnalysisPredictor(const NativeConfig& config)
: NativePaddlePredictor(config), config_(config) {}
bool Init(const std::shared_ptr<framework::Scope>& parent_scope) {
VLOG(3) << "Predictor::init()";
if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device);
} else {
place_ = paddle::platform::CPUPlace();
}
PADDLE_ENFORCE(!parent_scope);
if (parent_scope) {
scope_ = parent_scope;
sub_scope_ = &(parent_scope->NewScope());
} else {
paddle::framework::InitDevices(false);
scope_.reset(new paddle::framework::Scope());
}
executor_.reset(new paddle::framework::Executor(place_));
// Initialize the inference program
if (!config_.model_dir.empty()) {
// Parameters are saved in separate files located in
// the specified `dirname`.
inference_program_ = paddle::inference::Load(
executor_.get(), scope_.get(), config_.model_dir);
} else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
// All parameters are saved in a single file.
// The file names should be consistent with that used
// in Python API `fluid.io.save_inference_model`.
inference_program_ = paddle::inference::Load(
executor_.get(), scope_.get(), config_.prog_file, config_.param_file);
} else {
LOG(ERROR) << "fail to load inference model.";
return false;
}
OptimizeInferenceProgram();
ctx_ = executor_->Prepare(*inference_program_, 0);
VLOG(5) << "to create variables";
PADDLE_ENFORCE(scope_.get());
executor_->CreateVariables(*inference_program_,
sub_scope_ ? sub_scope_ : scope_.get(), 0);
// Get the feed_target_names and fetch_target_names
feed_target_names_ = inference_program_->GetFeedTargetNames();
fetch_target_names_ = inference_program_->GetFetchTargetNames();
return true;
}
bool Run(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data,
int batch_size = -1) override {
return NativePaddlePredictor::Run(inputs, output_data, batch_size);
}
void OptimizeInferenceProgram() {
LOG(INFO) << "optimize begin";
FLAGS_IA_enable_ir = true;
FLAGS_IA_enable_tensorrt_subgraph_engine = false;
FLAGS_IA_output_storage_path = ""; // Don't output the model.
// Analyze inference_program
Argument argument;
if (!config_.model_dir.empty()) {
argument.fluid_model_dir.reset(new std::string(config_.model_dir));
} else {
PADDLE_ENFORCE(
!config_.param_file.empty(),
"Either model_dir or (param_file, prog_file) should be set.");
PADDLE_ENFORCE(!config_.prog_file.empty());
argument.fluid_model_program_path.reset(
new std::string(config_.prog_file));
argument.fluid_model_param_path.reset(
new std::string(config_.param_file));
}
argument.origin_program_desc.reset(
new ProgramDesc(*inference_program_->Proto()));
Singleton<Analyzer>::Global().Run(&argument);
CHECK(argument.transformed_program_desc);
VLOG(5) << "to prepare executor";
// LOG(INFO) << "transformed_parogram_desc " <<
// argument.transformed_program_desc->DebugString();
inference_program_.reset(
new framework::ProgramDesc(*argument.transformed_program_desc));
PADDLE_ENFORCE(argument.Has("param_scope"));
// Update scope.
scope_.reset(argument.Release<framework::Scope>("param_scope"));
LOG(INFO) << "optimize end ==";
}
private:
NativeConfig config_;
};
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
NativeConfig, PaddleEngineKind::kAnalysis>(const NativeConfig& config) {
VLOG(3) << "create NativePredictor";
if (config.use_gpu) {
// 1. GPU memory
PADDLE_ENFORCE_GT(
config.fraction_of_gpu_memory, 0.f,
"fraction_of_gpu_memory in the config should be set to range (0., 1.]");
PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
std::vector<std::string> flags;
if (config.fraction_of_gpu_memory >= 0.0f &&
config.fraction_of_gpu_memory <= 0.95f) {
flags.push_back("dummpy");
std::string flag = "--fraction_of_gpu_memory_to_use=" +
std::to_string(config.fraction_of_gpu_memory);
flags.push_back(flag);
VLOG(3) << "set flag: " << flag;
framework::InitGflags(flags);
}
}
std::unique_ptr<PaddlePredictor> predictor(new AnalysisPredictor(config));
if (!dynamic_cast<AnalysisPredictor*>(predictor.get())->Init(nullptr)) {
return nullptr;
}
return predictor;
}
} // namespace paddle
USE_PASS(fc_fuse_pass);
USE_PASS(graph_viz_pass);
USE_PASS(infer_clean_graph_pass);
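A hedged end-to-end sketch of the new kAnalysis engine (not part of the patch); the model path, feed name, and input shape below are illustrative only.
NativeConfig config;
config.model_dir = "/path/to/inference_model";  // hypothetical path
config.use_gpu = false;
auto predictor =
    CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kAnalysis>(config);
PaddleTensor input;
input.name = "x";                               // hypothetical feed name
input.shape = {1, 128};
input.dtype = PaddleDType::FLOAT32;
input.data.Resize(128 * sizeof(float));         // fill with real feature values in practice
std::vector<PaddleTensor> outputs;
predictor->Run({input}, &outputs);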
...@@ -32,6 +32,7 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { ...@@ -32,6 +32,7 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
: NativePaddlePredictor(config), config_(config) {} : NativePaddlePredictor(config), config_(config) {}
bool Init(const std::shared_ptr<framework::Scope>& parent_scope) { bool Init(const std::shared_ptr<framework::Scope>& parent_scope) {
FLAGS_IA_enable_tensorrt_subgraph_engine = true;
VLOG(3) << "Predictor::init()"; VLOG(3) << "Predictor::init()";
FLAGS_tensorrt_max_batch_size = config_.max_batch_size; FLAGS_tensorrt_max_batch_size = config_.max_batch_size;
FLAGS_tensorrt_workspace_size = config_.workspace_size; FLAGS_tensorrt_workspace_size = config_.workspace_size;
...@@ -161,3 +162,4 @@ USE_TRT_CONVERTER(fc); ...@@ -161,3 +162,4 @@ USE_TRT_CONVERTER(fc);
USE_TRT_CONVERTER(pool2d); USE_TRT_CONVERTER(pool2d);
USE_TRT_CONVERTER(softmax); USE_TRT_CONVERTER(softmax);
USE_TRT_CONVERTER(batch_norm); USE_TRT_CONVERTER(batch_norm);
USE_TRT_CONVERTER(concat);
...@@ -37,6 +37,7 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) { ...@@ -37,6 +37,7 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) {
config1.use_gpu = true; config1.use_gpu = true;
config1.fraction_of_gpu_memory = 0.3; config1.fraction_of_gpu_memory = 0.3;
config1.device = 0; config1.device = 0;
config1.max_batch_size = 10;
auto predictor0 = auto predictor0 =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0); CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0);
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/api/helper.h"
namespace paddle {
namespace inference {
template <>
std::string to_string<std::vector<float>>(
const std::vector<std::vector<float>> &vec) {
std::stringstream ss;
for (const auto &piece : vec) {
ss << to_string(piece) << "\n";
}
return ss.str();
}
template <>
std::string to_string<std::vector<std::vector<float>>>(
const std::vector<std::vector<std::vector<float>>> &vec) {
std::stringstream ss;
for (const auto &line : vec) {
for (const auto &rcd : line) {
ss << to_string(rcd) << ";\t";
}
ss << '\n';
}
return ss.str();
}
} // namespace inference
} // namespace paddle
...@@ -44,7 +44,8 @@ class Timer { ...@@ -44,7 +44,8 @@ class Timer {
} }
}; };
void split(const std::string &str, char sep, std::vector<std::string> *pieces) { static void split(const std::string &str, char sep,
std::vector<std::string> *pieces) {
pieces->clear(); pieces->clear();
if (str.empty()) { if (str.empty()) {
return; return;
...@@ -60,7 +61,8 @@ void split(const std::string &str, char sep, std::vector<std::string> *pieces) { ...@@ -60,7 +61,8 @@ void split(const std::string &str, char sep, std::vector<std::string> *pieces) {
pieces->push_back(str.substr(pos)); pieces->push_back(str.substr(pos));
} }
} }
void split_to_float(const std::string &str, char sep, std::vector<float> *fs) { static void split_to_float(const std::string &str, char sep,
std::vector<float> *fs) {
std::vector<std::string> pieces; std::vector<std::string> pieces;
split(str, sep, &pieces); split(str, sep, &pieces);
std::transform(pieces.begin(), pieces.end(), std::back_inserter(*fs), std::transform(pieces.begin(), pieces.end(), std::back_inserter(*fs),
...@@ -76,27 +78,14 @@ std::string to_string(const std::vector<T> &vec) { ...@@ -76,27 +78,14 @@ std::string to_string(const std::vector<T> &vec) {
} }
template <> template <>
std::string to_string<std::vector<float>>( std::string to_string<std::vector<float>>(
const std::vector<std::vector<float>> &vec) { const std::vector<std::vector<float>> &vec);
std::stringstream ss;
for (const auto &piece : vec) {
ss << to_string(piece) << "\n";
}
return ss.str();
}
template <> template <>
std::string to_string<std::vector<std::vector<float>>>( std::string to_string<std::vector<std::vector<float>>>(
const std::vector<std::vector<std::vector<float>>> &vec) { const std::vector<std::vector<std::vector<float>>> &vec);
std::stringstream ss;
for (const auto &line : vec) {
for (const auto &rcd : line) {
ss << to_string(rcd) << ";\t";
}
ss << '\n';
}
return ss.str();
}
// clang-format off // clang-format off
void TensorAssignData(PaddleTensor *tensor, const std::vector<std::vector<float>> &data) { static void TensorAssignData(PaddleTensor *tensor, const std::vector<std::vector<float>> &data) {
// Assign buffer // Assign buffer
int dim = std::accumulate(tensor->shape.begin(), tensor->shape.end(), 1, [](int a, int b) { return a * b; }); int dim = std::accumulate(tensor->shape.begin(), tensor->shape.end(), 1, [](int a, int b) { return a * b; });
tensor->data.Resize(sizeof(float) * dim); tensor->data.Resize(sizeof(float) * dim);
......
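For reference, a hedged example of the (now file-local) helpers declared above; the inputs are illustrative.
std::vector<std::string> pieces;
split("a;b;c", ';', &pieces);                 // -> {"a", "b", "c"}
std::vector<float> values;
split_to_float("0.5,1.5,2.5", ',', &values);  // -> {0.5f, 1.5f, 2.5f}
std::string joined = to_string(values);       // joined per the to_string template above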
...@@ -77,6 +77,7 @@ enum class PaddleEngineKind { ...@@ -77,6 +77,7 @@ enum class PaddleEngineKind {
kNative = 0, // Use the native Fluid facility. kNative = 0, // Use the native Fluid facility.
kAnakin, // Use Anakin for inference. kAnakin, // Use Anakin for inference.
kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT. kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT.
kAnalysis  // Use analysis and IR passes to optimize, then run natively.
// TODO(Superjomn) support following engines later. // TODO(Superjomn) support following engines later.
// kTensorRT, // Use TensorRT for inference. // kTensorRT, // Use TensorRT for inference.
// kAutoMixedAnakin, // Automatically mix Fluid with Anakin. // kAutoMixedAnakin, // Automatically mix Fluid with Anakin.
......
...@@ -143,5 +143,21 @@ std::unique_ptr<framework::ProgramDesc> Load( ...@@ -143,5 +143,21 @@ std::unique_ptr<framework::ProgramDesc> Load(
return main_program; return main_program;
} }
void SaveVars(const framework::Scope& scope,
const std::vector<std::string>& vars, const std::string& dirname,
bool predicate) {
framework::ProgramDesc prog;
auto* block = prog.MutableBlock(0);
auto* op = block->AppendOp();
op->SetType("save_combine");
op->SetInput("X", vars);
op->SetAttr("file_path", dirname + "/param");
op->CheckAttrs();
platform::CPUPlace place;
framework::Executor exe(place);
exe.Run(prog, const_cast<framework::Scope*>(&scope), 0, true, true);
}
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -41,5 +41,10 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor, ...@@ -41,5 +41,10 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
const std::string& prog_filename, const std::string& prog_filename,
const std::string& param_filename); const std::string& param_filename);
// Save the variables from a scope to disk.
void SaveVars(const framework::Scope& scope,
const std::vector<std::string>& vars, const std::string& dirname,
bool predicate = true);
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
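A hedged sketch of calling the new SaveVars helper (not part of the patch): it appends a save_combine op and runs it with a CPU executor, writing the listed variables to <dirname>/param. The variable names and output directory are illustrative.
framework::Scope scope;
// ... assume the persistable parameters have already been loaded into `scope` ...
std::vector<std::string> var_names = {"fc_0.w_0", "fc_0.b_0"};  // hypothetical names
paddle::inference::SaveVars(scope, var_names, "/tmp/exported_model");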
# Add TRT tests # Add TRT tests
nv_library(tensorrt_converter nv_library(tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
batch_norm_op.cc activation_op.cc softmax_op.cc batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc
DEPS tensorrt_engine operator scope framework_proto op_registry) DEPS tensorrt_engine operator scope framework_proto op_registry)
nv_test(test_op_converter SRCS test_op_converter.cc DEPS nv_test(test_op_converter SRCS test_op_converter.cc DEPS
...@@ -18,12 +18,12 @@ nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc ...@@ -18,12 +18,12 @@ nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine conv_op SERIAL) DEPS ${FLUID_CORE_MODULES} tensorrt_engine conv_op SERIAL)
nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op SERIAL) DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op SERIAL)
nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine elementwise_add_op SERIAL) DEPS ${FLUID_CORE_MODULES} tensorrt_engine elementwise_add_op SERIAL)
nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine softmax_op SERIAL) DEPS ${FLUID_CORE_MODULES} tensorrt_engine softmax_op SERIAL)
nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine batch_norm_op SERIAL) DEPS ${FLUID_CORE_MODULES} tensorrt_engine batch_norm_op SERIAL)
nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine concat_op SERIAL)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace tensorrt {
/*
* ConcatOp, IConcatenationLayer in TRT. This layer doesn't have weights.
*/
class ConcatOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
VLOG(4) << "convert a fluid mul op to tensorrt mul layer without bias";
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
std::vector<nvinfer1::ITensor*> itensors;
for (auto& input_name : op_desc.Input("X")) {
itensors.push_back(engine_->GetITensor(input_name));
}
int axis = boost::get<int>(op_desc.GetAttr("axis"));
PADDLE_ENFORCE(axis > 0,
"The axis attr of Concat op should be large than 0 for trt");
auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Concatenation, itensors.data(),
itensors.size());
axis = axis - 1; // Remove batch dim
layer->setAxis(axis);
auto output_name = op_desc.Output("Out")[0];
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) { // the test framework can not determine which is the
// output, so place the declaration inside.
engine_->DeclareOutput(output_name);
}
}
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
REGISTER_TRT_OP_CONVERTER(concat, ConcatOpConverter);
...@@ -79,6 +79,14 @@ class OpConverter { ...@@ -79,6 +79,14 @@ class OpConverter {
it = it =
Registry<OpConverter>::Lookup("elementwise_" + op_type + "_tensor"); Registry<OpConverter>::Lookup("elementwise_" + op_type + "_tensor");
} }
PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
op_desc.Type());
}
if (op_desc.Type() == "depthwise_conv2d") {
it = Registry<OpConverter>::Lookup("conv2d");
PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
op_desc.Type());
} }
if (!it) { if (!it) {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace tensorrt {
TEST(concat_op, test) {
std::unordered_set<std::string> parameters({""});
framework::Scope scope;
TRTConvertValidation validator(10, parameters, scope, 1000);
validator.DeclInputVar("concat_x1", nvinfer1::DimsCHW(10, 3, 1));
validator.DeclInputVar("concat_x2", nvinfer1::DimsCHW(3, 3, 1));
validator.DeclInputVar("concat_x3", nvinfer1::DimsCHW(7, 3, 1));
validator.DeclOutputVar("concat_out", nvinfer1::DimsCHW(20, 3, 1));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("concat");
desc.SetInput("X", {"concat_x1", "concat_x2", "concat_x3"});
desc.SetOutput("Out", {"concat_out"});
int axis = 1;
desc.SetAttr("axis", axis);
validator.SetOp(*desc.Proto());
validator.Execute(5);
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
USE_OP(concat);
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/io.h" #include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -135,6 +136,15 @@ std::vector<std::vector<int64_t>> GetFeedTargetShapes( ...@@ -135,6 +136,15 @@ std::vector<std::vector<int64_t>> GetFeedTargetShapes(
return feed_target_shapes; return feed_target_shapes;
} }
void Compile(paddle::framework::ProgramDesc* program) {
std::unique_ptr<paddle::framework::ir::Graph> g(
new paddle::framework::ir::Graph(*program));
auto pass = paddle::framework::ir::PassRegistry::Instance().Get(
"graph_to_program_pass");
pass->SetNotOwned<paddle::framework::ProgramDesc>("program", program);
pass->Apply(std::move(g));
}
template <typename Place, bool CreateVars = true, bool PrepareContext = false> template <typename Place, bool CreateVars = true, bool PrepareContext = false>
void TestInference(const std::string& dirname, void TestInference(const std::string& dirname,
const std::vector<paddle::framework::LoDTensor*>& cpu_feeds, const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
...@@ -172,6 +182,8 @@ void TestInference(const std::string& dirname, ...@@ -172,6 +182,8 @@ void TestInference(const std::string& dirname,
paddle::platform::DeviceContextPool::Instance().Get(place)); paddle::platform::DeviceContextPool::Instance().Get(place));
inference_program = InitProgram(&executor, scope, dirname, is_combined); inference_program = InitProgram(&executor, scope, dirname, is_combined);
} }
Compile(inference_program.get());
// Disable the profiler and print the timing information // Disable the profiler and print the timing information
paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault, paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault,
"load_program_profiler"); "load_program_profiler");
...@@ -249,3 +261,5 @@ void TestInference(const std::string& dirname, ...@@ -249,3 +261,5 @@ void TestInference(const std::string& dirname,
delete scope; delete scope;
} }
USE_PASS(graph_to_program_pass);
...@@ -291,6 +291,8 @@ op_library(unsqueeze_op DEPS reshape_op) ...@@ -291,6 +291,8 @@ op_library(unsqueeze_op DEPS reshape_op)
op_library(squeeze_op DEPS reshape_op) op_library(squeeze_op DEPS reshape_op)
op_library(extract_rows_op DEPS memory) op_library(extract_rows_op DEPS memory)
op_library(flatten_op DEPS reshape_op) op_library(flatten_op DEPS reshape_op)
op_library(sequence_pad_op DEPS sequence_padding)
op_library(unstack_op DEPS stack_op)
if (WITH_GPU) if (WITH_GPU)
op_library(conv_op DEPS vol2col depthwise_conv im2col) op_library(conv_op DEPS vol2col depthwise_conv im2col)
......
...@@ -56,7 +56,7 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -56,7 +56,7 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
const int D = w_dims[1] / 4; const int D = w_dims[1] / 4;
PADDLE_ENFORCE_EQ(w_dims.size(), 2, "Input(LSTMWeight)'s rank must be 2."); PADDLE_ENFORCE_EQ(w_dims.size(), 2, "Input(LSTMWeight)'s rank must be 2.");
PADDLE_ENFORCE_EQ(w_dims[0], D + M, PADDLE_ENFORCE_EQ(w_dims[0], D + M,
"LSTMWeight dims should be (%d + %d) * %d.", D + M, 4 * D); "LSTMWeight dims should be (%d + %d) * %d.", D, M, 4 * D);
auto b_dims = ctx->GetInputDim("LSTMBias"); auto b_dims = ctx->GetInputDim("LSTMBias");
PADDLE_ENFORCE_EQ(b_dims.size(), 2, "Input(LSTMBias)'s rank must be 2."); PADDLE_ENFORCE_EQ(b_dims.size(), 2, "Input(LSTMBias)'s rank must be 2.");
......
...@@ -60,20 +60,6 @@ class AucKernel : public framework::OpKernel<T> { ...@@ -60,20 +60,6 @@ class AucKernel : public framework::OpKernel<T> {
const T* inference_data = predict->data<T>(); const T* inference_data = predict->data<T>();
const auto* label_data = label->data<int64_t>(); const auto* label_data = label->data<int64_t>();
// check if states are inited.
auto* tp_in = ctx.Input<Tensor>("TP");
auto* fp_in = ctx.Input<Tensor>("FP");
auto* tn_in = ctx.Input<Tensor>("TN");
auto* fn_in = ctx.Input<Tensor>("FN");
PADDLE_ENFORCE(tp_in->IsInitialized(), "true_positive is not inited!");
PADDLE_ENFORCE(fp_in->IsInitialized(), "false_negative is not inited!");
PADDLE_ENFORCE(tn_in->IsInitialized(), "true_negative is not inited!");
PADDLE_ENFORCE(fn_in->IsInitialized(), "false_positive is not inited!");
PADDLE_ENFORCE_EQ(tp_in->numel(), num_thresholds, "");
PADDLE_ENFORCE_EQ(fp_in->numel(), num_thresholds, "");
PADDLE_ENFORCE_EQ(tn_in->numel(), num_thresholds, "");
PADDLE_ENFORCE_EQ(fn_in->numel(), num_thresholds, "");
auto* tp_data = true_positive->mutable_data<int64_t>(ctx.GetPlace()); auto* tp_data = true_positive->mutable_data<int64_t>(ctx.GetPlace());
auto* fn_data = false_negative->mutable_data<int64_t>(ctx.GetPlace()); auto* fn_data = false_negative->mutable_data<int64_t>(ctx.GetPlace());
auto* tn_data = true_negative->mutable_data<int64_t>(ctx.GetPlace()); auto* tn_data = true_negative->mutable_data<int64_t>(ctx.GetPlace());
......
...@@ -37,6 +37,95 @@ struct bn_type_traits { ...@@ -37,6 +37,95 @@ struct bn_type_traits {
using op_prim = typename op_type::primitive_desc; using op_prim = typename op_type::primitive_desc;
}; };
class BatchNormMKLDNNHandler : public platform::MKLDNNHandler {
public:
BatchNormMKLDNNHandler(
std::shared_ptr<batch_norm_fwd::primitive_desc> batch_norm_pd,
const platform::MKLDNNDeviceContext &dev_ctx, mkldnn::engine engine,
const std::string &base_key)
: platform::MKLDNNHandler(dev_ctx, engine, base_key) {
batch_norm_pd_ = batch_norm_pd;
}
std::shared_ptr<memory> AcquireScaleshiftMemoryFromPrimitive(void *ptr) {
return this->AcquireMemoryFromPrimitive(
batch_norm_pd_->weights_primitive_desc(), ptr, "@scaleshift_mem_p");
}
std::shared_ptr<memory> AcquireMeanMemoryFromPrimitive(void *ptr) {
return this->AcquireMemoryFromPrimitive(
batch_norm_pd_->mean_primitive_desc(), ptr, "@mean_mem_p");
}
std::shared_ptr<memory> AcquireVarianceMemoryFromPrimitive(void *ptr) {
return this->AcquireMemoryFromPrimitive(
batch_norm_pd_->variance_primitive_desc(), ptr, "@variance_mem_p");
}
std::shared_ptr<batch_norm_fwd> AcquireTestTrainingBatchNormFwd(
std::shared_ptr<memory> src_memory,
std::shared_ptr<memory> scaleshift_memory,
std::shared_ptr<memory> dst_memory, std::shared_ptr<memory> mean_memory,
std::shared_ptr<memory> variance_memory, bool is_test) {
auto prim_key = key_ + "@batch_norm_p";
auto batch_norm_p =
std::static_pointer_cast<batch_norm_fwd>(dev_ctx_.GetBlob(prim_key));
PADDLE_ENFORCE((batch_norm_p != nullptr) || !is_reusing_,
"Fail to find batch norm primitive in device context");
if (batch_norm_p == nullptr) {
if (is_test) {
batch_norm_p = std::make_shared<batch_norm_fwd>(
*batch_norm_pd_, *src_memory,
(const mkldnn::primitive::at &)*mean_memory,
(const mkldnn::primitive::at &)*variance_memory, *scaleshift_memory,
*dst_memory);
} else {
batch_norm_p = std::make_shared<batch_norm_fwd>(
*batch_norm_pd_, *src_memory, *scaleshift_memory, *dst_memory,
*mean_memory, *variance_memory);
}
dev_ctx_.SetBlob(prim_key, batch_norm_p);
} else {
is_reusing_ = true;
}
return batch_norm_p;
}
static std::string GetHash(const memory::dims &input_dims, float epsilon,
unsigned flag, bool is_test, memory::format format,
const std::string &suffix = "") {
auto dims2str = [](const memory::dims &operand_dims) {
std::string dstr = "";
for (size_t i = 0; i < operand_dims.size(); ++i) {
dstr += std::to_string(operand_dims[i]) + "-";
}
return dstr;
};
return dims2str(input_dims) + std::to_string(epsilon) +
std::to_string(flag) + std::to_string(is_test) +
std::to_string(format) + suffix;
}
private:
std::shared_ptr<batch_norm_fwd::primitive_desc> batch_norm_pd_;
};
std::shared_ptr<memory> UpdateMemoryData(
const platform::MKLDNNDeviceContext &dev_ctx, const std::string &key,
void *new_ptr) {
auto mem = std::static_pointer_cast<memory>(dev_ctx.GetBlob(key));
PADDLE_ENFORCE(
mem != nullptr,
(std::string("Fail to find memory in device context [key: ") + key + "]")
.c_str());
mem->set_data_handle(new_ptr);
return mem;
}
template <typename T, typename Container> template <typename T, typename Container>
void copy_to_weights(T scale_begin, T scale_end, T shift_begin, T shift_end, void copy_to_weights(T scale_begin, T scale_end, T shift_begin, T shift_end,
Container *c) { Container *c) {
...@@ -48,15 +137,6 @@ void copy_to_weights(T scale_begin, T scale_end, T shift_begin, T shift_end, ...@@ -48,15 +137,6 @@ void copy_to_weights(T scale_begin, T scale_end, T shift_begin, T shift_end,
std::inserter(*c, std::next(it, std::distance(scale_begin, scale_end)))); std::inserter(*c, std::next(it, std::distance(scale_begin, scale_end))));
} }
template <typename Op, typename... Args>
void run_batch_norm_op(Args &&... args) {
Op batch_norm_op{args...};
std::vector<mkldnn::primitive> pipeline;
pipeline.push_back(batch_norm_op);
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
} // namespace } // namespace
template <typename T> template <typename T>
...@@ -110,6 +190,14 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -110,6 +190,14 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1"); PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1");
const unsigned int ic = scale_tz[0]; const unsigned int ic = scale_tz[0];
// MKLDNN requires a single piece of memory for scale and shift/bias data
const size_t scaleshift_size = 2 * ic;
std::vector<T> scaleshift_data;
scaleshift_data.reserve(scaleshift_size);
copy_to_weights(scale->data<T>(), scale->data<T>() + ic, shift->data<T>(),
shift->data<T>() + ic, &scaleshift_data);
unsigned flags = mkldnn::use_scale_shift; unsigned flags = mkldnn::use_scale_shift;
if (is_test) flags |= mkldnn::use_global_stats; if (is_test) flags |= mkldnn::use_global_stats;
if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu; if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu;
...@@ -118,64 +206,69 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -118,64 +206,69 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
mkldnn::memory::format input_format = mkldnn::memory::format input_format =
platform::MKLDNNFormatForSize(src_tz.size(), x->format()); platform::MKLDNNFormatForSize(src_tz.size(), x->format());
auto src_memory = memory( // keys for backward pass
{{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine}, const std::string key = BatchNormMKLDNNHandler::GetHash(
to_void_cast(x_data)); src_tz, epsilon, flags, is_test, input_format,
ctx.op().Output("SavedMean"));
const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
auto user_src_md = platform::MKLDNNMemDesc(
{src_tz}, platform::MKLDNNGetDataType<T>(), input_format);
// create primitive descriptor for batch norm forward // create primitive descriptor for batch norm forward
using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>; using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
auto batch_norm_fwd_desc = bn_fwd_types::op_desc{ auto batch_norm_fwd_desc =
propagation, src_memory.get_primitive_desc().desc(), epsilon, flags}; bn_fwd_types::op_desc{propagation, user_src_md, epsilon, flags};
std::shared_ptr<batch_norm_fwd::primitive_desc> batch_norm_fwd_pd = auto batch_norm_fwd_pd = std::make_shared<batch_norm_fwd::primitive_desc>(
std::shared_ptr<batch_norm_fwd::primitive_desc>( batch_norm_fwd_desc, mkldnn_engine);
new batch_norm_fwd::primitive_desc(batch_norm_fwd_desc, // Save conv_pd/src_memory/weights_memory for backward pass
mkldnn_engine));
// Save the pd to be used in backward pass
const std::string key = ctx.op().Output("SavedMean");
const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
dev_ctx.SetBlob(key_batch_norm_fwd_pd, batch_norm_fwd_pd); dev_ctx.SetBlob(key_batch_norm_fwd_pd, batch_norm_fwd_pd);
// MKLDNN requires a single piece of memory for scale and shift/bias data BatchNormMKLDNNHandler handler(batch_norm_fwd_pd, dev_ctx, mkldnn_engine,
const size_t scaleshift_size = 2 * ic; key);
std::vector<T> scaleshift_data;
scaleshift_data.reserve(scaleshift_size);
copy_to_weights(scale->data<T>(), scale->data<T>() + ic, shift->data<T>(), auto src_memory =
shift->data<T>() + ic, &scaleshift_data); handler.AcquireSrcMemory(user_src_md, to_void_cast(x_data));
// create mkldnn memory for weights(scale/shift) // create mkldnn memory for weights(scale/shift)
auto scaleshift_memory = memory(batch_norm_fwd_pd->weights_primitive_desc(), auto scaleshift_memory =
scaleshift_data.data()); handler.AcquireScaleshiftMemoryFromPrimitive(scaleshift_data.data());
// create mkldnn memory for output y tensor // create mkldnn memory for output y tensor
auto dst_memory = memory(batch_norm_fwd_pd->dst_primitive_desc(), y_data); auto dst_memory = handler.AcquireDstMemory(
batch_norm_fwd_pd->dst_primitive_desc().desc(), y_data);
std::shared_ptr<batch_norm_fwd> batch_norm_p;
if (is_test) { if (is_test) {
// create mkldnn memory for stats (as input) // create mkldnn memory for stats (as input)
auto mean_memory = memory(batch_norm_fwd_pd->mean_primitive_desc(), std::shared_ptr<memory> mean_memory =
to_void_cast(mean_data)); handler.AcquireMeanMemoryFromPrimitive(to_void_cast(mean_data));
auto variance_memory = std::shared_ptr<memory> variance_memory =
memory(batch_norm_fwd_pd->variance_primitive_desc(), handler.AcquireVarianceMemoryFromPrimitive(
to_void_cast(variance_data)); to_void_cast(variance_data));
run_batch_norm_op<typename bn_fwd_types::op_type>( batch_norm_p = handler.AcquireTestTrainingBatchNormFwd(
*batch_norm_fwd_pd, src_memory, src_memory, scaleshift_memory, dst_memory, mean_memory,
(const mkldnn::primitive::at &)mean_memory, variance_memory, true);
(const mkldnn::primitive::at &)variance_memory, scaleshift_memory,
dst_memory);
} else { } else {
// create mkldnn memory for stats (as output) // create mkldnn memory for stats (as output)
auto mean_memory = std::shared_ptr<memory> mean_memory =
memory(batch_norm_fwd_pd->mean_primitive_desc(), batch_mean_data); handler.AcquireMeanMemoryFromPrimitive(batch_mean_data);
auto variance_memory = memory( std::shared_ptr<memory> variance_memory =
batch_norm_fwd_pd->variance_primitive_desc(), batch_variance_data); handler.AcquireVarianceMemoryFromPrimitive(batch_variance_data);
run_batch_norm_op<bn_fwd_types::op_type>(*batch_norm_fwd_pd, src_memory, batch_norm_p = handler.AcquireTestTrainingBatchNormFwd(
scaleshift_memory, dst_memory, src_memory, scaleshift_memory, dst_memory, mean_memory,
mean_memory, variance_memory); variance_memory, false);
} }
y->set_layout(DataLayout::kMKLDNN);
y->set_format(platform::GetMKLDNNFormat(*dst_memory));
std::vector<mkldnn::primitive> pipeline;
pipeline.push_back(*batch_norm_p);
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
if (!is_test) { if (!is_test) {
// mkldnn only compute stats for current batch // mkldnn only compute stats for current batch
// so we need compute momentum stats via Eigen lib // so we need compute momentum stats via Eigen lib
...@@ -192,10 +285,6 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -192,10 +285,6 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
running_variance_e = running_variance_e =
variance_e * momentum + batch_variance_e * one_minus_momentum; variance_e * momentum + batch_variance_e * one_minus_momentum;
} }
y->set_layout(DataLayout::kMKLDNN);
y->set_format(
(memory::format)dst_memory.get_primitive_desc().desc().data.format);
} }
}; };
...@@ -242,61 +331,48 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -242,61 +331,48 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
const unsigned int ic = scale_tz[0]; const unsigned int ic = scale_tz[0];
// Retrieve bn_fwd_pd from device context
const std::string key = ctx.op().Input("SavedMean");
const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
auto batch_norm_fwd_pd =
std::static_pointer_cast<batch_norm_fwd::primitive_desc>(
dev_ctx.GetBlob(key_batch_norm_fwd_pd));
PADDLE_ENFORCE(batch_norm_fwd_pd != nullptr,
"Fail to find batch_norm_fwd_pd in device context");
using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>; using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>;
// create mkldnn memory from input diff_y tensor
mkldnn::memory::format dst_format = mkldnn::memory::format dst_format =
platform::MKLDNNFormatForSize(src_tz.size(), diff_y->format()); platform::MKLDNNFormatForSize(src_tz.size(), diff_y->format());
auto user_diff_dst_memory = memory(
{{{diff_dst_tz}, memory::data_type::f32, dst_format}, mkldnn_engine},
to_void_cast(diff_y_data));
// create mkldnn memory from input x tensor
mkldnn::memory::format input_format = mkldnn::memory::format input_format =
platform::MKLDNNFormatForSize(src_tz.size(), x->format()); platform::MKLDNNFormatForSize(src_tz.size(), x->format());
auto src_memory = memory( unsigned flags = mkldnn::use_scale_shift;
{{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine},
to_void_cast(x_data));
// for diff_dst, try to use same format as dst in forward pass // keys from forward pass
auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc(); const std::string key = BatchNormMKLDNNHandler::GetHash(
auto diff_dst_md = diff_dst_pd.desc(); src_tz, epsilon, flags, false, input_format,
ctx.op().Input("SavedMean"));
const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
// keys for primitives reuse
const std::string key_with_hash =
key + BatchNormMKLDNNHandler::GetHash(src_tz, epsilon, flags, false,
input_format);
const std::string key_batch_norm_bwd_p =
key_with_hash + "@batch_norm_bwd_p";
const std::string key_batch_norm_src_mem_p =
key_with_hash + "@batch_norm_bwd_src_mem_p";
const std::string key_batch_norm_mean_mem_p =
key_with_hash + "@batch_norm_bwd_mean_mem_p";
const std::string key_batch_norm_variance_mem_p =
key_with_hash + "@batch_norm_bwd_variance_mem_p";
const std::string key_batch_norm_scaleshift_mem_p =
key_with_hash + "@batch_norm_bwd_scaleshift_mem_p";
const std::string key_batch_norm_diff_scaleshift_mem_p =
key_with_hash + "@batch_norm_bwd_diff_scaleshift_mem_p";
const std::string key_batch_norm_diff_src_mem_p =
key_with_hash + "@batch_norm_bwd_diff_src_mem_p";
const std::string key_batch_norm_diff_dst_mem_p =
key_with_hash + "@batch_norm_bwd_diff_dst_mem_p";
// create primitive descriptor for batch norm backward
unsigned flags = mkldnn::use_scale_shift;
auto batch_norm_bwd_desc = bn_bwd_types::op_desc{
mkldnn::prop_kind::backward, diff_dst_md,
src_memory.get_primitive_desc().desc(), epsilon, flags};
auto batch_norm_bwd_pd = bn_bwd_types::op_prim{
batch_norm_bwd_desc, mkldnn_engine, *batch_norm_fwd_pd};
// reorder user_diff_dst if it's not in preferred format
auto diff_dst_memory = user_diff_dst_memory;
primitive reorder_diff_dst; primitive reorder_diff_dst;
bool is_diff_dst_reordered = false; bool is_diff_dst_reordered = false;
if (diff_dst_pd != user_diff_dst_memory.get_primitive_desc()) { auto user_diff_dst_memory = memory(
diff_dst_memory = memory(diff_dst_pd); {{{diff_dst_tz}, memory::data_type::f32, dst_format}, mkldnn_engine},
reorder_diff_dst = reorder(user_diff_dst_memory, diff_dst_memory); to_void_cast(diff_y_data));
is_diff_dst_reordered = true;
}
// create mkldnn memory for input tensors (src/mean/variance)
auto mean_memory = memory(batch_norm_bwd_pd.mean_primitive_desc(),
to_void_cast(batch_mean_data));
auto variance_memory = memory(batch_norm_bwd_pd.variance_primitive_desc(),
to_void_cast(batch_variance_data));
// MKLDNN requires a single piece of memory for scale and shift/bias data // MKLDNN requires a single piece of memory for scale and shift/bias data
const size_t scaleshift_size = 2 * ic; const size_t scaleshift_size = 2 * ic;
...@@ -306,30 +382,118 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -306,30 +382,118 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
copy_to_weights(scale_data, scale_data + ic, shift_data, shift_data + ic, copy_to_weights(scale_data, scale_data + ic, shift_data, shift_data + ic,
&scaleshift_data); &scaleshift_data);
// create mkldnn memory for input tensors (scale/shift)
auto scaleshift_memory = memory(batch_norm_bwd_pd.weights_primitive_desc(),
scaleshift_data.data());
// create mkldnn memory for output diff weights (combined scale/shift)
std::vector<T> diff_scaleshift_data; std::vector<T> diff_scaleshift_data;
diff_scaleshift_data.reserve(scaleshift_size); diff_scaleshift_data.reserve(scaleshift_size);
auto diff_scaleshift_memory =
memory(batch_norm_bwd_pd.diff_weights_primitive_desc(),
diff_scaleshift_data.data());
// here assume diff_src is in the same format of src auto batch_norm_fwd_pd =
auto diff_src_memory = memory(src_memory.get_primitive_desc(), diff_x_data); std::static_pointer_cast<batch_norm_fwd::primitive_desc>(
dev_ctx.GetBlob(key_batch_norm_fwd_pd));
PADDLE_ENFORCE(batch_norm_fwd_pd != nullptr,
"Fail to find batch_norm_fwd_pd in device context");
// finally create batch_norm backward primitive auto batch_norm_bwd_p = std::static_pointer_cast<batch_norm_bwd>(
auto batch_norm_bwd_prim = dev_ctx.GetBlob(key_batch_norm_bwd_p));
batch_norm_bwd(batch_norm_bwd_pd, src_memory, mean_memory,
variance_memory, diff_dst_memory, scaleshift_memory, if (batch_norm_bwd_p == nullptr) {
diff_src_memory, diff_scaleshift_memory); auto src_memory = std::shared_ptr<memory>(new memory(
{{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine},
to_void_cast(x_data)));
// for diff_dst, try to use same format as dst in forward pass
auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc();
auto diff_dst_md = diff_dst_pd.desc();
// create primitive descriptor for batch norm backward
auto batch_norm_bwd_desc = bn_bwd_types::op_desc{
mkldnn::prop_kind::backward, diff_dst_md,
src_memory->get_primitive_desc().desc(), epsilon, flags};
auto batch_norm_bwd_pd = bn_bwd_types::op_prim{
batch_norm_bwd_desc, mkldnn_engine, *batch_norm_fwd_pd};
// reorder user_diff_dst if it's not in preferred format
auto diff_dst_memory = std::make_shared<memory>(user_diff_dst_memory);
if (diff_dst_pd != user_diff_dst_memory.get_primitive_desc()) {
diff_dst_memory = std::make_shared<memory>(diff_dst_pd);
reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory);
is_diff_dst_reordered = true;
}
// create mkldnn memory for input tensors (src/mean/variance)
auto mean_memory =
std::make_shared<memory>(batch_norm_bwd_pd.mean_primitive_desc(),
to_void_cast(batch_mean_data));
auto variance_memory =
std::make_shared<memory>(batch_norm_bwd_pd.variance_primitive_desc(),
to_void_cast(batch_variance_data));
// create mkldnn memory for input tensors (scale/shift)
auto scaleshift_memory = std::make_shared<memory>(
batch_norm_bwd_pd.weights_primitive_desc(), scaleshift_data.data());
// create mkldnn memory for output diff weights (combined scale/shift)
auto diff_scaleshift_memory = std::make_shared<memory>(
batch_norm_bwd_pd.diff_weights_primitive_desc(),
diff_scaleshift_data.data());
// here assume diff_src is in the same format as src
auto diff_src_memory = std::make_shared<memory>(
src_memory->get_primitive_desc(), diff_x_data);
// finally create batch_norm backward primitive
batch_norm_bwd_p = std::make_shared<batch_norm_bwd>(
batch_norm_bwd_pd, *src_memory, *mean_memory, *variance_memory,
*diff_dst_memory, *scaleshift_memory, *diff_src_memory,
*diff_scaleshift_memory);
dev_ctx.SetBlob(key_batch_norm_bwd_p, batch_norm_bwd_p);
dev_ctx.SetBlob(key_batch_norm_src_mem_p, src_memory);
dev_ctx.SetBlob(key_batch_norm_mean_mem_p, mean_memory);
dev_ctx.SetBlob(key_batch_norm_variance_mem_p, variance_memory);
dev_ctx.SetBlob(key_batch_norm_scaleshift_mem_p, scaleshift_memory);
dev_ctx.SetBlob(key_batch_norm_diff_scaleshift_mem_p,
diff_scaleshift_memory);
dev_ctx.SetBlob(key_batch_norm_diff_src_mem_p, diff_src_memory);
dev_ctx.SetBlob(key_batch_norm_diff_dst_mem_p, diff_dst_memory);
// set layout/format of output tensors
diff_x->set_layout(DataLayout::kMKLDNN);
diff_x->set_format((memory::format)diff_src_memory->get_primitive_desc()
.desc()
.data.format);
} else {
// primitives already exist
UpdateMemoryData(dev_ctx, key_batch_norm_src_mem_p, to_void_cast(x_data));
UpdateMemoryData(dev_ctx, key_batch_norm_mean_mem_p,
to_void_cast(batch_mean_data));
UpdateMemoryData(dev_ctx, key_batch_norm_variance_mem_p,
to_void_cast(batch_variance_data));
UpdateMemoryData(dev_ctx, key_batch_norm_scaleshift_mem_p,
scaleshift_data.data());
UpdateMemoryData(dev_ctx, key_batch_norm_diff_scaleshift_mem_p,
diff_scaleshift_data.data());
auto diff_src_memory = UpdateMemoryData(
dev_ctx, key_batch_norm_diff_src_mem_p, to_void_cast(diff_x_data));
auto diff_dst_memory = UpdateMemoryData(
dev_ctx, key_batch_norm_diff_dst_mem_p, to_void_cast(diff_y_data));
// reorder user_diff_dst if it's not in preferred format
if (diff_dst_memory->get_primitive_desc() !=
user_diff_dst_memory.get_primitive_desc()) {
reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory);
is_diff_dst_reordered = true;
}
// set layout/format of output tensors
diff_x->set_layout(DataLayout::kMKLDNN);
diff_x->set_format((memory::format)diff_src_memory->get_primitive_desc()
.desc()
.data.format);
}
// execute optional reorder and batch_norm backward primitive // execute optional reorder and batch_norm backward primitive
std::vector<primitive> pipeline; std::vector<primitive> pipeline;
if (is_diff_dst_reordered) pipeline.push_back(reorder_diff_dst); if (is_diff_dst_reordered) pipeline.push_back(reorder_diff_dst);
pipeline.push_back(batch_norm_bwd_prim); pipeline.push_back(*batch_norm_bwd_p);
stream(stream::kind::eager).submit(pipeline).wait(); stream(stream::kind::eager).submit(pipeline).wait();
// copy back diff sacle/shift to output tensors (diff scale/shift) // copy back diff sacle/shift to output tensors (diff scale/shift)
...@@ -338,12 +502,6 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -338,12 +502,6 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
std::copy(it, std::next(it, ic), diff_scale_data); std::copy(it, std::next(it, ic), diff_scale_data);
std::copy(std::next(it, ic), std::end(diff_scaleshift_data), std::copy(std::next(it, ic), std::end(diff_scaleshift_data),
diff_shift_data); diff_shift_data);
// set layout/format of output tensors
diff_x->set_layout(DataLayout::kMKLDNN);
diff_x->set_format((memory::format)diff_src_memory.get_primitive_desc()
.desc()
.data.format);
} }
}; };
} // namespace operators } // namespace operators
......
...@@ -29,6 +29,6 @@ target_assign_op.cu) ...@@ -29,6 +29,6 @@ target_assign_op.cu)
detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc
polygon_box_transform_op.cu) polygon_box_transform_op.cu)
detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc) detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
# Export local libraries to parent #Export local libraries to parent
set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE) set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
struct AppendProposalsFunctor {
LoDTensor *out_;
int64_t offset_;
Tensor *to_add_;
AppendProposalsFunctor(LoDTensor *out, int64_t offset, Tensor *to_add)
: out_(out), offset_(offset), to_add_(to_add) {}
template <typename T>
void operator()() const {
auto *out_data = out_->data<T>();
auto *to_add_data = to_add_->data<T>();
memcpy(out_data + offset_, to_add_data, to_add_->numel() * sizeof(T));
}
};
class GenerateProposalsOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Scores"), "Input(Scores) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput("BboxDeltas"),
"Input(BboxDeltas) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput("ImInfo"), "Input(ImInfo) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput("Anchors"),
"Input(Anchors) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput("Variances"),
"Input(Variances) shouldn't be null.");
auto scores_dims = ctx->GetInputDim("Scores");
auto bbox_deltas_dims = ctx->GetInputDim("BboxDeltas");
auto im_info_dims = ctx->GetInputDim("ImInfo");
auto anchors_dims = ctx->GetInputDim("Anchors");
auto variances_dims = ctx->GetInputDim("Variances");
ctx->SetOutputDim("RpnRois", {-1, 4});
ctx->SetOutputDim("RpnRoiProbs", {-1, 1});
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Anchors")->type()),
platform::CPUPlace());
}
};
template <class T>
void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
Tensor *bbox_deltas, Tensor *variances, Tensor *proposals) {
T *proposals_data = proposals->mutable_data<T>(ctx.GetPlace());
int64_t row = all_anchors->dims()[0];
int64_t len = all_anchors->dims()[1];
auto *bbox_deltas_data = bbox_deltas->data<T>();
auto *anchor_data = all_anchors->data<T>();
const T *variances_data = nullptr;
if (variances) {
variances_data = variances->data<T>();
}
for (int64_t i = 0; i < row; ++i) {
T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len];
T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1];
T anchor_center_x = (anchor_data[i * len + 2] + anchor_data[i * len]) / 2;
T anchor_center_y =
(anchor_data[i * len + 3] + anchor_data[i * len + 1]) / 2;
T bbox_center_x = 0, bbox_center_y = 0;
T bbox_width = 0, bbox_height = 0;
if (variances) {
bbox_center_x =
variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width +
anchor_center_x;
bbox_center_y = variances_data[i * len + 1] *
bbox_deltas_data[i * len + 1] * anchor_height +
anchor_center_y;
bbox_width = std::exp(variances_data[i * len + 2] *
bbox_deltas_data[i * len + 2]) *
anchor_width;
bbox_height = std::exp(variances_data[i * len + 3] *
bbox_deltas_data[i * len + 3]) *
anchor_height;
} else {
bbox_center_x =
bbox_deltas_data[i * len] * anchor_width + anchor_center_x;
bbox_center_y =
bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y;
bbox_width = std::exp(bbox_deltas_data[i * len + 2]) * anchor_width;
bbox_height = std::exp(bbox_deltas_data[i * len + 3]) * anchor_height;
}
proposals_data[i * len] = bbox_center_x - bbox_width / 2;
proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2;
proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2;
proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2;
}
// return proposals;
}
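// A minimal standalone sketch (not part of this operator) of the same
// anchor-delta decoding as BoxCoder above, for a single box without variances.
// All numbers are hypothetical example values.
#include <cmath>
#include <cstdio>
int main() {
  double anchor[4] = {0.0, 0.0, 15.0, 15.0};  // [xmin, ymin, xmax, ymax]
  double delta[4] = {0.1, -0.2, 0.3, 0.0};    // [dx, dy, dw, dh]
  double w = anchor[2] - anchor[0], h = anchor[3] - anchor[1];
  double cx = (anchor[2] + anchor[0]) / 2, cy = (anchor[3] + anchor[1]) / 2;
  // shift the center and rescale the size, as in the no-variance branch above
  double pcx = delta[0] * w + cx, pcy = delta[1] * h + cy;
  double pw = std::exp(delta[2]) * w, ph = std::exp(delta[3]) * h;
  // back to corner form: [xmin, ymin, xmax, ymax]
  std::printf("[%f, %f, %f, %f]\n", pcx - pw / 2, pcy - ph / 2, pcx + pw / 2,
              pcy + ph / 2);
  return 0;
}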
template <class T>
void ClipTiledBoxes(const platform::DeviceContext &ctx, const Tensor &im_info,
Tensor *boxes) {
T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace());
const T *im_info_data = im_info.data<T>();
for (int64_t i = 0; i < boxes->numel(); ++i) {
if (i % 4 == 0) {
boxes_data[i] =
std::max(std::min(boxes_data[i], im_info_data[1] - 1), 0.0f);
} else if (i % 4 == 1) {
boxes_data[i] =
std::max(std::min(boxes_data[i], im_info_data[0] - 1), 0.0f);
} else if (i % 4 == 2) {
boxes_data[i] =
std::max(std::min(boxes_data[i], im_info_data[1] - 1), 0.0f);
} else {
boxes_data[i] =
std::max(std::min(boxes_data[i], im_info_data[0] - 1), 0.0f);
}
}
}
template <class T>
void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes,
float min_size, const Tensor &im_info, Tensor *keep) {
const T *im_info_data = im_info.data<T>();
T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace());
min_size *= im_info_data[2];
keep->Resize({boxes->dims()[0], 1});
int *keep_data = keep->mutable_data<int>(ctx.GetPlace());
int keep_len = 0;
for (int i = 0; i < boxes->dims()[0]; ++i) {
T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1;
T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1;
T x_ctr = boxes_data[4 * i] + ws / 2;
T y_ctr = boxes_data[4 * i + 1] + hs / 2;
if (ws >= min_size && hs >= min_size && x_ctr <= im_info_data[1] &&
y_ctr <= im_info_data[0]) {
keep_data[keep_len++] = i;
}
}
keep->Resize({keep_len});
}
bool SortScorePairDescend(const std::pair<float, int> &pair1,
const std::pair<float, int> &pair2) {
return pair1.first > pair2.first;
}
template <class T>
void GetMaxScoreIndex(const std::vector<T> &scores,
std::vector<std::pair<T, int>> *sorted_indices) {
for (size_t i = 0; i < scores.size(); ++i) {
sorted_indices->push_back(std::make_pair(scores[i], i));
}
// Sort the score pair according to the scores in descending order
std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
SortScorePairDescend);
}
template <class T>
T BBoxArea(const T *box, const bool normalized) {
if (box[2] < box[0] || box[3] < box[1]) {
    // If coordinate values are invalid
// (e.g. xmax < xmin or ymax < ymin), return 0.
return static_cast<T>(0.);
} else {
const T w = box[2] - box[0];
const T h = box[3] - box[1];
if (normalized) {
return w * h;
} else {
// If coordinate values are not within range [0, 1].
return (w + 1) * (h + 1);
}
}
}
template <class T>
T JaccardOverlap(const T *box1, const T *box2, const bool normalized) {
if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
box2[3] < box1[1]) {
return static_cast<T>(0.);
} else {
const T inter_xmin = std::max(box1[0], box2[0]);
const T inter_ymin = std::max(box1[1], box2[1]);
const T inter_xmax = std::min(box1[2], box2[2]);
const T inter_ymax = std::min(box1[3], box2[3]);
const T inter_w = inter_xmax - inter_xmin;
const T inter_h = inter_ymax - inter_ymin;
const T inter_area = inter_w * inter_h;
const T bbox1_area = BBoxArea<T>(box1, normalized);
const T bbox2_area = BBoxArea<T>(box2, normalized);
return inter_area / (bbox1_area + bbox2_area - inter_area);
}
}
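// For intuition, a self-contained IoU check mirroring the normalized branch of
// JaccardOverlap above (plain width * height areas); the boxes are hypothetical.
#include <algorithm>
#include <cstdio>
double Area(const double* b) { return (b[2] - b[0]) * (b[3] - b[1]); }
double IoU(const double* b1, const double* b2) {
  double xmin = std::max(b1[0], b2[0]), ymin = std::max(b1[1], b2[1]);
  double xmax = std::min(b1[2], b2[2]), ymax = std::min(b1[3], b2[3]);
  double iw = std::max(xmax - xmin, 0.0);  // clamp: disjoint boxes give 0
  double ih = std::max(ymax - ymin, 0.0);
  double inter = iw * ih;
  return inter / (Area(b1) + Area(b2) - inter);
}
int main() {
  double a[4] = {0, 0, 10, 10};
  double b[4] = {5, 5, 15, 15};
  std::printf("IoU = %f\n", IoU(a, b));  // 25 / (100 + 100 - 25) = 0.142857
  return 0;
}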
template <class T>
Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, Tensor *scores,
const T nms_threshold, const float eta) {
PADDLE_ENFORCE_NOT_NULL(bbox);
int64_t num_boxes = bbox->dims()[0];
// 4: [xmin ymin xmax ymax]
int64_t box_size = bbox->dims()[1];
std::vector<T> scores_data(num_boxes);
std::copy_n(scores->data<T>(), num_boxes, scores_data.begin());
std::vector<std::pair<T, int>> sorted_indices;
GetMaxScoreIndex<T>(scores_data, &sorted_indices);
std::vector<int> selected_indices;
int selected_num = 0;
T adaptive_threshold = nms_threshold;
const T *bbox_data = bbox->data<T>();
bool flag;
while (sorted_indices.size() != 0) {
int idx = sorted_indices.front().second;
flag = true;
for (size_t k = 0; k < selected_indices.size(); ++k) {
if (flag) {
const int kept_idx = selected_indices[k];
T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
bbox_data + kept_idx * box_size, false);
flag = (overlap <= adaptive_threshold);
} else {
break;
}
}
if (flag) {
selected_indices.push_back(idx);
selected_num++;
}
sorted_indices.erase(sorted_indices.begin());
if (flag && eta < 1 && adaptive_threshold > 0.5) {
adaptive_threshold *= eta;
}
}
Tensor keep_nms;
keep_nms.Resize({selected_num});
int *keep_data = keep_nms.mutable_data<int>(ctx.GetPlace());
for (int i = 0; i < selected_num; ++i) {
keep_data[i] = selected_indices[i];
}
return keep_nms;
}
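// The loop above is greedy NMS with an adaptive threshold: boxes are visited in
// descending score order, a box is kept only if its overlap with every already
// kept box is at most the threshold, and the threshold decays when eta < 1.
// A compact standalone sketch over a hypothetical precomputed overlap matrix:
#include <cstdio>
#include <vector>
int main() {
  // boxes are assumed to be sorted by score; overlap[i][j] is a made-up IoU
  std::vector<std::vector<double>> overlap = {
      {1.0, 0.8, 0.1}, {0.8, 1.0, 0.2}, {0.1, 0.2, 1.0}};
  double thresh = 0.7, eta = 0.9;
  std::vector<int> keep;
  for (int i = 0; i < static_cast<int>(overlap.size()); ++i) {
    bool ok = true;
    for (int j : keep) ok = ok && (overlap[i][j] <= thresh);
    if (ok) {
      keep.push_back(i);
      if (eta < 1 && thresh > 0.5) thresh *= eta;  // adaptive decay
    }
  }
  for (int i : keep) std::printf("keep box %d\n", i);  // keeps boxes 0 and 2
  return 0;
}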
template <typename DeviceContext, typename T>
class GenerateProposalsKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto *scores = context.Input<Tensor>("Scores");
auto *bbox_deltas = context.Input<Tensor>("BboxDeltas");
auto *im_info = context.Input<Tensor>("ImInfo");
auto *anchors = context.Input<Tensor>("Anchors");
auto *variances = context.Input<Tensor>("Variances");
auto *rpn_rois = context.Output<LoDTensor>("RpnRois");
auto *rpn_roi_probs = context.Output<LoDTensor>("RpnRoiProbs");
int pre_nms_top_n = context.Attr<int>("pre_nms_topN");
int post_nms_top_n = context.Attr<int>("post_nms_topN");
float nms_thresh = context.Attr<float>("nms_thresh");
float min_size = context.Attr<float>("min_size");
float eta = context.Attr<float>("eta");
auto &dev_ctx = context.template device_context<DeviceContext>();
auto scores_dim = scores->dims();
int64_t num = scores_dim[0];
int64_t c_score = scores_dim[1];
int64_t h_score = scores_dim[2];
int64_t w_score = scores_dim[3];
auto bbox_dim = bbox_deltas->dims();
int64_t c_bbox = bbox_dim[1];
int64_t h_bbox = bbox_dim[2];
int64_t w_bbox = bbox_dim[3];
rpn_rois->mutable_data<T>({bbox_deltas->numel() / 4, 4},
context.GetPlace());
rpn_roi_probs->mutable_data<T>({scores->numel() / 4, 1},
context.GetPlace());
Tensor bbox_deltas_swap, scores_swap;
bbox_deltas_swap.mutable_data<T>({num, h_bbox, w_bbox, c_bbox},
dev_ctx.GetPlace());
scores_swap.mutable_data<T>({num, h_score, w_score, c_score},
dev_ctx.GetPlace());
math::Transpose<DeviceContext, T, 4> trans;
std::vector<int> axis = {0, 2, 3, 1};
trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis);
trans(dev_ctx, *scores, &scores_swap, axis);
framework::LoD lod;
std::vector<size_t> lod0(1, 0);
Tensor *anchor = const_cast<framework::Tensor *>(anchors);
anchor->Resize({anchors->numel() / 4, 4});
Tensor *var = const_cast<framework::Tensor *>(variances);
var->Resize({var->numel() / 4, 4});
int64_t num_proposals = 0;
for (int64_t i = 0; i < num; ++i) {
Tensor im_info_slice = im_info->Slice(i, i + 1);
Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1);
Tensor scores_slice = scores_swap.Slice(i, i + 1);
bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4});
scores_slice.Resize({h_score * w_score * c_score, 1});
std::pair<Tensor, Tensor> tensor_pair =
ProposalForOneImage(dev_ctx, im_info_slice, *anchor, *var,
bbox_deltas_slice, scores_slice, pre_nms_top_n,
post_nms_top_n, nms_thresh, min_size, eta);
Tensor proposals = tensor_pair.first;
Tensor scores = tensor_pair.second;
framework::VisitDataType(
framework::ToDataType(rpn_rois->type()),
AppendProposalsFunctor(rpn_rois, 4 * num_proposals, &proposals));
framework::VisitDataType(
framework::ToDataType(rpn_roi_probs->type()),
AppendProposalsFunctor(rpn_roi_probs, num_proposals, &scores));
num_proposals += proposals.dims()[0];
lod0.emplace_back(num_proposals);
}
lod.emplace_back(lod0);
rpn_rois->set_lod(lod);
rpn_roi_probs->set_lod(lod);
rpn_rois->Resize({num_proposals, 4});
rpn_roi_probs->Resize({num_proposals, 1});
}
std::pair<Tensor, Tensor> ProposalForOneImage(
const DeviceContext &ctx, const Tensor &im_info_slice,
const Tensor &anchors, const Tensor &variances,
const Tensor &bbox_deltas_slice, // [M, 4]
const Tensor &scores_slice, // [N, 1]
int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size,
float eta) const {
auto *scores_data = scores_slice.data<T>();
// Sort index
Tensor index_t;
index_t.Resize({scores_slice.numel()});
int *index = index_t.mutable_data<int>(ctx.GetPlace());
for (int i = 0; i < scores_slice.numel(); ++i) {
index[i] = i;
}
std::function<bool(const int64_t &, const int64_t &)> compare =
[scores_data](const int64_t &i, const int64_t &j) {
return scores_data[i] > scores_data[j];
};
if (pre_nms_top_n <= 0 || pre_nms_top_n >= scores_slice.numel()) {
std::sort(index, index + scores_slice.numel(), compare);
} else {
std::nth_element(index, index + pre_nms_top_n,
index + scores_slice.numel(), compare);
index_t.Resize({pre_nms_top_n});
}
Tensor scores_sel, bbox_sel, anchor_sel, var_sel;
scores_sel.mutable_data<T>({index_t.numel(), 1}, ctx.GetPlace());
bbox_sel.mutable_data<T>({index_t.numel(), 4}, ctx.GetPlace());
anchor_sel.mutable_data<T>({index_t.numel(), 4}, ctx.GetPlace());
var_sel.mutable_data<T>({index_t.numel(), 4}, ctx.GetPlace());
CPUGather<T>(ctx, scores_slice, index_t, &scores_sel);
CPUGather<T>(ctx, bbox_deltas_slice, index_t, &bbox_sel);
CPUGather<T>(ctx, anchors, index_t, &anchor_sel);
CPUGather<T>(ctx, variances, index_t, &var_sel);
Tensor proposals;
proposals.mutable_data<T>({index_t.numel(), 4}, ctx.GetPlace());
BoxCoder<T>(ctx, &anchor_sel, &bbox_sel, &var_sel, &proposals);
ClipTiledBoxes<T>(ctx, im_info_slice, &proposals);
Tensor keep;
FilterBoxes<T>(ctx, &proposals, min_size, im_info_slice, &keep);
Tensor scores_filter;
bbox_sel.mutable_data<T>({keep.numel(), 4}, ctx.GetPlace());
scores_filter.mutable_data<T>({keep.numel(), 1}, ctx.GetPlace());
CPUGather<T>(ctx, proposals, keep, &bbox_sel);
CPUGather<T>(ctx, scores_sel, keep, &scores_filter);
if (nms_thresh <= 0) {
return std::make_pair(bbox_sel, scores_sel);
}
Tensor keep_nms = NMS<T>(ctx, &bbox_sel, &scores_filter, nms_thresh, eta);
if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) {
keep_nms.Resize({post_nms_top_n});
}
proposals.mutable_data<T>({keep_nms.numel(), 4}, ctx.GetPlace());
scores_sel.mutable_data<T>({keep_nms.numel(), 1}, ctx.GetPlace());
CPUGather<T>(ctx, bbox_sel, keep_nms, &proposals);
CPUGather<T>(ctx, scores_filter, keep_nms, &scores_sel);
return std::make_pair(proposals, scores_sel);
}
};
class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Scores", "The scores of anchors should be foreground.");
AddInput("BboxDeltas", "bbox_deltas.");
AddInput("ImInfo", "Information for image reshape.");
AddInput("Anchors", "All anchors.");
AddInput("Variances", " variances");
AddOutput("RpnRois", "Anchors.");
AddOutput("RpnRoiProbs", "Anchors.");
AddAttr<int>("pre_nms_topN", "pre_nms_topN");
AddAttr<int>("post_nms_topN", "post_nms_topN");
AddAttr<float>("nms_thresh", "nms_thres");
AddAttr<float>("min_size", "min size");
AddAttr<float>("eta", "eta");
AddComment(R"DOC(
Generate Proposals OP
This operator proposes RoIs according to each box's probability of being a foreground object; the boxes
are computed from the anchors. BboxDeltas and Scores are the outputs of the RPN, and the final proposals
can be used to train the detection network.
Scores is the probability of each box being an object, in the format (N, A, H, W), where N is the batch size,
A is the number of anchors, and H and W are the height and width of the feature map.
BboxDeltas is the difference between the predicted box locations and the anchor locations, in the format (N, 4*A, H, W).
To generate proposals, this operator transposes and reshapes scores and bbox_deltas to the shapes (H*W*A, 1) and (H*W*A, 4),
and calculates the box locations as proposal candidates. It then clips the boxes to the image, removes predicted
boxes with small area, and finally applies NMS to produce the final proposals as output.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(generate_proposals, ops::GenerateProposalsOp,
ops::GenerateProposalsOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(
generate_proposals,
ops::GenerateProposalsKernel<paddle::platform::CPUDeviceContext, float>);
...@@ -18,15 +18,32 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename T>
struct DequantizeFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& dev_ctx,
const framework::Tensor* in, const framework::Tensor* scale,
T max_range, framework::Tensor* out) {
auto in_e = framework::EigenVector<T>::Flatten(*in);
const T* scale_factor = scale->data<T>();
auto out_e = framework::EigenVector<T>::Flatten(*out);
auto& dev = *dev_ctx.eigen_device();
out_e.device(dev) = (scale_factor[0] / max_range) * in_e;
}
};
template struct DequantizeFunctor<platform::CPUDeviceContext, float>;
template struct DequantizeFunctor<platform::CPUDeviceContext, double>;
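// The functor above simply rescales the quantized values by scale / max_range.
// A minimal numeric sketch (hypothetical 8-bit style values held in floats):
#include <cstdio>
int main() {
  const float max_range = 127.0f;  // e.g. 2^7 - 1 for 8-bit quantization
  const float scale = 0.5f;        // max |value| recorded at quantization time
  const float in[4] = {127.0f, -64.0f, 32.0f, 0.0f};
  for (float v : in) {
    std::printf("%f -> %f\n", v, v * scale / max_range);  // out = in * scale / max_range
  }
  return 0;
}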
class FakeDequantizeMaxAbsOp : public framework::OperatorWithKernel {
 public:
  FakeDequantizeMaxAbsOp(const std::string& type,
                         const framework::VariableNameMap& inputs,
                         const framework::VariableNameMap& outputs,
                         const framework::AttributeMap& attrs)
      : OperatorWithKernel(type, inputs, outputs, attrs) {}
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of FakeDequantizeMaxAbsOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
...@@ -42,21 +59,17 @@ class FakeDequantizeMaxAbsOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X",
             "(Tensor) The input with float-32/64 type is the "
             "low precision tensor.");
AddInput("Scale", "(float) The scale in quantization stage.");
AddOutput("Out", AddOutput("Out",
"(Tensor) The output is the dequantized high " "(Tensor) The output is the dequantized high "
"precision tensor."); "precision tensor.");
AddAttr<int>("num_bits", AddAttr<float>("max_range", "(float) The max range in quantization stage.");
"(int) `num_bits` is the quantization level bits, "
"such as 2, 5, 8.");
AddAttr<float>("scale",
"(float) The maximum absolute value of low precision tensor."
"It is usually calculated by the fake_quantize_max_abs_op.");
AddComment(R"DOC( AddComment(R"DOC(
FakeDequantizeMaxAbsOp operator. FakeDequantizeMaxAbsOp operator.
This calculation is an opposite operation of FakeQuantizeMaxAbsOp: This calculation is an opposite operation of FakeQuantizeMaxAbsOp:
$$Out = \frac{scale*X}{2^{num_bits} - 1}$$ $$Out = \frac{scale*X}{ max_range }$$
)DOC"); )DOC");
} }
......
...@@ -14,6 +14,42 @@ limitations under the License. */
#include "paddle/fluid/operators/fake_dequantize_op.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void KeDequantize(const T* in, const T* scale, T max_range, int num,
T* out) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < num) {
out[idx] = in[idx] * scale[0] / max_range;
}
}
template <typename T>
struct DequantizeFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& dev_ctx,
const framework::Tensor* in, const framework::Tensor* scale,
T max_range, framework::Tensor* out) {
const T* in_data = in->data<T>();
const T* scale_factor = scale->data<T>();
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int num = in->numel();
int block = 512;
int grid = (num + block - 1) / block;
KeDequantize<T><<<grid, block, 0, dev_ctx.stream()>>>(
in_data, scale_factor, max_range, num, out_data);
}
};
template struct DequantizeFunctor<platform::CUDADeviceContext, float>;
template struct DequantizeFunctor<platform::CUDADeviceContext, double>;
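// The launch configuration above is a plain 1-D grid: one thread per element and
// grid = ceil(num / block). A quick host-side sketch of that arithmetic
// (hypothetical sizes):
#include <cstdio>
int main() {
  const int block = 512;
  for (int num : {1, 512, 513, 100000}) {
    int grid = (num + block - 1) / block;  // integer ceiling of num / block
    std::printf("num=%d -> grid=%d blocks of %d threads\n", num, grid, block);
  }
  return 0;
}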
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(fake_dequantize_max_abs,
...
...@@ -19,22 +19,29 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
struct DequantizeFunctor {
void operator()(const DeviceContext& dev_ctx, const framework::Tensor* in,
const framework::Tensor* scale, T max_range,
framework::Tensor* out);
};
template <typename DeviceContext, typename T>
class FakeDequantizeMaxAbsKernel : public framework::OpKernel<T> {
 public:
  virtual void Compute(const framework::ExecutionContext& ctx) const {
    auto* in = ctx.Input<framework::Tensor>("X");
    auto* scale = ctx.Input<framework::Tensor>("Scale");
    auto* out = ctx.Output<framework::Tensor>("Out");
out->mutable_data<T>(in->place());
    float max_range = ctx.Attr<float>("max_range");
T scale = static_cast<T>(ctx.Attr<float>("scale"));
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    out->mutable_data<T>(dev_ctx.GetPlace());
    DequantizeFunctor<DeviceContext, T>()(dev_ctx, in, scale,
                                          static_cast<T>(max_range), out);
auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
eigen_out.device(dev) = (scale / range) * eigen_in;
  }
};
...
...@@ -52,6 +52,8 @@ class FetchBarrierOp : public framework::OperatorBase {
class FetchBarrierOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() {
AddOutput("Out", "(Any) Dummy outputs, used for control dependency")
.AsDuplicable();
AddComment(R"DOC( AddComment(R"DOC(
SendBarrier operator SendBarrier operator
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fusion_gru_op.h"
#include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h"
#include "paddle/fluid/operators/math/detail/gru_kernel.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/gru_compute.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
namespace paddle {
namespace operators {
void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasInput("WeightX"),
"Input(WeightX) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasInput("WeightH"),
"Input(WeightH) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("XX"), "Output(XX) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedGate"),
"Output(BatchedGate) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchResetHiddenPrev"),
"Output(BatchResetHiddenPrev) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"),
"Output(BatchedHidden) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
"Output(Hidden) of GRU should not be null.");
auto x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
auto wx_dims = ctx->GetInputDim("WeightX");
PADDLE_ENFORCE_EQ(wx_dims.size(), 2,
"The rank of Input(WeightX) should be 2.");
PADDLE_ENFORCE_EQ(wx_dims[0], x_dims[1],
"The first dimension of Input(WeightX) "
"should be %d.",
x_dims[1]);
int frame_size = wx_dims[1] / 3;
auto wh_dims = ctx->GetInputDim("WeightH");
PADDLE_ENFORCE_EQ(wh_dims.size(), 2,
"The rank of Input(WeightH) should be 2.");
PADDLE_ENFORCE_EQ(wh_dims[0], frame_size,
"The first dimension of Input(WeightH) "
"should be %d.",
frame_size);
PADDLE_ENFORCE_EQ(wh_dims[1], 3 * frame_size,
"The second dimension of Input(WeightH) "
"should be 3 * %d.",
frame_size);
if (ctx->HasInput("H0")) {
auto h0_dims = ctx->GetInputDim("H0");
PADDLE_ENFORCE_EQ(h0_dims[1], frame_size,
"The width of H0 must be equal to frame_size.");
}
if (ctx->HasInput("Bias")) {
auto b_dims = ctx->GetInputDim("Bias");
PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
PADDLE_ENFORCE_EQ(b_dims[0], 1,
"The first dimension of Input(Bias) should be 1.");
PADDLE_ENFORCE_EQ(b_dims[1], frame_size * 3,
"The shape of Bias must be [1, frame_size * 3].");
}
framework::DDim out_dims({x_dims[0], frame_size});
ctx->SetOutputDim("Hidden", out_dims);
ctx->SetOutputDim("BatchedGate", {x_dims[0], wx_dims[1]});
ctx->SetOutputDim("BatchedHidden", out_dims);
ctx->SetOutputDim("BatchResetHiddenPrev", out_dims);
ctx->ShareLoD("X", "Hidden");
int xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
ctx->SetOutputDim("XX", {x_dims[0], xx_width});
ctx->ShareLoD("X", "XX");
}
framework::OpKernelType FusionGRUOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
ctx.device_context());
}
void FusionGRUOpMaker::Make() {
AddInput("X",
"(LoDTensor) the input is a LodTensor, which support "
"variable-time length input sequence. The underlying tensor in "
"this LoDTensor is a matrix with shape (T X M), where T is the "
"total time steps in this mini-batch, M is the dim size of x.");
AddInput("H0",
"(Tensor, optional) The initial hidden state is an optional "
"input. This is a tensor with shape (N x D), where N is the "
"batch size, D is the hidden size.")
.AsDispensable();
AddInput("WeightX",
"(Tensor) The FC weight with shape (M x 3D),"
"where M is the dim size of x, D is the hidden size. ");
AddInput("WeightH",
"(Tensor) (D x 3D) Same as GRUOp, where D is the hidden size. ");
AddInput("Bias",
"(Tensor, optional) (1 x 3D)."
"Almost same as GRUOp."
"Note: if have FC bias it should be added on this bias.")
.AsDispensable();
AddOutput("XX",
"(LoDTensor) the result after X * WeightX (size is T x 4D)"
" or batched_X (size is T x M), this will be automatically chosen,"
" where T is the total time steps in this mini-batch,"
" D is the hidden size, M is the dim size of x input.")
.AsIntermediate();
AddOutput("BatchedGate", "(LoDTensor) Same as GRUOp").AsIntermediate();
AddOutput("BatchResetHiddenPrev", "(LoDTensor) (T x 3D) Same as GRUOp.")
.AsIntermediate();
AddOutput("BatchedHidden", "(LoDTensor) (T X D) Same as GRUOp.")
.AsIntermediate();
AddOutput("Hidden", "(LoDTensor) (T x D) Same as GRUOp");
AddAttr<std::string>("activation",
"(string, default tanh) "
"The activation type used for output candidate {h}_t.")
.SetDefault("tanh");
AddAttr<std::string>(
"gate_activation",
"(string, default sigmoid) "
"The activation type used in update gate and reset gate.")
.SetDefault("sigmoid");
AddAttr<bool>("is_reverse",
"(bool, defalut: False) "
"whether to compute reversed GRU.")
.SetDefault(false);
AddComment(R"DOC(
The Fusion complete GRU Operator.
This operator fuses the fully-connected operator into the GRU operator;
for more details, refer to the GRU op.
)DOC");
}
template <typename DeviceContext, typename T>
inline void ReorderInitState(const DeviceContext& ctx,
const framework::Tensor& src,
framework::Vector<size_t> index_lod,
framework::Tensor* dst, bool indexed_src) {
math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
dst->mutable_data<T>(src.dims(), ctx.GetPlace());
row_shuffle(ctx, src, index_lod, dst, indexed_src);
}
template <typename DeviceContext, typename T>
class FusionGRUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<LoDTensor>("X");
auto* wx = ctx.Input<Tensor>("WeightX");
auto* wh = ctx.Input<Tensor>("WeightH");
auto* bias = ctx.Input<Tensor>("Bias");
auto* h0 = ctx.Input<Tensor>("H0");
auto* xx = ctx.Output<LoDTensor>("XX");
auto* batched_gate = ctx.Output<LoDTensor>("BatchedGate");
auto* batch_reset_hidden_prev =
ctx.Output<LoDTensor>("BatchResetHiddenPrev");
auto* batch_hidden = ctx.Output<LoDTensor>("BatchedHidden");
auto* hidden_out = ctx.Output<LoDTensor>("Hidden");
bool is_reverse = ctx.Attr<bool>("is_reverse");
T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
T* batched_gate_data = batched_gate->mutable_data<T>(ctx.GetPlace());
batch_reset_hidden_prev->mutable_data<T>(ctx.GetPlace());
batch_hidden->mutable_data<T>(ctx.GetPlace());
hidden_out->mutable_data<T>(ctx.GetPlace());
const T* x_data = x->data<T>();
const T* wx_data = wx->data<T>();
const T* wh_data = wh->data<T>();
auto x_dims = x->dims();
auto wx_dims = wx->dims();
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
if (x_dims[1] > wx_dims[1]) {
math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1],
x_data, wx_data, xx_data,
bias ? bias->data<T>() : NULL);
to_batch(dev_ctx, *xx, batched_gate, true, is_reverse);
} else {
to_batch(dev_ctx, *x, xx, true, is_reverse);
batched_gate->set_lod(xx->lod());
math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1],
xx_data, wx_data, batched_gate_data,
bias ? bias->data<T>() : NULL);
}
int frame_size = static_cast<int>(wx_dims[1] / 3);
math::GRUMetaValue<T> gru_value;
gru_value.gate_weight = const_cast<T*>(wh_data);
gru_value.state_weight =
const_cast<T*>(wh_data + 2 * frame_size * frame_size);
Tensor ordered_h0;
framework::Vector<size_t> order(batched_gate->lod()[2]);
if (h0) {
ReorderInitState<DeviceContext, T>(
ctx.template device_context<DeviceContext>(), *h0, order, &ordered_h0,
true);
gru_value.prev_out_value = ordered_h0.data<T>();
} else {
gru_value.prev_out_value = nullptr;
}
auto batch_starts = batched_gate->lod()[0];
size_t seq_len = batch_starts.size() - 1;
auto active_node =
math::detail::GetActivationType(ctx.Attr<std::string>("activation"));
auto active_gate = math::detail::GetActivationType(
ctx.Attr<std::string>("gate_activation"));
#ifdef PADDLE_WITH_MKLML
// use MKL packed to speedup GEMM
if (FLAGS_paddle_num_threads >= 4) {
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
T* packed_gate = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/,
frame_size * 2 /*width of weight*/,
frame_size /*height of height*/);
PADDLE_ENFORCE(packed_gate);
blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size * 2,
frame_size, T(1.0), gru_value.gate_weight, frame_size * 2,
packed_gate);
T* packed_state = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/,
frame_size /*width of weight*/,
frame_size /*height of height*/);
PADDLE_ENFORCE(packed_state);
blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size,
frame_size, T(1.0), gru_value.state_weight, frame_size,
packed_state);
for (size_t n = 0; n < seq_len; n++) {
int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]);
int cur_batch_size = bend - bstart;
Tensor gate_t = batched_gate->Slice(bstart, bend);
Tensor reset_hidden_prev_t =
batch_reset_hidden_prev->Slice(bstart, bend);
Tensor hidden_t = batch_hidden->Slice(bstart, bend);
gru_value.output_value = hidden_t.data<T>();
gru_value.gate_value = gate_t.data<T>();
gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
if (gru_value.prev_out_value) {
blas.GEMM_COMPUTE(
CblasNoTrans, CblasPacked, cur_batch_size, frame_size * 2,
frame_size, gru_value.prev_out_value, frame_size, packed_gate,
frame_size * 2, T(1), gru_value.gate_value, frame_size * 3);
}
math::detail::forward_reset_output(
math::detail::forward::gru_resetOutput<T>(), gru_value, frame_size,
cur_batch_size, active_gate);
if (gru_value.prev_out_value) {
blas.GEMM_COMPUTE(
CblasNoTrans, CblasPacked, cur_batch_size, frame_size, frame_size,
gru_value.reset_output_value, frame_size, packed_state,
frame_size, T(1), gru_value.gate_value + frame_size * 2,
frame_size * 3);
}
math::detail::forward_final_output(
math::detail::forward::gru_finalOutput<T>(), gru_value, frame_size,
cur_batch_size, active_node);
gru_value.prev_out_value = gru_value.output_value;
}
blas.GEMM_FREE(packed_gate);
blas.GEMM_FREE(packed_state);
} else {
#endif
for (size_t n = 0; n < seq_len; n++) {
int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]);
int cur_batch_size = bend - bstart;
Tensor gate_t = batched_gate->Slice(bstart, bend);
Tensor reset_hidden_prev_t =
batch_reset_hidden_prev->Slice(bstart, bend);
Tensor hidden_t = batch_hidden->Slice(bstart, bend);
gru_value.output_value = hidden_t.data<T>();
gru_value.gate_value = gate_t.data<T>();
gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
math::GRUUnitFunctor<DeviceContext, T>::compute(
dev_ctx, gru_value, frame_size, cur_batch_size, active_node,
active_gate);
gru_value.prev_out_value = gru_value.output_value;
}
#ifdef PADDLE_WITH_MKLML
}
#endif
math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
batch_hidden->set_lod(batched_gate->lod());
to_seq(dev_ctx, *batch_hidden, hidden_out);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(fusion_gru, ops::FusionGRUOp, ops::FusionGRUOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OP_CPU_KERNEL(
fusion_gru, ops::FusionGRUKernel<paddle::platform::CPUDeviceContext, float>,
ops::FusionGRUKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using LoDTensor = framework::LoDTensor;
using Tensor = framework::Tensor;
class FusionGRUOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override;
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override;
};
class FusionGRUOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override;
};
} // namespace operators
} // namespace paddle
...@@ -15,10 +15,14 @@ limitations under the License. */
#include "paddle/fluid/operators/fusion_lstm_op.h"
#include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/fc_compute.h" #include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/lstm_compute.h" #include "paddle/fluid/operators/math/lstm_compute.h"
#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/platform/cpu_info.h"
DEFINE_bool(seq_mode, true, "Use sequence mode");
namespace paddle {
namespace operators {
...@@ -98,7 +102,12 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
  ctx->ShareLoD("X", "Hidden");
  ctx->ShareLoD("X", "Cell");
  int xx_width;
if (FLAGS_seq_mode) {
xx_width = wx_dims[1];
} else {
xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
}
ctx->SetOutputDim("XX", {x_dims[0], xx_width}); ctx->SetOutputDim("XX", {x_dims[0], xx_width});
ctx->ShareLoD("X", "XX"); ctx->ShareLoD("X", "XX");
} }
...@@ -205,10 +214,138 @@ inline void ReorderInitState(const DeviceContext& ctx, ...@@ -205,10 +214,138 @@ inline void ReorderInitState(const DeviceContext& ctx,
row_shuffle(ctx, src, index_lod, dst, indexed_src); row_shuffle(ctx, src, index_lod, dst, indexed_src);
} }
template <typename DeviceContext, typename T> template <typename T>
class FuisonLSTMKernel : public framework::OpKernel<T> { class FuisonLSTMKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void SeqCompute(const framework::ExecutionContext& ctx) const {
using DeviceContext = paddle::platform::CPUDeviceContext;
auto* x = ctx.Input<LoDTensor>("X");
auto* h0 = ctx.Input<Tensor>("H0");
auto* c0 = ctx.Input<Tensor>("C0");
auto* wx = ctx.Input<Tensor>("WeightX");
auto* wh = ctx.Input<Tensor>("WeightH");
auto* bias = ctx.Input<Tensor>("Bias");
auto* xx = ctx.Output<LoDTensor>("XX");
auto* hidden_out = ctx.Output<LoDTensor>("Hidden");
auto* cell_out = ctx.Output<LoDTensor>("Cell");
bool is_reverse = ctx.Attr<bool>("is_reverse");
std::function<void(const int, const T *, T *)> act_gate, act_cell, act_cand;
auto& act_gate_str = ctx.Attr<std::string>("gate_activation");
auto& act_cell_str = ctx.Attr<std::string>("cell_activation");
auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");
if (platform::jit::MayIUse(platform::jit::avx)) {
math::VecActivations<T, platform::jit::avx> act_functor;
act_gate = act_functor(act_gate_str);
act_cell = act_functor(act_cell_str);
act_cand = act_functor(act_cand_str);
} else {
math::VecActivations<T, platform::jit::isa_any> act_functor;
act_gate = act_functor(act_gate_str);
act_cell = act_functor(act_cell_str);
act_cand = act_functor(act_cand_str);
}
auto x_lod = x->lod();
auto x_dims = x->dims(); // T x M
auto wh_dims = wh->dims(); // D x 4D
const int total_T = x_dims[0];
const int N = x_lod[0].size() - 1; // batch size
const int M = x_dims[1]; // x frame size
const int D = wh_dims[0];
const int D2 = D * 2;
const int D3 = D * 3;
const int D4 = wh_dims[1];
const T* x_data = x->data<T>();
const T* h0_data = h0 ? h0->data<T>() : NULL;
const T* c0_data = c0 ? c0->data<T>() : NULL;
const T* wx_data = wx->data<T>();
const T* wh_data = wh->data<T>();
T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
T* hidden_out_data = hidden_out->mutable_data<T>(ctx.GetPlace());
T* cell_out_data = cell_out->mutable_data<T>(ctx.GetPlace());
auto blas = math::GetBlas<DeviceContext, T>(ctx);
math::FCCompute<DeviceContext, T>(blas, total_T, D4, M, x_data, wx_data,
xx_data, bias->data<T>());
int xx_offset = D4;
int gate_offset = D;
if (is_reverse) {
const int offset = (total_T - 1) * D;
xx_data = xx_data + offset * 4;
hidden_out_data = hidden_out_data + offset;
cell_out_data = cell_out_data + offset;
xx_offset = -D4;
gate_offset = -D;
}
auto move_step = [&]() {
xx_data = xx_data + xx_offset;
hidden_out_data = hidden_out_data + gate_offset;
cell_out_data = cell_out_data + gate_offset;
};
for (int i = 0; i < N; ++i) {
int bid = is_reverse ? N - 1 - i : i;
int seq_len = x_lod[0][bid + 1] - x_lod[0][bid];
const T* prev_cell_data = NULL;
const T* prev_hidden_data = NULL;
int tstart = 0;
if (h0_data) {
prev_hidden_data = h0_data + bid * D;
prev_cell_data = c0_data + bid * D;
} else {
// W_ch, W_ih, W_fh, W_oh
act_gate(D3, xx_data + D, xx_data + D);
act_cand(D, xx_data, xx_data);
// cell out= input*tilde
blas.VMUL(D, xx_data, xx_data + D, cell_out_data);
// hidden out= act_state(cellout) * outgate
act_cell(D, cell_out_data, xx_data + D2);
blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data);
// prev
prev_hidden_data = hidden_out_data;
prev_cell_data = cell_out_data;
tstart = 1;
move_step();
}
for (int step = tstart; step < seq_len; ++step) {
blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D4, D, static_cast<T>(1),
prev_hidden_data, D, wh_data, D4, static_cast<T>(1), xx_data,
D4);
// W_ch, W_ih, W_fh, W_oh
act_gate(D3, xx_data + D, xx_data + D);
act_cand(D, xx_data, xx_data);
// a = forget * prev_cell
blas.VMUL(D, xx_data + D2, prev_cell_data, xx_data + D2);
// b = input * tilde
blas.VMUL(D, xx_data, xx_data + D, xx_data + D);
// cell out= a+b
blas.VADD(D, xx_data + D, xx_data + D2, cell_out_data);
// hidden out= act_state(cellout) * outgate
act_cell(D, cell_out_data, xx_data + D2);
blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data);
// prev
prev_hidden_data = hidden_out_data;
prev_cell_data = cell_out_data;
move_step();
}
}
}
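// Per time step, after the GEMM the update above is purely element-wise:
// c_t = f * c_{t-1} + i * cand and h_t = o * act(c_t), with the buffer laid out
// as [candidate | input | forget | output] like xx_data above. A small
// standalone sketch of that element-wise part (hypothetical pre-activation
// values; sigmoid/tanh stand in for the configurable activations):
#include <cmath>
#include <cstdio>
#include <vector>
int main() {
  const int D = 2;  // hidden size (hypothetical)
  // pre-activation buffer: [candidate | input gate | forget gate | output gate]
  std::vector<double> xx = {0.3, -0.1, 0.5, 0.2, 0.1, 0.4, 0.8, -0.3};
  std::vector<double> prev_c = {0.2, -0.5};
  auto sigmoid = [](double x) { return 1.0 / (1.0 + std::exp(-x)); };
  for (int d = 0; d < D; ++d) {
    double cand = std::tanh(xx[d]);
    double ig = sigmoid(xx[D + d]);
    double fg = sigmoid(xx[2 * D + d]);
    double og = sigmoid(xx[3 * D + d]);
    double c = fg * prev_c[d] + ig * cand;  // new cell state
    double h = og * std::tanh(c);           // new hidden state
    std::printf("c=%f h=%f\n", c, h);
  }
  return 0;
}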
void BatchCompute(const framework::ExecutionContext& ctx) const {
using DeviceContext = platform::CPUDeviceContext;
    auto* x = ctx.Input<LoDTensor>("X");
    auto* wx = ctx.Input<Tensor>("WeightX");
    auto* wh = ctx.Input<Tensor>("WeightH");
...@@ -339,6 +476,13 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
    // restore the output cell state in LoDTensor from the batch cell
    to_seq(dev_ctx, batch_cell, cell_out);
  }
void Compute(const framework::ExecutionContext& ctx) const override {
if (FLAGS_seq_mode) {
SeqCompute(ctx);
} else {
BatchCompute(ctx);
}
}
};
}  // namespace operators
...@@ -348,7 +492,5 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(fusion_lstm, ops::FusionLSTMOp, ops::FusionLSTMOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OP_CPU_KERNEL(fusion_lstm, ops::FuisonLSTMKernel<float>,
                       ops::FuisonLSTMKernel<double>);
ops::FuisonLSTMKernel<paddle::platform::CPUDeviceContext, float>,
ops::FuisonLSTMKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fusion_seqexpand_concat_fc_op.h"
#include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle {
namespace operators {
void FusionSeqExpandConcatFCOp::InferShape(
framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE_GT(
ctx->Inputs("X").size(), 1UL,
"Inputs(X) of FusionSeqExpandConcatFCOp should larger than 1.");
PADDLE_ENFORCE(
ctx->HasInput("FCWeight"),
"Input(FCWeight) of FusionSeqExpandConcatFCOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("Out"),
"Output(Out) of FusionSeqExpandConcatFCOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("FCOut"),
"Output(FCOut) of FusionSeqExpandConcatFCOp should not be null.");
auto ins_dims = ctx->GetInputsDim("X");
auto w_dims = ctx->GetInputDim("FCWeight"); // (M0+M1+M2+..) x D
PADDLE_ENFORCE_EQ(w_dims.size(), 2UL, "Input(FCWeight)'s rank must be 2.");
const int D = w_dims[1];
int sum = ins_dims[0][1];
for (size_t i = 1; i < ins_dims.size(); ++i) {
sum += ins_dims[i][1];
}
PADDLE_ENFORCE_EQ(sum, w_dims[0],
"FC height should be sum of all inputs width.");
if (ctx->HasInput("FCBias")) {
auto b_dims = ctx->GetInputDim("FCBias");
PADDLE_ENFORCE(b_dims.size() == 1 || b_dims.size() == 2,
"b_dims should be 1 or 2, get %d", b_dims.size());
if (b_dims.size() == 1) {
PADDLE_ENFORCE_EQ(b_dims[0], D, "FCBias shapes must be %d.", D);
} else {
PADDLE_ENFORCE_EQ(b_dims[0], 1, "FCBias shapes must be 1x%d.", D);
PADDLE_ENFORCE_EQ(b_dims[1], D, "FCBias shapes must be 1x%d.", D);
}
}
ctx->SetOutputDim("Out", {ins_dims[0][0], D});
  // FCOut should be reshaped at runtime, since the lod cannot be obtained in InferShape
  // explicitly share the ref lod
ctx->ShareLoD("X", "Out", 0);
}
framework::OpKernelType FusionSeqExpandConcatFCOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
return framework::OpKernelType(
framework::ToDataType(ctx.MultiInput<LoDTensor>("X")[0]->type()),
ctx.device_context());
}
void FusionSeqExpandConcatFCOpMaker::Make() {
AddInput("X",
"(LoDTensor) input LodDTensors, the first one must be have ref lod "
"for sequence expand, and the rest input should have same lod.")
.AsDuplicable();
AddInput("FCWeight", "(Tensor) the weights of fc.");
AddInput("FCBias", "(Tensor, optional) the bias of fc.").AsDispensable();
AddOutput("Out", "(LoDTensor) Output LodTensor.");
AddOutput(
"FCOut",
"(Tensor) the intermediate tensor to keep the result of fc."
"Shape is (N x D), where N is the batch size, D is the output dim of fc")
.AsIntermediate();
AddAttr<std::string>("fc_activation",
"(string, default: identity)"
"The activation for the result of fc."
"`identity` by default.")
.SetDefault("identity")
.InEnum({"sigmoid", "tanh", "relu", "identity"});
AddComment(R"DOC(
Fusion Sequence expand + concat + fc Operator.
All of the conditions below should be met:
The ref_level of seq_expand should be 0.
The ref lod of the seq_expand level is the first input of concat.
The other inputs should have the same lod and the same batch size as the ref lod.
The seq len of other inputs should be 1.
The concat axis should be 1.
)DOC");
}
template <typename T>
class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
using DeviceContext = paddle::platform::CPUDeviceContext;
auto ins = ctx.MultiInput<LoDTensor>("X");
auto* w = ctx.Input<Tensor>("FCWeight");
auto* b = ctx.Input<Tensor>("FCBias");
auto* out = ctx.Output<LoDTensor>("Out");
auto* fc_out = ctx.Output<Tensor>("FCOut");
auto* ref_in = ins[0];
auto ref_lod = ref_in->lod();
auto in1_lod = ins[1]->lod();
auto ref_dims = ref_in->dims(); // T x M0
auto in1_dims = ins[1]->dims(); // N x M1
auto w_dims = w->dims();
const int N = ref_lod[0].size() - 1;
const int total_T = ref_dims[0];
const int M0 = ref_dims[1];
const int M1 = in1_dims[1];
const int D = w_dims[1];
    // Some checks; FCOut should be reshaped here,
    // since InferShape cannot get the lod info.
PADDLE_ENFORCE_EQ(ref_lod.size(), 1UL, "Only support input lod size is 1.");
PADDLE_ENFORCE_EQ(in1_lod.size(), 1UL, "Only support input lod size is 1.");
PADDLE_ENFORCE_EQ(in1_lod[0].size() - 1, N,
"Batch size of all inputs should be equal.");
PADDLE_ENFORCE_EQ(in1_lod[0][N], N,
"Seq_length of other inputs should be 1.");
PADDLE_ENFORCE_EQ(in1_dims[0], N, "input height should be batch size.");
for (size_t i = 2; i < ins.size(); ++i) {
PADDLE_ENFORCE_EQ(ins[i]->dims()[0], N,
"All other inputs height should be equal");
PADDLE_ENFORCE_EQ(ins[i]->lod(), in1_lod,
"All other inputs should have same lod");
}
fc_out->Resize({N, D});
std::function<void(const int, const T*, T*)> fc_act;
auto& fc_act_str = ctx.Attr<std::string>("fc_activation");
if (platform::jit::MayIUse(platform::jit::avx)) {
math::VecActivations<T, platform::jit::avx> act_functor;
fc_act = act_functor(fc_act_str);
} else {
math::VecActivations<T, platform::jit::isa_any> act_functor;
fc_act = act_functor(fc_act_str);
}
const T* ref_in_data = ref_in->data<T>();
const T* in1_data = ins[1]->data<T>();
const T* w_data = w->data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());
T* fc_out_data = fc_out->mutable_data<T>(ctx.GetPlace());
auto blas = math::GetBlas<DeviceContext, T>(ctx);
math::FCCompute<DeviceContext, T>(blas, total_T, D, M0, ref_in_data, w_data,
out_data, b ? b->data<T>() : NULL);
w_data = w_data + M0 * D;
    // the first non-reference input writes fc_out directly
blas.MatMul(N, D, M1, in1_data, w_data, fc_out_data);
w_data = w_data + M1 * D;
for (size_t i = 2; i < ins.size(); ++i) {
// add on
const T* in_data = ins[i]->data<T>();
const int K = ins[i]->dims()[1];
blas.GEMM(CblasNoTrans, CblasNoTrans, N, D, K, static_cast<T>(1), in_data,
K, w_data, D, static_cast<T>(1), fc_out_data, D);
w_data = w_data + K * D;
}
T* cur_out_data = out_data;
for (int i = 0; i < N; ++i) {
int seq_len = ref_lod[0][i + 1] - ref_lod[0][i];
T* src = fc_out_data + i * D;
for (int step = 0; step < seq_len; ++step) {
blas.VADD(D, cur_out_data, src, cur_out_data);
cur_out_data = cur_out_data + D;
}
}
fc_act(total_T * D, out_data, out_data);
}
};
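// The core of the kernel above is the sequence-expand-and-add step: the
// per-sequence FC result of the length-1 inputs is broadcast over the time
// steps of the reference input and added row by row. A small standalone sketch
// with a hypothetical lod and plain vectors:
#include <cstdio>
#include <vector>
int main() {
  const int D = 2;
  std::vector<int> lod = {0, 2, 3};  // two sequences of lengths 2 and 1
  // out: row-wise FC of the reference input, one row per time step
  std::vector<double> out = {1, 1, 2, 2, 3, 3};
  // fc_out: FC of the other inputs, one row per sequence
  std::vector<double> fc_out = {10, 20, 30, 40};
  double* cur = out.data();
  for (size_t i = 0; i + 1 < lod.size(); ++i) {
    int seq_len = lod[i + 1] - lod[i];
    const double* src = fc_out.data() + i * D;
    for (int step = 0; step < seq_len; ++step) {
      for (int d = 0; d < D; ++d) cur[d] += src[d];  // the VADD in the kernel
      cur += D;
    }
  }
  for (double v : out) std::printf("%f ", v);  // 11 21 12 22 33 43
  std::printf("\n");
  return 0;
}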
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(fusion_seqexpand_concat_fc, ops::FusionSeqExpandConcatFCOp,
ops::FusionSeqExpandConcatFCOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OP_CPU_KERNEL(fusion_seqexpand_concat_fc,
ops::FusionSeqExpandConcatFCOpKernel<float>,
ops::FusionSeqExpandConcatFCOpKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using LoDTensor = framework::LoDTensor;
using Tensor = framework::Tensor;
class FusionSeqExpandConcatFCOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override;
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override;
};
class FusionSeqExpandConcatFCOpMaker
: public framework::OpProtoAndCheckerMaker {
public:
void Make() override;
};
} // namespace operators
} // namespace paddle
...@@ -177,6 +177,9 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
          dev_ins_data, dev_ins_col_data, static_cast<int>(inputs_col.size()),
          out_row, out_col, output->data<T>());
    }
// Wait() must be called because `inputs_data` may be destructed before
// kernel ends
context.Wait();
  }
};
...@@ -252,6 +255,9 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
          input.data<T>(), in_row, in_col, dev_outs_col_data,
          static_cast<int>(outputs_cols.size()), dev_out_gpu_data);
    }
// Wait() must be called because `outputs_data` may be destructed before
// kernel ends
context.Wait();
  }
};
...
...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <cmath>
#include <functional>
#include <string>
#include "paddle/fluid/platform/cpu_info.h"
#ifdef __AVX__
...
...@@ -41,7 +41,8 @@ template struct SetConstant<platform::CPUDeviceContext, uint8_t>;
  template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>; \
  template struct Transpose<platform::CPUDeviceContext, bool, RANK>;    \
  template struct Transpose<platform::CPUDeviceContext, int16_t, RANK>; \
  template struct Transpose<platform::CPUDeviceContext, uint8_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int8_t, RANK>;
DEFINE_CPU_TRANS(1);
DEFINE_CPU_TRANS(2);
...
...@@ -33,10 +33,11 @@ template struct SetConstant<platform::CUDADeviceContext, int>;
template struct SetConstant<platform::CUDADeviceContext, int64_t>;
template struct SetConstant<platform::CUDADeviceContext, bool>;
#define DEFINE_GPU_TRANS(RANK)                                            \
  template struct Transpose<platform::CUDADeviceContext, float, RANK>;   \
  template struct Transpose<platform::CUDADeviceContext, double, RANK>;  \
  template struct Transpose<platform::CUDADeviceContext, float16, RANK>; \
template struct Transpose<platform::CUDADeviceContext, int8_t, RANK>;
DEFINE_GPU_TRANS(1);
DEFINE_GPU_TRANS(2);
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <utility>
#include <vector>
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T, size_t D, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
template <typename DeviceContext, typename T, size_t D>
void PadFunction(const framework::ExecutionContext& context,
const std::vector<int>& pads, const framework::Tensor& src,
T pad_value, framework::Tensor* out) {
Eigen::array<std::pair<int, int>, D> paddings;
for (size_t i = 0; i < paddings.size(); ++i) {
paddings[i].first = pads[i * 2];
paddings[i].second = pads[i * 2 + 1];
}
auto src_tensor = EigenTensor<T, D>::From(src);
auto out_tensor = EigenTensor<T, D>::From(*out);
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
out_tensor.device(place) = src_tensor.pad(paddings, pad_value);
}
template <typename DeviceContext, typename T, size_t D>
void PadGradFunction(const framework::ExecutionContext& context,
const std::vector<int>& pads, const framework::Tensor& src,
framework::Tensor* d_out) {
Eigen::array<std::pair<int, int>, D> paddings;
for (size_t i = 0; i < paddings.size(); ++i) {
paddings[i].first = -pads[i * 2];
paddings[i].second = -pads[i * 2 + 1];
}
auto d_out_tensor = EigenTensor<T, D>::From(*d_out);
auto src_tensor = EigenTensor<T, D>::From(src);
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
d_out_tensor.device(place) = src_tensor.pad(paddings, 0);
}
template <typename DeviceContext, typename T>
void PaddingFunctor(int rank, const framework::ExecutionContext& context,
const std::vector<int>& pads, T pad_value,
const framework::Tensor& src, framework::Tensor* out) {
switch (rank) {
case 1:
PadFunction<DeviceContext, T, 1>(context, pads, src, pad_value, out);
break;
case 2:
PadFunction<DeviceContext, T, 2>(context, pads, src, pad_value, out);
break;
case 3:
PadFunction<DeviceContext, T, 3>(context, pads, src, pad_value, out);
break;
case 4:
PadFunction<DeviceContext, T, 4>(context, pads, src, pad_value, out);
break;
case 5:
PadFunction<DeviceContext, T, 5>(context, pads, src, pad_value, out);
break;
case 6:
PadFunction<DeviceContext, T, 6>(context, pads, src, pad_value, out);
break;
default:
PADDLE_THROW(
"PadOp only support tensors with no more than 6 dimensions.");
}
}
template <typename DeviceContext, typename T>
void PaddingGradFunctor(int rank, const framework::ExecutionContext& context,
const std::vector<int>& pads,
const framework::Tensor& src, framework::Tensor* out) {
switch (rank) {
case 1:
PadGradFunction<DeviceContext, T, 1>(context, pads, src, out);
break;
case 2:
PadGradFunction<DeviceContext, T, 2>(context, pads, src, out);
break;
case 3:
PadGradFunction<DeviceContext, T, 3>(context, pads, src, out);
break;
case 4:
PadGradFunction<DeviceContext, T, 4>(context, pads, src, out);
break;
case 5:
PadGradFunction<DeviceContext, T, 5>(context, pads, src, out);
break;
case 6:
PadGradFunction<DeviceContext, T, 6>(context, pads, src, out);
break;
default:
PADDLE_THROW(
"PadOp only support tensors with no more than 6 dimensions.");
}
}
} // namespace math
} // namespace operators
} // namespace paddle
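A note on the interface above: math::PaddingFunctor takes the pads vector flattened as {before_0, after_0, before_1, after_1, ...}, and each output dimension is the input dimension plus its before/after amounts. A minimal standalone sketch (plain C++, made-up shape and pad values, not Paddle code) of how the output shape follows from that layout:

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> dims = {2, 3};        // hypothetical input shape
  std::vector<int> pads = {0, 1, 2, 0};  // {before_0, after_0, before_1, after_1}
  for (size_t i = 0; i < dims.size(); ++i) {
    // mirrors the loop in PadFunction: paddings[i] = {pads[2 * i], pads[2 * i + 1]}
    dims[i] += pads[2 * i] + pads[2 * i + 1];
  }
  std::printf("padded shape: (%d, %d)\n", dims[0], dims[1]);  // prints (3, 5)
  return 0;
}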
...@@ -38,13 +38,14 @@ class CopyMatrixRowsFunctor<platform::CPUDeviceContext, T> { ...@@ -38,13 +38,14 @@ class CopyMatrixRowsFunctor<platform::CPUDeviceContext, T> {
auto width = dst_dims[1]; auto width = dst_dims[1];
auto* src_data = src.data<T>(); auto* src_data = src.data<T>();
auto* dst_data = dst->data<T>(); auto* dst_data = dst->data<T>();
for (int i = 0; i < height; ++i) { const int sz = width * sizeof(T);
if (is_src_index) { if (is_src_index) {
memcpy(dst_data + i * width, src_data + index[i] * width, for (int i = 0; i < height; ++i) {
width * sizeof(T)); memcpy(dst_data + i * width, src_data + index[i] * width, sz);
} else { }
memcpy(dst_data + index[i] * width, src_data + i * width, } else {
width * sizeof(T)); for (int i = 0; i < height; ++i) {
memcpy(dst_data + index[i] * width, src_data + i * width, sz);
} }
} }
} }
......
...@@ -18,65 +18,86 @@ namespace paddle { ...@@ -18,65 +18,86 @@ namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
template <typename T>
void CopyValidData(framework::Tensor* dst_tensor,
const framework::Tensor* src_tensor,
const framework::Vector<size_t>& seq_offsets,
int pad_seq_len, int step_width, bool norm_by_len,
CopyType type, PadLayout layout) {
int seq_num = seq_offsets.size() - 1;
const T* src_data = src_tensor->data<T>();
T* dst_data = dst_tensor->data<T>();
int seq_cpy_gap = step_width;
int pad_cpy_gap =
layout == kBatchLengthWidth ? step_width : seq_num * step_width;
for (int seq_idx = 0; seq_idx < seq_num; ++seq_idx) {
int valid_seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx];
PADDLE_ENFORCE_GE(
pad_seq_len, valid_seq_len,
"The padded sequence length can not be less than its original length.");
int seq_data_offset = seq_offsets[seq_idx] * step_width;
int pad_data_offset = layout == kBatchLengthWidth
? seq_idx * pad_seq_len * step_width
: seq_idx * step_width;
float scale = 1.0f / static_cast<float>(valid_seq_len);
for (int step_idx = 0; step_idx < valid_seq_len; ++step_idx) {
const T* src =
src_data + (type == kSeqToPad ? seq_data_offset : pad_data_offset);
T* dst =
dst_data + (type == kSeqToPad ? pad_data_offset : seq_data_offset);
memcpy(dst, src, step_width * sizeof(T));
if (norm_by_len) {
for (int i = 0; i < step_width; ++i) {
*(dst + i) *= scale;
}
}
seq_data_offset += seq_cpy_gap;
pad_data_offset += pad_cpy_gap;
}
}
}
template <typename T> template <typename T>
class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> { class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
public: public:
void operator()(const platform::CPUDeviceContext& context, void operator()(const platform::CPUDeviceContext& context,
const framework::LoDTensor& seq, framework::Tensor* padding, const framework::LoDTensor& seq_tensor,
bool norm_by_times) { framework::LoDTensor* pad_tensor,
auto lod = seq.lod(); const framework::LoDTensor& pad_value, int pad_seq_len = -1,
PADDLE_ENFORCE_GT(lod.size(), 0UL, int lod_level = 0, bool norm_by_times = false,
"The LoD of LoDTensor seq should not be null."); const PadLayout layout = kBatchLengthWidth) {
auto seq_lod = seq_tensor.lod();
const size_t level = 0; const auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level];
framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); const auto& seq_tensor_dims = seq_tensor.dims();
const auto& pad_tensor_dims = pad_tensor->dims();
auto seq_dims = seq.dims(); if (pad_seq_len == -1) {
PADDLE_ENFORCE_EQ(seq_dims[0], pad_seq_len = MaximumSequenceLength(seq_offsets);
static_cast<int64_t>(abs_offset_lod[level].back()), }
"The first dimension of LoDTensor seq should be " int step_width = seq_tensor.numel() / seq_tensor_dims[0];
"equal to the sum of all sequences's length.");
CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len,
auto padding_dims = padding->dims(); step_width, layout);
PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, PADDLE_ENFORCE(pad_value.numel() == 1 || pad_value.numel() == step_width,
"The input padding should be a 3-D Tensor of shape " "The numel of 'pad_value' can only be 1 or be equal to the "
"[max_sequence_length, num_sequences, sequence_width]."); "'step_width'.");
const int64_t max_sequence_length = MaximumSequenceLength(lod, level); // fill padding value
PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, T* pad_data = pad_tensor->data<T>();
"The first dimension of Tensor padding should be the " const T* pad_value_data = pad_value.data<T>();
"maximum length of all sequences in LoDTensor seq."); if (pad_value.numel() == 1) {
for (int i = 0; i < pad_tensor->numel(); ++i) {
const int64_t num_sequences = abs_offset_lod[level].size() - 1; pad_data[i] = *pad_value_data;
PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, }
"The second dimension of Tensor padding should be the " } else {
"number of sequences in LoDTensor seq."); for (int i = 0; i < pad_tensor->numel(); i += step_width) {
memcpy(pad_data + i, pad_value_data, step_width * sizeof(T));
const int64_t sequence_width = seq.numel() / seq_dims[0];
PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
"The third dimension of Tensor padding should be the "
"width of sequence in LoDTensor seq.");
const T* seq_data = seq.data<T>();
T* padding_data = padding->data<T>();
for (int64_t i = 0; i < max_sequence_length; ++i) {
for (int64_t j = 0; j < num_sequences; ++j) {
int64_t start_pos = abs_offset_lod[level][j];
int64_t sequence_length = abs_offset_lod[level][j + 1] - start_pos;
if (i < sequence_length) {
// i > 0 => sequence_length > 0
T scale =
norm_by_times ? (1.0f / static_cast<T>(sequence_length)) : 1.0f;
for (int64_t k = 0; k < sequence_width; ++k) {
padding_data[(i * num_sequences + j) * sequence_width + k] =
seq_data[(start_pos + i) * sequence_width + k] * scale;
}
} else {
memset(padding_data + (i * num_sequences + j) * sequence_width, 0,
sequence_width * sizeof(T));
}
} }
} }
CopyValidData<T>(pad_tensor, &seq_tensor, seq_offsets, pad_seq_len,
step_width, norm_by_times, kSeqToPad, layout);
} }
}; };
...@@ -84,62 +105,35 @@ template <typename T> ...@@ -84,62 +105,35 @@ template <typename T>
class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, T> { class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
public: public:
void operator()(const platform::CPUDeviceContext& context, void operator()(const platform::CPUDeviceContext& context,
framework::LoDTensor* seq, const framework::Tensor& padding, const framework::LoDTensor& pad_tensor,
bool norm_by_times) { framework::LoDTensor* seq_tensor, int pad_seq_len = -1,
auto lod = seq->lod(); int lod_level = 0, bool norm_by_times = false,
PADDLE_ENFORCE_GT(lod.size(), 0UL, const PadLayout layout = kBatchLengthWidth) {
"The LoD of LoDTensor seq should not be null."); auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level];
const auto& seq_tensor_dims = seq_tensor->dims();
const size_t level = 0; const auto& pad_tensor_dims = pad_tensor.dims();
framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); if (pad_seq_len == -1) {
pad_seq_len = MaximumSequenceLength(seq_offsets);
auto seq_dims = seq->dims();
PADDLE_ENFORCE_EQ(seq_dims[0],
static_cast<int64_t>(abs_offset_lod[level].back()),
"The first dimension of LoDTensor seq should be "
"equal to the sum of all sequences's length.");
auto padding_dims = padding.dims();
PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL,
"The input padding should be a 3-D Tensor of shape "
"[max_sequnece_length, num_sequences, sequence_width].");
const int64_t max_sequence_length = MaximumSequenceLength(lod, level);
PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length,
"The first dimension of Tensor padding should be "
"the maximum length of all sequences in LoDTensor seq.");
const int64_t num_sequences = abs_offset_lod[level].size() - 1;
PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences,
"The second dimension of Tensor padding should be "
"the number of sequences in LoDTensor seq.");
const int64_t sequence_width = seq->numel() / seq_dims[0];
PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
"The third dimension of Tensor padding should be the "
"width of sequence in LoDTensor seq.");
const T* padding_data = padding.data<T>();
T* seq_data = seq->data<T>();
for (int64_t i = 0; i < num_sequences; ++i) {
int64_t start_pos = abs_offset_lod[level][i];
int64_t sequence_length = abs_offset_lod[level][i + 1] - start_pos;
for (int64_t j = 0; j < sequence_length; ++j) {
// sequence_width > j > 0
T scale =
norm_by_times ? (1.0f / static_cast<T>(sequence_length)) : 1.0f;
for (int64_t k = 0; k < sequence_width; ++k) {
seq_data[(start_pos + j) * sequence_width + k] =
padding_data[(j * num_sequences + i) * sequence_width + k] *
scale;
}
}
} }
int step_width = seq_tensor->numel() / seq_tensor_dims[0];
CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len,
step_width, layout);
CopyValidData<T>(seq_tensor, &pad_tensor, seq_offsets, pad_seq_len,
step_width, norm_by_times, kPadToSeq, layout);
} }
}; };
template class PaddingLoDTensorFunctor<platform::CPUDeviceContext, int>;
template class PaddingLoDTensorFunctor<platform::CPUDeviceContext, int64_t>;
template class PaddingLoDTensorFunctor<platform::CPUDeviceContext, float>; template class PaddingLoDTensorFunctor<platform::CPUDeviceContext, float>;
template class PaddingLoDTensorFunctor<platform::CPUDeviceContext, double>;
template class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, int>;
template class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, int64_t>;
template class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, float>; template class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, float>;
template class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, double>;
} // namespace math } // namespace math
} // namespace operators } // namespace operators
......
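The offset bookkeeping in CopyValidData above is easier to follow with concrete numbers. Below is a minimal standalone sketch (plain C++; the offsets, pad length, and width are made up) that reproduces the flattened source/destination offsets for the kBatchLengthWidth layout, where the padded tensor is laid out as [seq_num, pad_seq_len, step_width]:

#include <cstdio>
#include <vector>

int main() {
  // hypothetical absolute offsets of two sequences with lengths 2 and 3
  std::vector<int> seq_offsets = {0, 2, 5};
  const int pad_seq_len = 3, step_width = 4;
  for (size_t seq_idx = 0; seq_idx + 1 < seq_offsets.size(); ++seq_idx) {
    int valid_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx];
    for (int step = 0; step < valid_len; ++step) {
      // flattened element offsets, as computed in CopyValidData for kBatchLengthWidth
      int seq_off = (seq_offsets[seq_idx] + step) * step_width;
      int pad_off = (static_cast<int>(seq_idx) * pad_seq_len + step) * step_width;
      std::printf("seq %zu step %d: seq_off=%d pad_off=%d\n",
                  seq_idx, step, seq_off, pad_off);
    }
  }
  return 0;
}

For the second sequence this prints pad offsets 12, 16, 20, i.e. the copy skips over the padded tail of the first sequence, matching pad_cpy_gap = step_width and the per-sequence base offset seq_idx * pad_seq_len * step_width.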
...@@ -19,41 +19,32 @@ namespace paddle { ...@@ -19,41 +19,32 @@ namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
template <typename T, bool NormByTimes, bool Padding> template <typename T, CopyType Type>
__global__ void SequencePaddingKernel(T* padding, T* sequence, __global__ void SequencePaddingKernel(
const size_t* sequence_start_positions, T* dst, const T* src, const T* pad_value, bool is_constant_pad,
const size_t sequence_width, const size_t* seq_offsets, const size_t seq_num, const size_t pad_seq_len,
const size_t max_sequence_length, const size_t step_width, bool norm_by_len, const PadLayout layout) {
const size_t num_sequences) { size_t seq_idx = blockIdx.y;
size_t padding_idx = blockIdx.y; size_t seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx];
size_t start_pos = sequence_start_positions[padding_idx];
size_t sequence_length = size_t step_idx = blockIdx.x * blockDim.y + threadIdx.y;
sequence_start_positions[padding_idx + 1] - start_pos; size_t seq_data_offset = (seq_offsets[seq_idx] + step_idx) * step_width;
size_t pad_data_offset = layout == kBatchLengthWidth
size_t sequence_idx = blockIdx.x * blockDim.y + threadIdx.y; ? (seq_idx * pad_seq_len + step_idx) * step_width
size_t padding_base_idx = : (step_idx * seq_num + seq_idx) * step_width;
(sequence_idx * num_sequences + padding_idx) * sequence_width;
size_t sequence_base_idx = (start_pos + sequence_idx) * sequence_width; T* dst_data = dst + (Type == kSeqToPad ? pad_data_offset : seq_data_offset);
const T* src_data =
if (sequence_idx < sequence_length) { src + (Type == kSeqToPad ? seq_data_offset : pad_data_offset);
T scale = NormByTimes ? (1.0f / static_cast<T>(sequence_length)) : 1.0f;
if (Padding) { if (step_idx < seq_len) {
/* sequence -> padding */ float scale = norm_by_len ? (1.0f / static_cast<float>(seq_len)) : 1.0f;
for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) { for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) {
padding[padding_base_idx + i] = scale * sequence[sequence_base_idx + i]; dst_data[i] = scale * src_data[i];
}
} else {
/* padding -> sequence */
for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) {
sequence[sequence_base_idx + i] = scale * padding[padding_base_idx + i];
}
} }
} else if (sequence_idx < max_sequence_length) { } else if (step_idx < pad_seq_len && Type == kSeqToPad) {
if (Padding) { for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) {
/* sequence -> padding */ dst_data[i] = is_constant_pad ? pad_value[0] : pad_value[i];
for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) {
padding[padding_base_idx + i] = 0;
}
} }
} }
} }
...@@ -62,74 +53,59 @@ template <typename T> ...@@ -62,74 +53,59 @@ template <typename T>
class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> { class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
public: public:
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const framework::LoDTensor& seq, framework::Tensor* padding, const framework::LoDTensor& seq_tensor,
bool norm_by_times) { framework::LoDTensor* pad_tensor,
auto lod = seq.lod(); const framework::LoDTensor& pad_value, int pad_seq_len = -1,
PADDLE_ENFORCE_GT(lod.size(), 0UL, int lod_level = 0, bool norm_by_times = false,
"The lod of LoDTensor seq should not be null."); const PadLayout layout = kBatchLengthWidth) {
auto seq_lod = seq_tensor.lod();
const size_t level = 0; const auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level];
framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); const auto& seq_tensor_dims = seq_tensor.dims();
const auto& pad_tensor_dims = pad_tensor->dims();
auto seq_dims = seq.dims(); int max_seq_len = MaximumSequenceLength(seq_offsets);
PADDLE_ENFORCE_EQ(seq_dims[0], if (pad_seq_len == -1) {
static_cast<int64_t>(abs_offset_lod[level].back()), pad_seq_len = max_seq_len;
"The first dimension of LoDTensor seq should be " }
"equal to the sum of all sequences's length."); PADDLE_ENFORCE_GE(pad_seq_len, max_seq_len,
"The pad_seq_len must be equal to or greater than the "
auto padding_dims = padding->dims(); "original max sequence length.");
PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, int step_width = seq_tensor.numel() / seq_tensor_dims[0];
"The input padding should be a 3-D Tensor of shape " int seq_num = seq_offsets.size() - 1;
"[max_sequence_length, num_sequences, sequence_width].");
CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len,
int64_t max_sequence_length = MaximumSequenceLength(lod, level); step_width, layout);
PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, PADDLE_ENFORCE(pad_value.numel() == 1 || pad_value.numel() == step_width,
"The first dimension of Tensor padding should be the " "The numel of 'pad_value' can only be 1 or be equal to the "
"maximum length of all sequences in LoDTensor seq."); "'step_width'.");
const int64_t num_sequences = abs_offset_lod[level].size() - 1; if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) {
PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, TensorCopy(seq_tensor, context.GetPlace(), context, pad_tensor);
"The second dimension of Tensor padding should be the " pad_tensor->Resize(pad_tensor_dims);
"number of sequences in LoDTensor seq.");
const int64_t sequence_width = seq.numel() / seq_dims[0];
PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
"The third dimension of Tensor padding should be the "
"width of sequence in LoDTensor seq.");
if (!norm_by_times && num_sequences == 1UL) {
TensorCopy(seq, context.GetPlace(), context, padding);
padding->Resize(padding_dims);
return; return;
} }
const int64_t kBlockSize = 512; const int kBlockSize = 512;
/* At least use 32 threads to copy sequence_width elements, /* At least use 32 threads to copy sequence_width elements,
* and at least 8 elements for each thread. * and at least 8 elements for each thread.
*/ */
size_t block_dim_x = size_t block_dim_x =
std::min(((((sequence_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
size_t block_dim_y = kBlockSize / block_dim_x; size_t block_dim_y = kBlockSize / block_dim_x;
dim3 threads(block_dim_x, block_dim_y); dim3 threads(block_dim_x, block_dim_y);
size_t grid_dim_x = (max_sequence_length + block_dim_y - 1) / block_dim_y; size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y;
size_t grid_dim_y = num_sequences; size_t grid_dim_y = seq_num;
dim3 grid(grid_dim_x, grid_dim_y); dim3 grid(grid_dim_x, grid_dim_y);
const T* seq_data = seq.data<T>(); const T* seq_data = seq_tensor.data<T>();
T* padding_data = padding->data<T>(); T* pad_data = pad_tensor->data<T>();
if (norm_by_times) { const T* pad_value_data = pad_value.data<T>();
SequencePaddingKernel<T, 1, 1><<<grid, threads, 0, context.stream()>>>(
padding_data, const_cast<T*>(seq_data), SequencePaddingKernel<T, kSeqToPad><<<grid, threads, 0, context.stream()>>>(
abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, pad_data, seq_data, pad_value_data, pad_value.numel() == 1,
max_sequence_length, num_sequences); seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len,
} else { step_width, norm_by_times, layout);
SequencePaddingKernel<T, 0, 1><<<grid, threads, 0, context.stream()>>>(
padding_data, const_cast<T*>(seq_data),
abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width,
max_sequence_length, num_sequences);
}
} }
}; };
...@@ -137,79 +113,62 @@ template <typename T> ...@@ -137,79 +113,62 @@ template <typename T>
class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> { class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
public: public:
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
framework::LoDTensor* seq, const framework::Tensor& padding, const framework::LoDTensor& pad_tensor,
bool norm_by_times) { framework::LoDTensor* seq_tensor, int pad_seq_len = -1,
auto lod = seq->lod(); int lod_level = 0, bool norm_by_times = false,
PADDLE_ENFORCE_GT(lod.size(), 0UL, const PadLayout layout = kBatchLengthWidth) {
"The lod of LoDTensor seq should not be null."); auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level];
const auto& seq_tensor_dims = seq_tensor->dims();
const size_t level = 0; const auto& pad_tensor_dims = pad_tensor.dims();
framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); int max_seq_len = MaximumSequenceLength(seq_offsets);
if (pad_seq_len == -1) {
auto seq_dims = seq->dims(); pad_seq_len = max_seq_len;
PADDLE_ENFORCE_EQ(seq_dims[0], }
static_cast<int64_t>(abs_offset_lod[level].back()), int step_width = seq_tensor->numel() / seq_tensor_dims[0];
"The first dimension of LoDTensor seq should be " int seq_num = seq_offsets.size() - 1;
"equal to the sum of all sequences's length.");
CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len,
auto padding_dims = padding.dims(); step_width, layout);
PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL,
"The input padding should be a 3-D Tensor of shape " if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) {
"[max_sequnece_length, num_sequences, sequence_width]."); TensorCopy(pad_tensor, context.GetPlace(), context, seq_tensor);
seq_tensor->Resize(seq_tensor_dims);
int64_t max_sequence_length = MaximumSequenceLength(lod, level);
PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length,
"The first dimension of Tensor padding should be "
"the maximum length of all sequences in LoDTensor seq.");
const int64_t num_sequences = abs_offset_lod[level].size() - 1;
PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences,
"The second dimension of Tensor padding should be "
"the number of sequences in LoDTensor seq.");
const int64_t sequence_width = seq->numel() / seq_dims[0];
PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
"The third dimension of Tensor padding should be the "
"width of sequence in LoDTensor seq.");
if (!norm_by_times && num_sequences == 1UL) {
TensorCopy(padding, context.GetPlace(), context, seq);
seq->Resize(seq_dims);
return; return;
} }
const int64_t kBlockSize = 512; const int kBlockSize = 512;
/* At least use 32 threads to copy sequence_width elements, /* At least use 32 threads to copy sequence_width elements,
* and at least 8 elements for each thread. * and at least 8 elements for each thread.
*/ */
size_t block_dim_x = size_t block_dim_x =
std::min(((((sequence_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
size_t block_dim_y = kBlockSize / block_dim_x; size_t block_dim_y = kBlockSize / block_dim_x;
dim3 threads(block_dim_x, block_dim_y); dim3 threads(block_dim_x, block_dim_y);
size_t grid_dim_x = (max_sequence_length + block_dim_y - 1) / block_dim_y; size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y;
size_t grid_dim_y = num_sequences; size_t grid_dim_y = seq_num;
dim3 grid(grid_dim_x, grid_dim_y); dim3 grid(grid_dim_x, grid_dim_y);
const T* padding_data = padding.data<T>(); const T* pad_data = pad_tensor.data<T>();
T* seq_data = seq->data<T>(); T* seq_data = seq_tensor->data<T>();
if (norm_by_times) {
SequencePaddingKernel<T, 1, 0><<<grid, threads, 0, context.stream()>>>( SequencePaddingKernel<T, kPadToSeq><<<grid, threads, 0, context.stream()>>>(
const_cast<T*>(padding_data), seq_data, seq_data, pad_data, nullptr, false,
abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len,
max_sequence_length, num_sequences); step_width, norm_by_times, layout);
} else {
SequencePaddingKernel<T, 0, 0><<<grid, threads, 0, context.stream()>>>(
const_cast<T*>(padding_data), seq_data,
abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width,
max_sequence_length, num_sequences);
}
} }
}; };
template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, int>;
template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, int64_t>;
template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, float>; template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, float>;
template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, double>;
template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, int>;
template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, int64_t>;
template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, float>; template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, float>;
template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, double>;
} // namespace math } // namespace math
} // namespace operators } // namespace operators
......
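Both CUDA functors size their thread blocks with the same expression: it rounds ceil(step_width / 8) up to the next multiple of 32 (at least 32 threads per row, at least 8 elements per thread, per the comment in the code) and caps the result at kBlockSize. A small host-side check with made-up widths:

#include <algorithm>
#include <cstdio>

int main() {
  const int kBlockSize = 512;
  // hypothetical step widths; the Paddle kernels derive this from the tensor shape
  const int widths[] = {8, 100, 1000, 8192};
  for (int step_width : widths) {
    int block_dim_x =
        std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
    int block_dim_y = kBlockSize / block_dim_x;
    std::printf("step_width=%d -> threads(%d, %d)\n",
                step_width, block_dim_x, block_dim_y);
  }
  return 0;
}

This prints threads(32, 16) for widths 8 and 100, threads(128, 4) for 1000, and threads(512, 1) for 8192, so the total block size stays at 512 threads while wider rows get more threads along x.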
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <algorithm> #include <algorithm>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
...@@ -22,17 +23,33 @@ namespace paddle { ...@@ -22,17 +23,33 @@ namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
inline static size_t MaximumSequenceLength(const framework::LoD& lod, enum PadLayout { kBatchLengthWidth = 0, kLengthBatchWidth };
const size_t level) {
const size_t num_sequences = lod[level].size() - 1; enum CopyType { kSeqToPad, kPadToSeq };
size_t max_sequence_length = 0;
framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); inline static size_t MaximumSequenceLength(
for (size_t i = 0; i < num_sequences; ++i) { const framework::Vector<size_t>& seq_offset) {
max_sequence_length = size_t seq_num = seq_offset.size() - 1;
std::max(max_sequence_length, size_t max_seq_len = 0;
abs_offset_lod[level][i + 1] - abs_offset_lod[level][i]); for (size_t i = 0; i < seq_num; ++i) {
max_seq_len = std::max(max_seq_len, seq_offset[i + 1] - seq_offset[i]);
} }
return max_sequence_length; return max_seq_len;
}
inline static void CheckDims(const framework::DDim& seq_tensor_dims,
const framework::DDim& pad_tensor_dims,
const framework::Vector<size_t>& seq_offset,
int64_t padded_seq_len, int64_t step_width,
const PadLayout& layout) {
PADDLE_ENFORCE_EQ(static_cast<size_t>(seq_tensor_dims[0]), seq_offset.back(),
"Value of 1st dimension of the sequence tensor should be "
"equal to sum of lengths of all sequences.");
PADDLE_ENFORCE(seq_tensor_dims.size() + 1 == pad_tensor_dims.size() ||
seq_tensor_dims.size() == pad_tensor_dims.size(),
"pad_tensor's rank should be 1 greater than seq_tensor's "
"rank, or be equal with it.");
} }
/* /*
...@@ -64,15 +81,22 @@ inline static size_t MaximumSequenceLength(const framework::LoD& lod, ...@@ -64,15 +81,22 @@ inline static size_t MaximumSequenceLength(const framework::LoD& lod,
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class PaddingLoDTensorFunctor { class PaddingLoDTensorFunctor {
public: public:
void operator()(const DeviceContext& context, const framework::LoDTensor& seq, void operator()(const DeviceContext& context,
framework::Tensor* padding, bool norm_by_times); const framework::LoDTensor& seq_tensor,
framework::LoDTensor* pad_tensor,
const framework::LoDTensor& pad_value, int pad_seq_len = -1,
int lod_level = 0, bool norm_by_times = false,
const PadLayout layout = kBatchLengthWidth);
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class UnpaddingLoDTensorFunctor { class UnpaddingLoDTensorFunctor {
public: public:
void operator()(const DeviceContext& context, framework::LoDTensor* seq, void operator()(const DeviceContext& context,
const framework::Tensor& padding, bool norm_by_times); const framework::LoDTensor& pad_tensor,
framework::LoDTensor* seq_tensor, int pad_seq_len = -1,
int lod_level = 0, bool norm_by_times = false,
const PadLayout layout = kBatchLengthWidth);
}; };
} // namespace math } // namespace math
......
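Under the new interface the padded shape depends on the PadLayout: kBatchLengthWidth yields [seq_num, pad_seq_len, step_width], while kLengthBatchWidth yields [pad_seq_len, seq_num, step_width] (the layout the updated test below still builds). A minimal sketch of MaximumSequenceLength and the resulting shapes, using std::vector in place of framework::Vector and made-up offsets:

#include <algorithm>
#include <cstdio>
#include <vector>

// mirrors MaximumSequenceLength from sequence_padding.h, but on std::vector
static size_t MaxSeqLen(const std::vector<size_t>& seq_offset) {
  size_t max_len = 0;
  for (size_t i = 0; i + 1 < seq_offset.size(); ++i) {
    max_len = std::max(max_len, seq_offset[i + 1] - seq_offset[i]);
  }
  return max_len;
}

int main() {
  std::vector<size_t> offsets = {0, 3, 5, 9};  // three sequences: lengths 3, 2, 4
  size_t seq_num = offsets.size() - 1;
  size_t pad_seq_len = MaxSeqLen(offsets);     // 4
  size_t step_width = 2;                       // hypothetical feature width
  std::printf("kBatchLengthWidth pad shape: [%zu, %zu, %zu]\n",
              seq_num, pad_seq_len, step_width);
  std::printf("kLengthBatchWidth pad shape: [%zu, %zu, %zu]\n",
              pad_seq_len, seq_num, step_width);
  return 0;
}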
...@@ -23,7 +23,9 @@ void TestSequencePadding(const paddle::framework::LoD& lod, ...@@ -23,7 +23,9 @@ void TestSequencePadding(const paddle::framework::LoD& lod,
paddle::framework::LoDTensor cpu_seq_back; paddle::framework::LoDTensor cpu_seq_back;
paddle::framework::LoDTensor seq; paddle::framework::LoDTensor seq;
paddle::framework::LoDTensor seq_back; paddle::framework::LoDTensor seq_back;
paddle::framework::Tensor padding; paddle::framework::LoDTensor padding;
paddle::framework::LoDTensor cpu_pad_value;
paddle::framework::LoDTensor pad_value;
const size_t level = lod.size() - 1; const size_t level = lod.size() - 1;
auto seq_dims = auto seq_dims =
...@@ -46,20 +48,33 @@ void TestSequencePadding(const paddle::framework::LoD& lod, ...@@ -46,20 +48,33 @@ void TestSequencePadding(const paddle::framework::LoD& lod,
} }
const size_t max_sequence_length = const size_t max_sequence_length =
paddle::operators::math::MaximumSequenceLength(lod, level); paddle::operators::math::MaximumSequenceLength(lod[level]);
const size_t num_sequences = lod[level].size() - 1; const size_t num_sequences = lod[level].size() - 1;
auto padding_dims = auto padding_dims =
paddle::framework::make_ddim({static_cast<int64_t>(max_sequence_length), paddle::framework::make_ddim({static_cast<int64_t>(max_sequence_length),
static_cast<int64_t>(num_sequences), static_cast<int64_t>(num_sequences),
static_cast<int64_t>(sequence_width)}); static_cast<int64_t>(sequence_width)});
padding.mutable_data<T>(padding_dims, *place); padding.mutable_data<T>(padding_dims, *place);
T* pad_value_data =
cpu_pad_value.mutable_data<T>({1}, paddle::platform::CPUPlace());
*pad_value_data = static_cast<T>(0);
if (paddle::platform::is_cpu_place(*place)) {
pad_value = cpu_pad_value;
} else {
TensorCopySync(cpu_pad_value, *place, &pad_value);
}
paddle::operators::math::PaddingLoDTensorFunctor<DeviceContext, T>()( paddle::operators::math::PaddingLoDTensorFunctor<DeviceContext, T>()(
*context, seq, &padding, false); *context, seq, &padding, pad_value, -1, 0, false,
paddle::operators::math::kLengthBatchWidth);
seq_back.set_lod(lod); seq_back.set_lod(lod);
seq_back.mutable_data<T>(seq_dims, *place); seq_back.mutable_data<T>(seq_dims, *place);
paddle::operators::math::UnpaddingLoDTensorFunctor<DeviceContext, T>()( paddle::operators::math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
*context, &seq_back, padding, false); *context, padding, &seq_back, -1, 0, false,
paddle::operators::math::kLengthBatchWidth);
if (paddle::platform::is_cpu_place(*place)) { if (paddle::platform::is_cpu_place(*place)) {
cpu_seq_back = seq_back; cpu_seq_back = seq_back;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/pad_constant_like_op.h"
namespace paddle {
namespace operators {
using framework::Tensor;
class PadConstantLikeOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of PadConstantLikeOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Y"),
"Input(Y) of PadConstantLikeOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of PadConstantLikeOp should not be null.");
auto x_dim = ctx->GetInputDim("X");
auto y_dim = ctx->GetInputDim("Y");
PADDLE_ENFORCE_EQ(x_dim.size(), y_dim.size(),
"The dimention of X and Y should be the same.");
for (int i = 0; i < x_dim.size(); ++i) {
PADDLE_ENFORCE_GE(x_dim[i], y_dim[i]);
}
ctx->SetOutputDim("Out", x_dim);
ctx->ShareLoD("X", /*->*/ "Out");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Y")->type()),
ctx.device_context());
}
};
class PadConstantLikeOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"The input of pad_constant_like op. "
"The input should be a k-D tensor(k > 0 and k < 7)");
AddInput("Y",
"The input of pad_constant_like op. "
"The input should be a k-D tensor(k > 0 and k < 7)");
AddOutput("Out",
"The output of pad_constant_like op. "
"A tensor with the same shape as X.");
AddAttr<float>("pad_value",
"(float, default 0.0) "
"The value to fill the padded areas.")
.SetDefault(0.0f);
AddComment(R"DOC(
PadConstantLikeOp Operator.
Pad input(Y) with pad_value; the number of values padded to the edges of each
axis is determined by the difference between the shapes of X and Y, i.e.
((0, shape_x_0 - shape_y_0), ..., (0, shape_x_n - shape_y_n)), one pad-width
pair per axis.
The input should be a k-D tensor (k > 0 and k < 7). As an example:
case1:
Given:
X = [[1, 2],
[3, 4],
[1, 2],
[3, 4]]],
X.shape = (4, 2)
Y = [[5, 6],
[7, 8]],
Y.shape = (2, 2)
And
pad_value = 0,
Return:
Out = [[5, 6],
[7, 8],
[0, 0],
[0, 0]]
Out.shape = (4, 2)
case2:
Given:
X = [[[[ 0, 1, 2],
[ 3, 4, 5]],
[[ 6, 7, 8],
[ 9, 10, 11]],
[[12, 13, 14],
[15, 16, 17]]],
[[[18, 19, 20],
[21, 22, 23]],
[[24, 25, 26],
[27, 28, 29]],
[[30, 31, 32],
[33, 34, 35]]]]
X.shape = (2, 3, 2, 3)
Y = [[[[35, 36, 37]],
[[38, 39, 40]],
[[41, 42, 43]]]]
Y.shape = (1, 3, 1, 3)
And
pad_value = -1,
Return:
Out = [[[[35, 36, 37],
[-1, -1, -1]],
[[38, 39, 40],
[-1, -1, -1]],
[[41, 42, 43],
[-1, -1, -1]]],
[[[-1, -1, -1],
[-1, -1, -1]],
[[-1, -1, -1],
[-1, -1, -1]],
[[-1, -1, -1],
[-1, -1, -1]]]]
Out.shape = (2, 3, 2, 3)
)DOC");
}
};
class PadConstantLikeOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) should not be null");
auto y_dim = ctx->GetInputDim("Y");
auto dout_dim = ctx->GetInputDim(framework::GradVarName("Out"));
PADDLE_ENFORCE_EQ(dout_dim.size(), y_dim.size(),
"The dimention of X and Y should be the same.");
auto y_grad_name = framework::GradVarName("Y");
if (ctx->HasOutput(y_grad_name)) {
ctx->SetOutputDim(y_grad_name, y_dim);
ctx->ShareLoD("Y", /*->*/ y_grad_name);
for (int i = 0; i < y_dim.size(); ++i) {
PADDLE_ENFORCE_GE(dout_dim[i], y_dim[i]);
}
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Y")->type()),
ctx.device_context());
}
};
class PadConstantLikeOpGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *bind = new framework::OpDesc();
bind->SetType("pad_constant_like_grad");
bind->SetInput("Y", Input("Y"));
bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
bind->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
bind->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(bind);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(pad_constant_like, ops::PadConstantLikeOp,
ops::PadConstantLikeOpMaker, ops::PadConstantLikeOpGradMaker);
REGISTER_OPERATOR(pad_constant_like_grad, ops::PadConstantLikeOpGrad);
REGISTER_OP_CPU_KERNEL(
pad_constant_like,
ops::PadConstantLikeKernel<paddle::platform::CPUDeviceContext, float>,
ops::PadConstantLikeKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
pad_constant_like_grad,
ops::PadConstantLikeGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::PadConstantLikeGradKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/pad_constant_like_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
pad_constant_like,
ops::PadConstantLikeKernel<paddle::platform::CUDADeviceContext, float>,
ops::PadConstantLikeKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
pad_constant_like_grad,
ops::PadConstantLikeGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::PadConstantLikeGradKernel<paddle::platform::CUDADeviceContext,
double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/padding.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class PadConstantLikeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto in_x = context.Input<framework::Tensor>("X");
auto in_y = context.Input<framework::Tensor>("Y");
auto* out = context.Output<framework::Tensor>("Out");
if (in_x->dims() == in_y->dims()) {
// TensorCopy(in_y, context.GetPlace(), context, out);
out->ShareDataWith(*in_y);
return;
}
T pad_value = context.Attr<T>("pad_value");
out->mutable_data<T>(context.GetPlace());
int rank = context.Input<framework::Tensor>("X")->dims().size();
std::vector<int> pads(rank * 2, 0);
for (int j = 0; j < rank; ++j) {
pads[j * 2] = 0;
pads[j * 2 + 1] = static_cast<int>(in_x->dims()[j] - in_y->dims()[j]);
}
math::PaddingFunctor<DeviceContext, T>(rank, context, pads, pad_value,
*in_y, out);
}
};
template <typename DeviceContext, typename T>
class PadConstantLikeGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto in_y = context.Input<framework::Tensor>("Y");
auto in_dout =
context.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* d_y = context.Output<framework::Tensor>(framework::GradVarName("Y"));
if (d_y == nullptr) {
return;
}
if (in_dout->dims() == in_y->dims()) {
// TensorCopy(in_dout, context.GetPlace(), context, d_y);
d_y->ShareDataWith(*in_dout);
return;
}
d_y->mutable_data<T>(context.GetPlace());
int rank = in_dout->dims().size();
std::vector<int> pads(static_cast<size_t>(rank) * 2, 0);
for (int j = 0; j < rank; ++j) {
pads[j * 2] = 0;
pads[j * 2 + 1] = static_cast<int>(in_dout->dims()[j] - in_y->dims()[j]);
}
math::PaddingGradFunctor<DeviceContext, T>(rank, context, pads, *in_dout,
d_y);
}
};
} // namespace operators
} // namespace paddle
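PadConstantLikeKernel only pads at the high end of each axis: pads[2*j] = 0 and pads[2*j+1] = x_dims[j] - y_dims[j]. For case1 from the op comment above (X.shape = (4, 2), Y.shape = (2, 2)), a standalone check (plain C++, reusing the shapes from that example) gives pads = {0, 2, 0, 0}:

#include <cstdio>
#include <vector>

int main() {
  // shapes taken from case1 of the PadConstantLikeOp comment
  std::vector<int> x_dims = {4, 2};
  std::vector<int> y_dims = {2, 2};
  std::vector<int> pads(x_dims.size() * 2, 0);
  for (size_t j = 0; j < x_dims.size(); ++j) {
    pads[j * 2] = 0;                          // nothing padded before
    pads[j * 2 + 1] = x_dims[j] - y_dims[j];  // pad after, up to X's extent
  }
  std::printf("pads = {%d, %d, %d, %d}\n", pads[0], pads[1], pads[2], pads[3]);
  return 0;
}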
...@@ -18,117 +18,44 @@ limitations under the License. */ ...@@ -18,117 +18,44 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/padding.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
template <typename T, size_t D, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
template <typename DeviceContext, typename T, size_t D>
void PadFunction(const framework::ExecutionContext& context) {
auto pads = context.Attr<std::vector<int>>("paddings");
Eigen::array<std::pair<int, int>, D> paddings;
for (size_t i = 0; i < paddings.size(); ++i) {
paddings[i].first = pads[i * 2];
paddings[i].second = pads[i * 2 + 1];
}
T pad_value = context.Attr<T>("pad_value");
auto* x = context.Input<Tensor>("X");
auto* out = context.Output<Tensor>("Out");
out->mutable_data<T>(context.GetPlace());
auto x_tensor = EigenTensor<T, D>::From(*x);
auto out_tensor = EigenTensor<T, D>::From(*out);
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
out_tensor.device(place) = x_tensor.pad(paddings, pad_value);
}
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class PadKernel : public framework::OpKernel<T> { class PadKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
int rank = context.Input<Tensor>("X")->dims().size(); auto pads = context.Attr<std::vector<int>>("paddings");
switch (rank) { T pad_value = context.Attr<T>("pad_value");
case 1: auto* x = context.Input<Tensor>("X");
PadFunction<DeviceContext, T, 1>(context); auto* out = context.Output<Tensor>("Out");
break; out->mutable_data<T>(context.GetPlace());
case 2:
PadFunction<DeviceContext, T, 2>(context); int rank = x->dims().size();
break; math::PaddingFunctor<DeviceContext, T>(rank, context, pads, pad_value, *x,
case 3: out);
PadFunction<DeviceContext, T, 3>(context);
break;
case 4:
PadFunction<DeviceContext, T, 4>(context);
break;
case 5:
PadFunction<DeviceContext, T, 5>(context);
break;
case 6:
PadFunction<DeviceContext, T, 6>(context);
break;
default:
PADDLE_THROW(
"PadOp only support tensors with no more than 6 dimensions.");
}
} }
}; };
template <typename DeviceContext, typename T, size_t D>
void PadGradFunction(const framework::ExecutionContext& context) {
auto pads = context.Attr<std::vector<int>>("paddings");
Eigen::array<std::pair<int, int>, D> paddings;
for (size_t i = 0; i < paddings.size(); ++i) {
paddings[i].first = -pads[i * 2];
paddings[i].second = -pads[i * 2 + 1];
}
auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
if (d_x != nullptr) {
d_x->mutable_data<T>(context.GetPlace());
auto d_x_tensor = EigenTensor<T, D>::From(*d_x);
auto d_out_tensor = EigenTensor<T, D>::From(*d_out);
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
d_x_tensor.device(place) = d_out_tensor.pad(paddings, 0);
}
}
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class PadGradKernel : public framework::OpKernel<T> { class PadGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
size_t rank = auto pads = context.Attr<std::vector<int>>("paddings");
context.Input<Tensor>(framework::GradVarName("Out"))->dims().size(); auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
switch (rank) { auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
case 1: if (d_x == nullptr) {
PadGradFunction<DeviceContext, T, 1>(context); return;
break;
case 2:
PadGradFunction<DeviceContext, T, 2>(context);
break;
case 3:
PadGradFunction<DeviceContext, T, 3>(context);
break;
case 4:
PadGradFunction<DeviceContext, T, 4>(context);
break;
case 5:
PadGradFunction<DeviceContext, T, 5>(context);
break;
case 6:
PadGradFunction<DeviceContext, T, 6>(context);
break;
default:
PADDLE_THROW(
"PadOp only support tensors with no more than 6 dimensions.");
} }
d_x->mutable_data<T>(context.GetPlace());
int rank = d_out->dims().size();
math::PaddingGradFunctor<DeviceContext, T>(rank, context, pads, *d_out,
d_x);
} }
}; };
......
...@@ -355,6 +355,7 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker { ...@@ -355,6 +355,7 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker {
grad->SetInput(framework::GradVarName(output_param), og_names); grad->SetInput(framework::GradVarName(output_param), og_names);
} }
} }
grad->SetInput("Communicator", {"nccl_com__do_not_change_"});
grad->SetAttrMap(this->Attrs()); grad->SetAttrMap(this->Attrs());
grad->SetBlockAttr(kParallelBlock, grad_block_[0]); grad->SetBlockAttr(kParallelBlock, grad_block_[0]);
......
...@@ -13,14 +13,12 @@ ...@@ -13,14 +13,12 @@
limitations under the License. */ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <ctime>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/framework/variable.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using framework::GradVarName;
#define CLOG std::cout #define CLOG std::cout
...@@ -35,7 +33,7 @@ struct Formater { ...@@ -35,7 +33,7 @@ struct Formater {
std::type_index dtype{typeid(const char)}; std::type_index dtype{typeid(const char)};
framework::LoD lod; framework::LoD lod;
int summarize; int summarize;
void* data{nullptr}; void *data{nullptr};
void operator()(size_t size) { void operator()(size_t size) {
PrintMessage(); PrintMessage();
...@@ -101,7 +99,7 @@ struct Formater { ...@@ -101,7 +99,7 @@ struct Formater {
template <typename T> template <typename T>
void Display(size_t size) { void Display(size_t size) {
auto* d = reinterpret_cast<T*>(data); auto *d = reinterpret_cast<T *>(data);
CLOG << "\tdata: "; CLOG << "\tdata: ";
if (summarize != -1) { if (summarize != -1) {
summarize = std::min(size, (size_t)summarize); summarize = std::min(size, (size_t)summarize);
...@@ -120,51 +118,36 @@ struct Formater { ...@@ -120,51 +118,36 @@ struct Formater {
// TODO(ChunweiYan) there should be some other printers for TensorArray // TODO(ChunweiYan) there should be some other printers for TensorArray
class TensorPrintOp : public framework::OperatorBase { class TensorPrintOp : public framework::OperatorBase {
public: public:
TensorPrintOp(const std::string& type, TensorPrintOp(const std::string &type,
const framework::VariableNameMap& inputs, const framework::VariableNameMap &inputs,
const framework::VariableNameMap& outputs, const framework::VariableNameMap &outputs,
const framework::AttributeMap& attrs) const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {} : OperatorBase(type, inputs, outputs, attrs) {}
TensorPrintOp(const TensorPrintOp& o) TensorPrintOp(const TensorPrintOp &o)
: framework::OperatorBase( : framework::OperatorBase(
static_cast<const framework::OperatorBase&>(o)) { static_cast<const framework::OperatorBase &>(o)) {
PADDLE_THROW("Not implemented."); PADDLE_THROW("Not implemented.");
} }
private: private:
void RunImpl(const framework::Scope& scope, void RunImpl(const framework::Scope &scope,
const platform::Place& place) const override { const platform::Place &place) const override {
const framework::Variable* in_var_ptr = nullptr; const framework::Variable *in_var_ptr = nullptr;
std::string phase(kForward);
std::string printed_var_name = ""; std::string printed_var_name = "";
auto& inputs = Inputs(); in_var_ptr = scope.FindVar(Input("In"));
if (inputs.find("In") != inputs.end() && !Inputs("In").empty()) { printed_var_name = Inputs("In").front();
in_var_ptr = scope.FindVar(Input("In"));
printed_var_name = Inputs("In").front();
} else if (inputs.find("In@GRAD") != inputs.end() &&
!Inputs("In@GRAD").empty()) {
in_var_ptr = scope.FindVar(Input("In@GRAD"));
printed_var_name = Inputs("In@GRAD").front();
phase = std::string(kBackward);
} else {
PADDLE_THROW("Unknown phase, should be forward or backward.");
}
PADDLE_ENFORCE_NOT_NULL(in_var_ptr); PADDLE_ENFORCE_NOT_NULL(in_var_ptr);
auto& in_tensor = in_var_ptr->Get<framework::LoDTensor>(); auto &in_tensor = in_var_ptr->Get<framework::LoDTensor>();
auto* out_var_ptr = scope.FindVar(Output("Out"));
auto& out_tensor = *out_var_ptr->GetMutable<framework::LoDTensor>();
// Just copy data from input tensor to output tensor
// output tensor share same memory with input tensor
out_tensor.ShareDataWith(in_tensor);
out_tensor.set_lod(in_tensor.lod());
std::string print_phase = Attr<std::string>("print_phase"); std::string print_phase = Attr<std::string>("print_phase");
if (print_phase != phase && print_phase != std::string(kBoth)) { bool is_forward = Attr<bool>("is_forward");
if ((is_forward && print_phase == kBackward) ||
(!is_forward && print_phase == kForward)) {
return; return;
} }
...@@ -192,7 +175,7 @@ class TensorPrintOp : public framework::OperatorBase { ...@@ -192,7 +175,7 @@ class TensorPrintOp : public framework::OperatorBase {
formater.dtype = printed_tensor.type(); formater.dtype = printed_tensor.type();
} }
if (Attr<bool>("print_tensor_shape")) { if (Attr<bool>("print_tensor_shape")) {
auto& dims = printed_tensor.dims(); auto &dims = printed_tensor.dims();
formater.dims.resize(dims.size()); formater.dims.resize(dims.size());
for (int i = 0; i < dims.size(); ++i) formater.dims[i] = dims[i]; for (int i = 0; i < dims.size(); ++i) formater.dims[i] = dims[i];
} }
...@@ -200,7 +183,7 @@ class TensorPrintOp : public framework::OperatorBase { ...@@ -200,7 +183,7 @@ class TensorPrintOp : public framework::OperatorBase {
formater.lod = printed_tensor.lod(); formater.lod = printed_tensor.lod();
} }
formater.summarize = Attr<int>("summarize"); formater.summarize = Attr<int>("summarize");
formater.data = reinterpret_cast<void*>(printed_tensor.data<void>()); formater.data = reinterpret_cast<void *>(printed_tensor.data<void>());
formater(printed_tensor.numel()); formater(printed_tensor.numel());
} }
...@@ -219,14 +202,14 @@ class PrintOpProtoAndCheckMaker : public framework::OpProtoAndCheckerMaker { ...@@ -219,14 +202,14 @@ class PrintOpProtoAndCheckMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<bool>("print_tensor_type", "Whether to print the tensor's dtype."); AddAttr<bool>("print_tensor_type", "Whether to print the tensor's dtype.");
AddAttr<bool>("print_tensor_shape", "Whether to print the tensor's shape."); AddAttr<bool>("print_tensor_shape", "Whether to print the tensor's shape.");
AddAttr<bool>("print_tensor_lod", "Whether to print the tensor's lod."); AddAttr<bool>("print_tensor_lod", "Whether to print the tensor's lod.");
AddAttr<std::string>( AddAttr<std::string>("print_phase",
"print_phase", "(string, default 'FORWARD') Which phase to display "
"(string, default 'BOTH') Which phase to display including 'FORWARD' " "including 'FORWARD' "
"'BACKWARD' and 'BOTH'.") "'BACKWARD' and 'BOTH'.")
.SetDefault(std::string(kBoth)) .SetDefault(std::string(kBoth))
.InEnum({std::string(kForward), std::string(kBackward), .InEnum({std::string(kForward), std::string(kBackward),
std::string(kBoth)}); std::string(kBoth)});
AddOutput("Out", "Output tensor with same data as input tensor."); AddAttr<bool>("is_forward", "Whether is forward or not").SetDefault(true);
AddComment(R"DOC( AddComment(R"DOC(
Creates a print op that will print when a tensor is accessed. Creates a print op that will print when a tensor is accessed.
...@@ -238,40 +221,21 @@ tensor `t`.)DOC"); ...@@ -238,40 +221,21 @@ tensor `t`.)DOC");
class InferShapeForward : public framework::InferShapeBase { class InferShapeForward : public framework::InferShapeBase {
public: public:
void operator()(framework::InferShapeContext* context) const override { void operator()(framework::InferShapeContext *context) const override {
PADDLE_ENFORCE(context->HasInput("In"), "Input(In) should not be null."); PADDLE_ENFORCE(context->HasInput("In"), "Input(In) should not be null.");
context->ShareLoD("In", /*->*/ "Out");
context->SetOutputDim("Out", context->GetInputDim("In"));
}
};
class InferShapeBackward : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* context) const override {
PADDLE_ENFORCE(context->HasInput("In@GRAD"),
"Input(In@GRAD) should not be null.");
context->ShareLoD("In@GRAD", /*->*/ "Out");
context->SetOutputDim("Out", context->GetInputDim("In@GRAD"));
} }
}; };
class InferVarType : public framework::VarTypeInference { class PrintOpGradientMaker : public framework::SingleGradOpDescMaker {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {}
};
class PrintOpProtoAndCheckGradOpMaker
: public framework::SingleGradOpDescMaker {
public: public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDesc> Apply() const override { std::unique_ptr<framework::OpDesc> Apply() const override {
auto* op_desc_ptr = new framework::OpDesc(); auto *op_desc_ptr = new framework::OpDesc();
op_desc_ptr->SetType("print_grad"); op_desc_ptr->SetType("print");
op_desc_ptr->SetInput("In@GRAD", OutputGrad("Out")); op_desc_ptr->SetInput("In", InputGrad("In"));
op_desc_ptr->SetOutput("Out", InputGrad("In"));
op_desc_ptr->SetAttrMap(Attrs()); op_desc_ptr->SetAttrMap(Attrs());
op_desc_ptr->SetAttr("is_forward", false);
return std::unique_ptr<framework::OpDesc>(op_desc_ptr); return std::unique_ptr<framework::OpDesc>(op_desc_ptr);
} }
}; };
...@@ -282,6 +246,4 @@ class PrintOpProtoAndCheckGradOpMaker ...@@ -282,6 +246,4 @@ class PrintOpProtoAndCheckGradOpMaker
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(print, ops::TensorPrintOp, ops::PrintOpProtoAndCheckMaker, REGISTER_OPERATOR(print, ops::TensorPrintOp, ops::PrintOpProtoAndCheckMaker,
ops::PrintOpProtoAndCheckGradOpMaker, ops::InferShapeForward, ops::PrintOpGradientMaker, ops::InferShapeForward);
ops::InferVarType);
REGISTER_OPERATOR(print_grad, ops::TensorPrintOp, ops::InferShapeBackward);
...@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/scale_op.h" #include "paddle/fluid/operators/scale_op.h"
#include <string> #include <string>
#include "paddle/fluid/operators/detail/safe_ref.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -52,6 +55,21 @@ $$Out = scale*X$$ ...@@ -52,6 +55,21 @@ $$Out = scale*X$$
} }
}; };
class ScaleOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
auto &in_var_name = op_desc.Input("X").front();
auto &in_var = detail::Ref(block->FindVarRecursive(in_var_name));
auto out_var_name = op_desc.Output("Out").front();
auto *out_var = block->FindVarRecursive(out_var_name);
out_var->SetType(in_var.GetType());
out_var->SetDataType(in_var.GetDataType());
}
};
class ScaleGradMaker : public framework::SingleGradOpDescMaker { class ScaleGradMaker : public framework::SingleGradOpDescMaker {
public: public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
...@@ -71,7 +89,8 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker { ...@@ -71,7 +89,8 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker); REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker,
ops::ScaleOpVarTypeInference);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
scale, ops::ScaleKernel<paddle::platform::CPUDeviceContext, float>, scale, ops::ScaleKernel<paddle::platform::CPUDeviceContext, float>,
ops::ScaleKernel<paddle::platform::CPUDeviceContext, double>, ops::ScaleKernel<paddle::platform::CPUDeviceContext, double>,
......
...@@ -22,17 +22,29 @@ namespace operators { ...@@ -22,17 +22,29 @@ namespace operators {
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class ScaleKernel : public framework::OpKernel<T> { class ScaleKernel : public framework::OpKernel<T> {
public: public:
virtual void Compute(const framework::ExecutionContext& context) const { virtual void Compute(const framework::ExecutionContext& ctx) const {
auto* tensor = context.Output<framework::Tensor>("Out"); auto* in_var = ctx.InputVar("X");
auto* in = context.Input<framework::Tensor>("X"); auto* in = ctx.Input<framework::Tensor>("X");
tensor->mutable_data<T>(in->place());
auto scale = static_cast<T>(context.Attr<float>("scale")); auto* out_var = ctx.OutputVar("Out");
auto* out = ctx.Output<framework::Tensor>("Out");
out->mutable_data<T>(in->place());
auto eigen_out = framework::EigenVector<T>::Flatten(*tensor); PADDLE_ENFORCE_EQ(in->dims(), out->dims(),
"in and out should have the same dim");
auto scale = static_cast<T>(ctx.Attr<float>("scale"));
if (in_var->IsType<framework::SelectedRows>() && in_var != out_var) {
auto& in_slr = in_var->Get<framework::SelectedRows>();
auto* out_slr = out_var->GetMutable<framework::SelectedRows>();
out_slr->set_rows(in_slr.rows());
out_slr->set_height(in_slr.height());
}
auto eigen_out = framework::EigenVector<T>::Flatten(*out);
auto eigen_in = framework::EigenVector<T>::Flatten(*in); auto eigen_in = framework::EigenVector<T>::Flatten(*in);
auto& dev = auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
*context.template device_context<DeviceContext>().eigen_device();
eigen_out.device(dev) = scale * eigen_in; eigen_out.device(dev) = scale * eigen_in;
} }
}; };
......
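A note on the scale kernel change above: the kernel now inspects the input variable's type, and when it is a SelectedRows container (and the output is a different variable) it copies the rows index and the height before doing the element-wise scaling. A minimal NumPy sketch of that behavior, using a made-up SelectedRows stand-in rather than Paddle's real class:

    import numpy as np

    class SelectedRowsLike:  # hypothetical stand-in for framework::SelectedRows
        def __init__(self, rows, height, value):
            self.rows = rows      # indices of the stored rows
            self.height = height  # logical row count of the dense form
            self.value = value    # actual data, shape [len(rows), width]

    def scale_like_kernel(x, scale):
        # mimic ScaleKernel: copy rows/height metadata, then scale the values
        return SelectedRowsLike(list(x.rows), x.height, scale * x.value)

    x = SelectedRowsLike(rows=[0, 4], height=6,
                         value=np.array([[1., 2.], [3., 4.]]))
    y = scale_like_kernel(x, scale=0.5)
    print(y.rows, y.height)  # [0, 4] 6
    print(y.value)           # [[0.5 1. ] [1.5 2. ]]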
...@@ -56,6 +56,10 @@ class SendBarrierOp : public framework::OperatorBase { ...@@ -56,6 +56,10 @@ class SendBarrierOp : public framework::OperatorBase {
class SendBarrierOpMaker : public framework::OpProtoAndCheckerMaker { class SendBarrierOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() { void Make() {
AddInput("X", "(Any) Dummy inputs, used for control dependency")
.AsDuplicable();
AddOutput("Out", "(Any) Dummy outputs, used for control dependency")
.AsDuplicable();
AddComment(R"DOC( AddComment(R"DOC(
SendBarrier operator SendBarrier operator
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sequence_pad_op.h"
namespace paddle {
namespace operators {
class SequencePadOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of SequencePadOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("PadValue"),
"Input(PadValue) of SequencePadOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of SequencePadOp should not be null.");
auto x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_GE(x_dims.size(), 2,
"The rank of Input(x) can't be less than 2.");
auto time_step_dims = framework::slice_ddim(x_dims, 1, x_dims.size());
auto pad_value_dims = ctx->GetInputDim("PadValue");
PADDLE_ENFORCE(pad_value_dims == framework::make_ddim({1}) ||
pad_value_dims == time_step_dims,
"The Input(PadValue) must be a scalar or a tensor whose "
"shape equals to time steps in sequences");
int out_dim_0 = -1;
int out_dim_1 = -1;
if (ctx->IsRuntime()) {
// run time
framework::Variable* x_var =
boost::get<framework::Variable*>(ctx->GetInputVarPtrs("X")[0]);
const auto& x_lod = x_var->Get<LoDTensor>().lod();
PADDLE_ENFORCE(!x_lod.empty(), "The Input(X) must hold lod info.");
const auto& x_lod_0 = x_lod[0];
PADDLE_ENFORCE_GE(x_lod_0.size(), 2,
"The Input(X)'s lod info is corrupted.");
PADDLE_ENFORCE_EQ(
x_dims[0], static_cast<int64_t>(x_lod_0.back()),
"The Input(X)'s lod info mismatches the actual tensor shape.");
int seq_num = x_lod_0.size() - 1;
int max_seq_len = math::MaximumSequenceLength(x_lod_0);
int padded_length = ctx->Attrs().Get<int>("padded_length");
if (padded_length == -1) {
padded_length = max_seq_len;
}
PADDLE_ENFORCE_GE(padded_length, max_seq_len,
"The Attr(padded_length) must be -1 or an int greater "
"than the length of the longest original sequence.");
out_dim_0 = seq_num;
out_dim_1 = padded_length;
} else {
// compile time
framework::VarDesc* x_desc =
boost::get<framework::VarDesc*>(ctx->GetInputVarPtrs("X")[0]);
PADDLE_ENFORCE_GE(x_desc->GetLoDLevel(), 1);
}
std::vector<int> out_dims_vec{out_dim_0, out_dim_1};
auto time_step_dims_vec = framework::vectorize2int(time_step_dims);
out_dims_vec.insert(out_dims_vec.end(), time_step_dims_vec.begin(),
time_step_dims_vec.end());
ctx->SetOutputDim("Out", framework::make_ddim(out_dims_vec));
}
};
class SequencePadOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(LoDTensor, default LoDTensor<float>) Input variable which "
"should contain lod information.");
AddInput("PadValue",
"(LoDTensor), this Tensor holds values that will be fill into "
"padded steps. It can be a scalar or a tensor whose shape equals "
"to time steps in sequences. If it's a scalar, it will be "
"automatically broadcasted to the shape of time step.");
AddOutput(
"Out",
"(LoDTensor) The output vairable, which contains padded sequences.");
AddAttr<int>(
"padded_length",
"The length of padded sequences. It can be setted to -1 or "
"any positive int. When it is -1, all sequences will be padded up to "
"the length of the longest one among them; when it a certain positive "
"value, it must be greater than the length of the longest original "
"sequence.")
.SetDefault(-1);
AddComment(R"DOC(
Sequence Pad Operator
This operator pads sequences in the same batch to a consistent length.
The length is specified by attribute 'padded_length'. New elements,
whose values are specified by input 'PadValue', will be appended to
the end of each sequence, to make their final lengths consistent.
Following are cases to better explain how this works:
Case 1:
Given a 1-level LoDTensor input(X):
X.lod = [[0, 2, 5]]
X.data = [a, b, c, d, e]
and Input(PadValue):
PadValue.data = [0]
and attribute 'padded_length' = 4,
then we get LoDTensor:
Out.data = [[a, b, 0, 0],
[c, d, e, 0]]
Case 2:
Given a 1-level LoDTensor input(X):
X.lod = [[0, 2, 5]]
X.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]]
and Input(PadValue):
PadValue.data = [0]
and attribute 'padded_length' = -1, which means using the length
of the longest input sequence (3 in this case),
then we get LoDTensor:
Out.data = [[[a1, a2], [b1, b2], [0, 0]],
[[c1, c2], [d1, d2], [e1, e2]]]
Case 3:
Given a 1-level LoDTensor input(X):
X.lod = [[0, 2, 5]]
X.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]]
and Input(PadValue):
PadValue.data = [p1, p2]
and attribute 'padded_length' = -1, which means using the length
of the longest input sequence (3 in this case),
then we get LoDTensor:
Out.data = [[[a1, a2], [b1, b2], [p1, p2]],
[[c1, c2], [d1, d2], [e1, e2]]]
)DOC");
}
};
class SequencePadGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of SequencePadGradOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) of SequencePadGradOp should not be null.");
if (ctx->HasOutput(framework::GradVarName("X"))) {
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(sequence_pad, ops::SequencePadOp, ops::SequencePadOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(sequence_pad_grad, ops::SequencePadGradOp);
REGISTER_OP_CPU_KERNEL(
sequence_pad,
ops::SequencePadOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::SequencePadOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::SequencePadOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::SequencePadOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
sequence_pad_grad,
ops::SequencePadGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::SequencePadGradOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::SequencePadGradOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::SequencePadGradOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sequence_pad_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
sequence_pad,
ops::SequencePadOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::SequencePadOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::SequencePadOpKernel<paddle::platform::CUDADeviceContext, int>,
ops::SequencePadOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
sequence_pad_grad,
ops::SequencePadGradOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::SequencePadGradOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::SequencePadGradOpKernel<paddle::platform::CUDADeviceContext, int>,
ops::SequencePadGradOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence_padding.h"
namespace paddle {
namespace operators {
using LoDTensor = framework::LoDTensor;
using LoD = framework::LoD;
template <typename DeviceContext, typename T>
class SequencePadOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const auto* x = ctx.Input<LoDTensor>("X");
auto* out = ctx.Output<LoDTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
const auto* pad_value = ctx.Input<LoDTensor>("PadValue");
int padded_length = ctx.Attr<int>("padded_length");
math::PaddingLoDTensorFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), *x, out, *pad_value,
padded_length, 0, false, math::kBatchLengthWidth);
}
};
template <typename DeviceContext, typename T>
class SequencePadGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* d_x = ctx.Output<LoDTensor>(framework::GradVarName("X"));
if (d_x) {
const auto* d_out = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
d_x->mutable_data<T>(ctx.GetPlace());
int padded_length = ctx.Attr<int>("padded_length");
math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), *d_out, d_x,
padded_length, 0, false, math::kBatchLengthWidth);
}
}
};
} // namespace operators
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/unstack_op.h"
namespace plat = paddle::platform;
namespace ops = paddle::operators;
USE_OP(stack);
REGISTER_OPERATOR(unstack, ops::UnStackOp, ops::UnStackOpMaker,
ops::UnStackOpInferShape, ops::UnStackGradOpDescMaker);
REGISTER_OPERATOR(unstack_grad, ops::UnStackGradOp,
ops::UnStackOpGradInferShape);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
class UnStackOpInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must exist.");
int axis = ctx->Attrs().Get<int>("axis");
int num = ctx->Attrs().Get<int>("num");
auto x_dim = ctx->GetInputDim("X");
int rank = x_dim.size();
PADDLE_ENFORCE(axis >= -rank && axis < rank,
"Attr(axis) must be inside [-rank, rank), where rank = %d",
rank);
if (axis < 0) axis += rank;
PADDLE_ENFORCE_EQ(ctx->Outputs("Y").size(), static_cast<size_t>(num),
"Number of Outputs(Y) is wrong");
if (x_dim[axis] > 0) {
PADDLE_ENFORCE_EQ(num, x_dim[axis], "Number of Outputs(Y) is wrong");
}
auto vec = framework::vectorize2int(x_dim);
vec.erase(vec.begin() + axis);
ctx->SetOutputsDim("Y", std::vector<framework::DDim>( // NOLINT
x_dim[axis], framework::make_ddim(vec)));
}
};
class UnStackOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "The input of unstack op.");
AddOutput("Y", "The output of unstack op.").AsDuplicable();
AddAttr<int>("axis", "The axis along which Input(X) should be unstacked.")
.SetDefault(0);
AddAttr<int>("num", "The number of outputs(Y).").GreaterThan(0);
AddComment(R"DOC(
UnStack Operator.
UnStack Input(X) into several tensors along Attr(axis).
)DOC");
}
};
class UnStackOp : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto stack_grad_op = framework::OpRegistry::CreateOp(
"stack_grad", {{framework::GradVarName("Y"), {Input("X")}}},
{{framework::GradVarName("X"), Outputs("Y")}}, Attrs());
stack_grad_op->Run(scope, place);
}
};
class UnStackOpGradInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE_GT(ctx->Inputs(framework::GradVarName("Y")).size(), 0,
"Number of Inputs(Y@Grad) must be larger than 0");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
"Output(X@Grad) must exist.");
auto input_dims = ctx->GetInputsDim(framework::GradVarName("Y"));
for (size_t i = 1; i < input_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(input_dims[i], input_dims[0],
"Dims of all Inputs(Y@Grad) must be the same");
}
int axis = ctx->Attrs().Get<int>("axis");
int rank = input_dims[0].size();
PADDLE_ENFORCE(
axis >= -(rank + 1) && axis < rank + 1,
"Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d", rank);
if (axis < 0) axis += (rank + 1);
auto vec = framework::vectorize2int(input_dims[0]);
vec.insert(vec.begin() + axis, input_dims.size());
ctx->SetOutputDim(framework::GradVarName("X"), framework::make_ddim(vec));
}
};
class UnStackGradOpDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("unstack_grad");
op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetAttrMap(Attrs());
return op;
}
};
class UnStackGradOp : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto stack_op = framework::OpRegistry::CreateOp(
"stack", {{"X", Inputs(framework::GradVarName("Y"))}},
{{"Y", {Output(framework::GradVarName("X"))}}}, Attrs());
stack_op->Run(scope, place);
}
};
} // namespace operators
} // namespace paddle
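A note on the design above: UnStackOp has no kernel of its own; its RunImpl constructs and runs a stack_grad op, and UnStackGradOp runs a stack op, because unstacking along an axis is exactly the gradient of stacking along that axis. A NumPy sketch of that equivalence (illustrative only, not Paddle code):

    import numpy as np

    x = np.arange(24).reshape(2, 3, 4)
    axis, num = 1, 3

    # "unstack" = split along axis and drop that axis, which is what stack_grad does
    ys = [np.squeeze(part, axis=axis) for part in np.split(x, num, axis=axis)]
    print([y.shape for y in ys])  # [(2, 4), (2, 4), (2, 4)]

    # stacking the pieces back along the same axis recovers x,
    # which is why unstack's backward pass can simply run the stack op
    assert np.array_equal(np.stack(ys, axis=axis), x)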
...@@ -153,17 +153,29 @@ class WarpCTCKernel : public framework::OpKernel<T> { ...@@ -153,17 +153,29 @@ class WarpCTCKernel : public framework::OpKernel<T> {
framework::make_ddim({static_cast<int64_t>(num_sequences), 1}); framework::make_ddim({static_cast<int64_t>(num_sequences), 1});
// warpctc needs sequences data stored in transposed padding format // warpctc needs sequences data stored in transposed padding format
Tensor warpctc_logits; LoDTensor warpctc_logits;
const size_t max_sequence_length = const size_t max_sequence_length =
math::MaximumSequenceLength(logits_lod, level); math::MaximumSequenceLength(logits_lod[level]);
auto warpctc_logits_dims = auto warpctc_logits_dims =
framework::make_ddim({static_cast<int64_t>(max_sequence_length), framework::make_ddim({static_cast<int64_t>(max_sequence_length),
static_cast<int64_t>(num_sequences), static_cast<int64_t>(num_sequences),
static_cast<int64_t>(sequence_width)}); static_cast<int64_t>(sequence_width)});
warpctc_logits.mutable_data<T>(warpctc_logits_dims, ctx.GetPlace()); warpctc_logits.mutable_data<T>(warpctc_logits_dims, ctx.GetPlace());
LoDTensor cpu_pad_value;
T* pad_value_data =
cpu_pad_value.mutable_data<T>({1}, platform::CPUPlace());
*pad_value_data = static_cast<T>(0);
LoDTensor pad_value;
if (platform::is_cpu_place(ctx.GetPlace())) {
pad_value = cpu_pad_value;
} else {
TensorCopySync(cpu_pad_value, ctx.GetPlace(), &pad_value);
}
math::PaddingLoDTensorFunctor<DeviceContext, T>()( math::PaddingLoDTensorFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), *logits, &warpctc_logits, ctx.template device_context<DeviceContext>(), *logits, &warpctc_logits,
false); pad_value, -1, 0, false /* norm_by_times */, math::kLengthBatchWidth);
const T* warpctc_logits_data = warpctc_logits.data<T>(); const T* warpctc_logits_data = warpctc_logits.data<T>();
std::vector<int> warpctc_label_lengths(num_sequences); std::vector<int> warpctc_label_lengths(num_sequences);
...@@ -209,15 +221,15 @@ template <typename DeviceContext, typename T> ...@@ -209,15 +221,15 @@ template <typename DeviceContext, typename T>
class WarpCTCGradKernel : public framework::OpKernel<T> { class WarpCTCGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto* warpctc_grad = ctx.Input<Tensor>("WarpCTCGrad"); auto* warpctc_grad = ctx.Input<LoDTensor>("WarpCTCGrad");
auto* logits_grad = ctx.Output<LoDTensor>(framework::GradVarName("Logits")); auto* logits_grad = ctx.Output<LoDTensor>(framework::GradVarName("Logits"));
const Tensor* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss")); const Tensor* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
logits_grad->mutable_data<T>(ctx.GetPlace()); logits_grad->mutable_data<T>(ctx.GetPlace());
bool norm_by_times = ctx.Attr<bool>("norm_by_times"); bool norm_by_times = ctx.Attr<bool>("norm_by_times");
math::UnpaddingLoDTensorFunctor<DeviceContext, T>()( math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), logits_grad, ctx.template device_context<DeviceContext>(), *warpctc_grad,
*warpctc_grad, norm_by_times); logits_grad, -1, 0, norm_by_times, math::kLengthBatchWidth);
const T* loss_grad_data = loss_grad->data<T>(); const T* loss_grad_data = loss_grad->data<T>();
math::ScaleLoDTensorFunctor<DeviceContext, T>()( math::ScaleLoDTensorFunctor<DeviceContext, T>()(
......
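For the warpctc hunk above: the logits are re-padded into the layout warp-ctc expects, math::kLengthBatchWidth, i.e. a time-major tensor of shape [max_sequence_length, num_sequences, sequence_width], and the zero pad value is first built on the CPU and then copied to the target place with TensorCopySync. A rough NumPy picture of that layout (illustrative only, not the functor itself):

    import numpy as np

    lod0 = [0, 2, 5]                        # two sequences, lengths 2 and 3
    width = 4                               # sequence_width (number of classes)
    flat = np.random.rand(lod0[-1], width)  # LoDTensor data, shape [5, 4]

    num_seq = len(lod0) - 1
    max_len = max(lod0[i + 1] - lod0[i] for i in range(num_seq))

    # kLengthBatchWidth: [max_len, num_seq, width], padded with zeros
    warpctc_logits = np.zeros((max_len, num_seq, width), dtype=flat.dtype)
    for i in range(num_seq):
        seq = flat[lod0[i]:lod0[i + 1]]
        warpctc_logits[:len(seq), i, :] = seq

    print(warpctc_logits.shape)  # (3, 2, 4)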
if (NOT WIN32)
proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto) proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto)
py_proto_compile(profiler_py_proto SRCS profiler.proto) py_proto_compile(profiler_py_proto SRCS profiler.proto)
...@@ -10,6 +11,7 @@ add_custom_command(TARGET profiler_py_proto POST_BUILD ...@@ -10,6 +11,7 @@ add_custom_command(TARGET profiler_py_proto POST_BUILD
COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler." COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif(NOT WIN32)
if(WITH_GPU) if(WITH_GPU)
nv_library(enforce SRCS enforce.cc) nv_library(enforce SRCS enforce.cc)
...@@ -58,9 +60,12 @@ cc_test(init_test SRCS init_test.cc DEPS device_context) ...@@ -58,9 +60,12 @@ cc_test(init_test SRCS init_test.cc DEPS device_context)
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context) nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
if (NOT WIN32)
cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
endif(NOT WIN32)
nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor) cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)
......
...@@ -22,9 +22,13 @@ limitations under the License. */ ...@@ -22,9 +22,13 @@ limitations under the License. */
#ifdef __APPLE__ #ifdef __APPLE__
#include <sys/sysctl.h> #include <sys/sysctl.h>
#include <sys/types.h> #include <sys/types.h>
#elif defined(_WIN32)
#define NOMINMAX // msvc max/min macro conflict with std::min/max
#include <windows.h>
#else #else
#include <unistd.h> #include <unistd.h>
#endif #endif // _WIN32
#include <algorithm> #include <algorithm>
#include "gflags/gflags.h" #include "gflags/gflags.h"
...@@ -32,16 +36,20 @@ limitations under the License. */ ...@@ -32,16 +36,20 @@ limitations under the License. */
DEFINE_double(fraction_of_cpu_memory_to_use, 1, DEFINE_double(fraction_of_cpu_memory_to_use, 1,
"Default use 100% of CPU memory for PaddlePaddle," "Default use 100% of CPU memory for PaddlePaddle,"
"reserve the rest for page tables, etc"); "reserve the rest for page tables, etc");
#if !defined(_WIN32)
DEFINE_uint64(initial_cpu_memory_in_mb, DEFINE_uint64(initial_cpu_memory_in_mb,
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
/* Aligned with mozga-intel, MKLDNN need at least 5000 MB /* Aligned with mozga-intel, MKLDNN need at least 5000 MB
* to obtain the best performance*/ * to obtain the best performance*/
5000, 5000ul,
#else #else
500, 500ul,
#endif #endif
"Initial CPU memory for PaddlePaddle, in MD unit."); "Initial CPU memory for PaddlePaddle, in MD unit.");
#else
DEFINE_uint64(initial_cpu_memory_in_mb, 500ul,
"Initial CPU memory for PaddlePaddle, in MD unit.");
#endif // !defined(_WIN32)
DEFINE_double( DEFINE_double(
fraction_of_cuda_pinned_memory_to_use, 0.5, fraction_of_cuda_pinned_memory_to_use, 0.5,
...@@ -60,6 +68,11 @@ inline size_t CpuTotalPhysicalMemory() { ...@@ -60,6 +68,11 @@ inline size_t CpuTotalPhysicalMemory() {
size_t len = sizeof(size); size_t len = sizeof(size);
if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size; if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size;
return 0L; return 0L;
#elif defined(_WIN32)
MEMORYSTATUSEX sMeminfo;
sMeminfo.dwLength = sizeof(sMeminfo);
GlobalMemoryStatusEx(&sMeminfo);
return sMeminfo.ullTotalPhys;
#else #else
int64_t pages = sysconf(_SC_PHYS_PAGES); int64_t pages = sysconf(_SC_PHYS_PAGES);
int64_t page_size = sysconf(_SC_PAGE_SIZE); int64_t page_size = sysconf(_SC_PAGE_SIZE);
......
...@@ -13,7 +13,12 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#if !defined(_WIN32)
#include <sys/time.h> #include <sys/time.h>
#else
#include <windows.h>
#endif // !_WIN32
#include <time.h> #include <time.h>
#include <chrono> // NOLINT #include <chrono> // NOLINT
#include <string> #include <string>
...@@ -27,12 +32,15 @@ namespace platform { ...@@ -27,12 +32,15 @@ namespace platform {
/////////////////////// ///////////////////////
// WARN: Under Development. Don't depend on it yet. // WARN: Under Development. Don't depend on it yet.
////////////////////// //////////////////////
#if !defined(_WIN32)
inline uint64_t PosixInNsec() { inline uint64_t PosixInNsec() {
struct timeval tv; struct timeval tv;
gettimeofday(&tv, nullptr); gettimeofday(&tv, nullptr);
return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec); return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
} }
#else
inline uint64_t PosixInNsec() { return static_cast<uint64_t>(0); }
#endif // !_WIN32
// DeviceTracer performs the following tasks: // DeviceTracer performs the following tasks:
// 1. Register cuda callbacks for various events: kernel, memcpy, etc. // 1. Register cuda callbacks for various events: kernel, memcpy, etc.
......
...@@ -16,7 +16,9 @@ if (CUPTI_FOUND) ...@@ -16,7 +16,9 @@ if (CUPTI_FOUND)
list(APPEND CUDA_SRCS cupti.cc) list(APPEND CUDA_SRCS cupti.cc)
endif(CUPTI_FOUND) endif(CUPTI_FOUND)
nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
if (NOT WIN32)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
endif(NOT WIN32)
if (WITH_MKLML) if (WITH_MKLML)
cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
endif() endif()
......
...@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include <dlfcn.h>
#include <memory> #include <memory>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include <string> #include <string>
...@@ -23,6 +21,7 @@ limitations under the License. */ ...@@ -23,6 +21,7 @@ limitations under the License. */
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/platform/dynload/cupti_lib_path.h" #include "paddle/fluid/platform/dynload/cupti_lib_path.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/port.h"
DEFINE_string(cudnn_dir, "", DEFINE_string(cudnn_dir, "",
"Specify path for loading libcudnn.so. For instance, " "Specify path for loading libcudnn.so. For instance, "
......
...@@ -18,6 +18,11 @@ limitations under the License. */ ...@@ -18,6 +18,11 @@ limitations under the License. */
#include <cxxabi.h> // for __cxa_demangle #include <cxxabi.h> // for __cxa_demangle
#endif // __GNUC__ #endif // __GNUC__
#if defined(_WIN32)
#define NOMINMAX // msvc max/min macro conflict with std::min/max
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#endif
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <cublas_v2.h> #include <cublas_v2.h>
#include <cudnn.h> #include <cudnn.h>
...@@ -117,7 +122,12 @@ struct EOFException : public std::exception { ...@@ -117,7 +122,12 @@ struct EOFException : public std::exception {
// always forces branch prediction of true. // always forces branch prediction of true.
// This generates faster binary code. __builtin_expect is since C++11. // This generates faster binary code. __builtin_expect is since C++11.
// For more details, please check https://stackoverflow.com/a/43870188/724872. // For more details, please check https://stackoverflow.com/a/43870188/724872.
#if !defined(_WIN32)
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0) #define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
#else
// There is no equivalent intrinsic in MSVC; fall back to the plain condition.
#define UNLIKELY(condition) (condition)
#endif
template <typename... Args> template <typename... Args>
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error( inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
...@@ -230,6 +240,7 @@ inline void throw_on_error(T e) { ...@@ -230,6 +240,7 @@ inline void throw_on_error(T e) {
throw_on_error(e, ""); throw_on_error(e, "");
} }
#if !defined(_WIN32)
#define PADDLE_THROW(...) \ #define PADDLE_THROW(...) \
do { \ do { \
throw ::paddle::platform::EnforceNotMet( \ throw ::paddle::platform::EnforceNotMet( \
...@@ -248,15 +259,28 @@ inline void throw_on_error(T e) { ...@@ -248,15 +259,28 @@ inline void throw_on_error(T e) {
__FILE__, __LINE__); \ __FILE__, __LINE__); \
} \ } \
} while (false) } while (false)
#else
#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__);
#endif
#define PADDLE_THROW_EOF() \ #define PADDLE_THROW_EOF() \
do { \ do { \
throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \ throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \
__LINE__); \ __LINE__); \
} while (false) } while (false)
#else
#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__)
#endif // REPLACE_ENFORCE_GLOG
#else // !_WIN32
// Disable enforce on Windows, caused by the variadic macro exception error.
#define PADDLE_THROW(x) \
do { \
throw std::make_exception_ptr( \
std::runtime_error("Windows disable the enforce.")); \
} while (false)
#define PADDLE_ENFORCE(x, ...) x
#endif // !_WIN32
/* /*
* Some enforce helpers here, usage: * Some enforce helpers here, usage:
* int a = 1; * int a = 1;
......
...@@ -85,9 +85,6 @@ void InitDevices(bool init_p2p) { ...@@ -85,9 +85,6 @@ void InitDevices(bool init_p2p) {
} catch (const std::exception &exp) { } catch (const std::exception &exp) {
LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime."; LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime.";
} }
#else
LOG(WARNING)
<< "'CUDA' is not supported, Please re-compile with WITH_GPU option";
#endif #endif
InitDevices(init_p2p, devices); InitDevices(init_p2p, devices);
} }
...@@ -101,9 +98,6 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) { ...@@ -101,9 +98,6 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
} catch (const std::exception &exp) { } catch (const std::exception &exp) {
LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime."; LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime.";
} }
#else
LOG(WARNING)
<< "'CUDA' is not supported, Please re-compile with WITH_GPU option";
#endif #endif
for (size_t i = 0; i < devices.size(); ++i) { for (size_t i = 0; i < devices.size(); ++i) {
......
...@@ -69,6 +69,7 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx); ...@@ -69,6 +69,7 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx);
void PopEvent(const std::string& name, const DeviceContext* dev_ctx); void PopEvent(const std::string& name, const DeviceContext* dev_ctx);
#if !defined(_WIN32)
struct RecordEvent { struct RecordEvent {
RecordEvent(const std::string& name, const DeviceContext* dev_ctx); RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
...@@ -94,6 +95,15 @@ struct RecordBlock { ...@@ -94,6 +95,15 @@ struct RecordBlock {
std::string name_; std::string name_;
uint64_t start_ns_; uint64_t start_ns_;
}; };
#else
// Windows does not support the profiler for now.
struct RecordEvent {
RecordEvent(const std::string& name, const DeviceContext* dev_ctx) {}
};
struct RecordBlock {
explicit RecordBlock(int block_id) {}
};
#endif
// Return the event list of all threads. Assumed the returned value calls // Return the event list of all threads. Assumed the returned value calls
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread. // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
......
...@@ -234,6 +234,7 @@ void BindVarDsec(pybind11::module *m) { ...@@ -234,6 +234,7 @@ void BindVarDsec(pybind11::module *m) {
pybind11::enum_<pd::proto::VarType::Type>(var_desc, "VarType", "") pybind11::enum_<pd::proto::VarType::Type>(var_desc, "VarType", "")
.value("BOOL", pd::proto::VarType::BOOL) .value("BOOL", pd::proto::VarType::BOOL)
.value("UINT8", pd::proto::VarType::UINT8) .value("UINT8", pd::proto::VarType::UINT8)
.value("INT8", pd::proto::VarType::INT8)
.value("INT16", pd::proto::VarType::INT16) .value("INT16", pd::proto::VarType::INT16)
.value("INT32", pd::proto::VarType::INT32) .value("INT32", pd::proto::VarType::INT32)
.value("INT64", pd::proto::VarType::INT64) .value("INT64", pd::proto::VarType::INT64)
......
...@@ -130,6 +130,7 @@ PYBIND11_PLUGIN(core) { ...@@ -130,6 +130,7 @@ PYBIND11_PLUGIN(core) {
.def("set", PyCPUTensorSetFromArray<bool>) .def("set", PyCPUTensorSetFromArray<bool>)
.def("set", PyCPUTensorSetFromArray<uint16_t>) .def("set", PyCPUTensorSetFromArray<uint16_t>)
.def("set", PyCPUTensorSetFromArray<uint8_t>) .def("set", PyCPUTensorSetFromArray<uint8_t>)
.def("set", PyCPUTensorSetFromArray<int8_t>)
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
.def("set", PyCUDATensorSetFromArray<float>) .def("set", PyCUDATensorSetFromArray<float>)
.def("set", PyCUDATensorSetFromArray<int>) .def("set", PyCUDATensorSetFromArray<int>)
...@@ -138,6 +139,7 @@ PYBIND11_PLUGIN(core) { ...@@ -138,6 +139,7 @@ PYBIND11_PLUGIN(core) {
.def("set", PyCUDATensorSetFromArray<bool>) .def("set", PyCUDATensorSetFromArray<bool>)
.def("set", PyCUDATensorSetFromArray<uint16_t>) .def("set", PyCUDATensorSetFromArray<uint16_t>)
.def("set", PyCUDATensorSetFromArray<uint8_t>) .def("set", PyCUDATensorSetFromArray<uint8_t>)
.def("set", PyCUDATensorSetFromArray<int8_t>)
.def("set", PyCUDAPinnedTensorSetFromArray<float>) .def("set", PyCUDAPinnedTensorSetFromArray<float>)
.def("set", PyCUDAPinnedTensorSetFromArray<int>) .def("set", PyCUDAPinnedTensorSetFromArray<int>)
.def("set", PyCUDAPinnedTensorSetFromArray<double>) .def("set", PyCUDAPinnedTensorSetFromArray<double>)
...@@ -145,6 +147,7 @@ PYBIND11_PLUGIN(core) { ...@@ -145,6 +147,7 @@ PYBIND11_PLUGIN(core) {
.def("set", PyCUDAPinnedTensorSetFromArray<bool>) .def("set", PyCUDAPinnedTensorSetFromArray<bool>)
.def("set", PyCUDAPinnedTensorSetFromArray<uint16_t>) .def("set", PyCUDAPinnedTensorSetFromArray<uint16_t>)
.def("set", PyCUDAPinnedTensorSetFromArray<uint8_t>) .def("set", PyCUDAPinnedTensorSetFromArray<uint8_t>)
.def("set", PyCUDAPinnedTensorSetFromArray<int8_t>)
#endif #endif
.def("shape", [](Tensor &self) { return vectorize(self.dims()); }) .def("shape", [](Tensor &self) { return vectorize(self.dims()); })
.def("_set_float_element", TensorSetElement<float>) .def("_set_float_element", TensorSetElement<float>)
......
...@@ -97,7 +97,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> { ...@@ -97,7 +97,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
inline pybind11::buffer_info CastToPyBuffer(const framework::Tensor &tensor) { inline pybind11::buffer_info CastToPyBuffer(const framework::Tensor &tensor) {
auto buffer_info = auto buffer_info =
details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool, details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool,
uint8_t, platform::float16>()(tensor); uint8_t, int8_t, platform::float16>()(tensor);
return buffer_info; return buffer_info;
} }
......
...@@ -335,12 +335,18 @@ function assert_api_not_changed() { ...@@ -335,12 +335,18 @@ function assert_api_not_changed() {
fi fi
python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API.spec new.spec python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API.spec new.spec
deactivate deactivate
}
function assert_api_spec_approvals() {
if [ -z ${BRANCH} ]; then
BRANCH="develop"
fi
API_CHANGE=`git diff --name-only upstream/develop | grep "paddle/fluid/API.spec" || true` API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/API.spec" || true`
echo "checking API.spec change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}" echo "checking API.spec change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}"
if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then
# TODO: curl -H 'Authorization: token ${TOKEN}' # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews | \ APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 7845005 2887803 728699 13348433` python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 7845005 2887803 728699 13348433`
echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
if [ "${APPROVALS}" == "FALSE" ]; then if [ "${APPROVALS}" == "FALSE" ]; then
...@@ -622,11 +628,12 @@ function main() { ...@@ -622,11 +628,12 @@ function main() {
cicheck) cicheck)
cmake_gen ${PYTHON_ABI:-""} cmake_gen ${PYTHON_ABI:-""}
build build
assert_api_not_changed ${PYTHON_ABI:-""}
run_test run_test
gen_capi_package gen_capi_package
gen_fluid_inference_lib gen_fluid_inference_lib
test_fluid_inference_lib test_fluid_inference_lib
assert_api_not_changed ${PYTHON_ABI:-""} assert_api_spec_approvals
;; ;;
*) *)
print_usage print_usage
......
...@@ -203,7 +203,7 @@ def resize_short(im, size): ...@@ -203,7 +203,7 @@ def resize_short(im, size):
h_new = size * h // w h_new = size * h // w
else: else:
w_new = size * w // h w_new = size * w // h
im = cv2.resize(im, (h_new, w_new), interpolation=cv2.INTER_CUBIC) im = cv2.resize(im, (w_new, h_new), interpolation=cv2.INTER_CUBIC)
return im return im
...@@ -345,7 +345,6 @@ def simple_transform(im, ...@@ -345,7 +345,6 @@ def simple_transform(im,
if np.random.randint(2) == 0: if np.random.randint(2) == 0:
im = left_right_flip(im, is_color) im = left_right_flip(im, is_color)
else: else:
im = center_crop(im, crop_size, is_color)
im = center_crop(im, crop_size, is_color=is_color) im = center_crop(im, crop_size, is_color=is_color)
if len(im.shape) == 3: if len(im.shape) == 3:
im = to_chw(im) im = to_chw(im)
......
...@@ -24,6 +24,7 @@ set and test set into paddle reader creators. ...@@ -24,6 +24,7 @@ set and test set into paddle reader creators.
from __future__ import print_function from __future__ import print_function
import numpy as np
import zipfile import zipfile
import paddle.dataset.common import paddle.dataset.common
import re import re
...@@ -150,12 +151,12 @@ def __initialize_meta_info__(): ...@@ -150,12 +151,12 @@ def __initialize_meta_info__():
def __reader__(rand_seed=0, test_ratio=0.1, is_test=False): def __reader__(rand_seed=0, test_ratio=0.1, is_test=False):
fn = __initialize_meta_info__() fn = __initialize_meta_info__()
rand = random.Random(x=rand_seed) np.random.seed(rand_seed)
with zipfile.ZipFile(file=fn) as package: with zipfile.ZipFile(file=fn) as package:
with package.open('ml-1m/ratings.dat') as rating: with package.open('ml-1m/ratings.dat') as rating:
for line in rating: for line in rating:
line = cpt.to_text(line, encoding='latin') line = cpt.to_text(line, encoding='latin')
if (rand.random() < test_ratio) == is_test: if (np.random.random() < test_ratio) == is_test:
uid, mov_id, rating, _ = line.strip().split("::") uid, mov_id, rating, _ = line.strip().split("::")
uid = int(uid) uid = int(uid)
mov_id = int(mov_id) mov_id = int(mov_id)
......
...@@ -95,6 +95,8 @@ def convert_np_dtype_to_dtype_(np_dtype): ...@@ -95,6 +95,8 @@ def convert_np_dtype_to_dtype_(np_dtype):
return core.VarDesc.VarType.INT16 return core.VarDesc.VarType.INT16
elif dtype == np.uint8: elif dtype == np.uint8:
return core.VarDesc.VarType.UINT8 return core.VarDesc.VarType.UINT8
elif dtype == np.int8:
return core.VarDesc.VarType.INT8
else: else:
raise ValueError("Not supported numpy dtype %s" % dtype) raise ValueError("Not supported numpy dtype %s" % dtype)
......
...@@ -189,7 +189,6 @@ def Print(input, ...@@ -189,7 +189,6 @@ def Print(input,
message="The content of some_layer: ") message="The content of some_layer: ")
''' '''
helper = LayerHelper('print', **locals()) helper = LayerHelper('print', **locals())
out = helper.create_tmp_variable(dtype=helper.input_dtype())
helper.append_op( helper.append_op(
type='print', type='print',
inputs={'In': input}, inputs={'In': input},
...@@ -202,9 +201,7 @@ def Print(input, ...@@ -202,9 +201,7 @@ def Print(input,
'print_tensor_shape': print_tensor_shape, 'print_tensor_shape': print_tensor_shape,
'print_tensor_lod': print_tensor_lod, 'print_tensor_lod': print_tensor_lod,
'print_phase': print_phase.upper() 'print_phase': print_phase.upper()
}, })
outputs={'Out': out})
return out
class BlockGuard(object): class BlockGuard(object):
......
...@@ -39,6 +39,7 @@ __all__ = [ ...@@ -39,6 +39,7 @@ __all__ = [
'detection_map', 'detection_map',
'rpn_target_assign', 'rpn_target_assign',
'anchor_generator', 'anchor_generator',
'generate_proposals',
] ]
__auto__ = [ __auto__ = [
...@@ -1253,3 +1254,73 @@ def anchor_generator(input, ...@@ -1253,3 +1254,73 @@ def anchor_generator(input,
anchor.stop_gradient = True anchor.stop_gradient = True
var.stop_gradient = True var.stop_gradient = True
return anchor, var return anchor, var
def generate_proposals(scores,
bbox_deltas,
im_info,
anchors,
variances,
pre_nms_top_n=6000,
post_nms_top_n=1000,
nms_thresh=0.5,
min_size=0.1,
eta=1.0,
name=None):
"""
** Generate Proposals (Faster-RCNN) **
This operation proposes RoIs according to each box's probability of being a
foreground object; the boxes themselves are computed from the anchors. The
bbox_deltas and the objectness scores are outputs of the RPN, and the final
proposals can be used to train the detection net.
For generating proposals, this operation performs the following steps:
1. Transpose and reshape the scores and bbox_deltas to (H*W*A, 1) and (H*W*A, 4) respectively.
2. Calculate box locations as proposal candidates.
3. Clip the boxes to the image boundary.
4. Remove predicted boxes with small area.
5. Apply NMS to get the final proposals as output.
Args:
scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents the probability for each box to be an object.
N is batch size, A is number of anchors, H and W are height and width of the feature map.
bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W] represents the difference between the predicted box location and the anchor location.
im_info(Variable): A 2-D Tensor with shape [N, 3] represents the origin image information for the N images in the batch. The info contains the height, the width and the scale
between the origin image size and the size of the feature map.
anchors(Variable): A 4-D Tensor that represents the anchors with a layout of [H, W, A, 4]. H and W are the height and width of the feature map,
A is the number of anchors at each position. Each anchor is in (xmin, ymin, xmax, ymax) format and unnormalized.
variances(Variable): The expanded variances of anchors with a layout of [H, W, num_priors, 4]. Each variance is in (xcenter, ycenter, w, h) format.
pre_nms_top_n(float): Number of total bboxes to be kept per image before NMS. 6000 by default.
post_nms_top_n(float): Number of total bboxes to be kept per image after NMS. 1000 by default.
nms_thresh(float): Threshold in NMS, 0.5 by default.
min_size(float): Remove predicted boxes with either height or width < min_size. 0.1 by default.
eta(float): Applied in adaptive NMS: if adaptive_threshold > 0.5, then adaptive_threshold = adaptive_threshold * eta in each iteration.
"""
helper = LayerHelper('generate_proposals', **locals())
rpn_rois = helper.create_tmp_variable(dtype=bbox_deltas.dtype)
rpn_roi_probs = helper.create_tmp_variable(dtype=scores.dtype)
helper.append_op(
type="generate_proposals",
inputs={
'Scores': scores,
'BboxDeltas': bbox_deltas,
'ImInfo': im_info,
'Anchors': anchors,
'Variances': variances
},
attrs={
'pre_nms_topN': pre_nms_top_n,
'post_nms_topN': post_nms_top_n,
'nms_thresh': nms_thresh,
'min_size': min_size,
'eta': eta
},
outputs={'RpnRois': rpn_rois,
'RpnRoiProbs': rpn_roi_probs})
rpn_rois.stop_gradient = True
rpn_roi_probs.stop_gradient = True
return rpn_rois, rpn_roi_probs
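A hedged usage sketch for the layer above. The shapes follow the docstring (N=1 image, A=4 anchors per location on a 16x16 feature map); the variable names are made up for illustration, and the layer is assumed to be re-exported as fluid.layers.generate_proposals:

    import paddle.fluid as fluid

    scores = fluid.layers.data(
        name='scores', shape=[4, 16, 16], dtype='float32')
    bbox_deltas = fluid.layers.data(
        name='bbox_deltas', shape=[16, 16, 16], dtype='float32')
    im_info = fluid.layers.data(name='im_info', shape=[3], dtype='float32')
    anchors = fluid.layers.data(
        name='anchors', shape=[16, 16, 4, 4], dtype='float32',
        append_batch_size=False)
    variances = fluid.layers.data(
        name='variances', shape=[16, 16, 4, 4], dtype='float32',
        append_batch_size=False)

    rois, roi_probs = fluid.layers.generate_proposals(
        scores, bbox_deltas, im_info, anchors, variances,
        pre_nms_top_n=6000, post_nms_top_n=1000, nms_thresh=0.7)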
...@@ -246,7 +246,11 @@ def Send(endpoints, send_vars, dummy_output=None, sync=True): ...@@ -246,7 +246,11 @@ def Send(endpoints, send_vars, dummy_output=None, sync=True):
rpc_op_role_name: core.op_proto_and_checker_maker.OpRole.RPC rpc_op_role_name: core.op_proto_and_checker_maker.OpRole.RPC
}) })
if sync: if sync:
helper.append_op(type="send_barrier", attrs={"endpoints": endpoints}) helper.append_op(
type="send_barrier",
inputs={"X": dummy_output},
outputs={"Out": []},
attrs={"endpoints": endpoints})
def Recv(endpoints, get_vars, dummy_input=None, sync=True): def Recv(endpoints, get_vars, dummy_input=None, sync=True):
...@@ -282,7 +286,10 @@ def Recv(endpoints, get_vars, dummy_input=None, sync=True): ...@@ -282,7 +286,10 @@ def Recv(endpoints, get_vars, dummy_input=None, sync=True):
attrs={"endpoints": endpoints, attrs={"endpoints": endpoints,
"epmap": epmap}) "epmap": epmap})
if sync: if sync:
helper.append_op(type="fetch_barrier", attrs={"endpoints": endpoints}) helper.append_op(
type="fetch_barrier",
outputs={"Out": get_vars},
attrs={"endpoints": endpoints})
return get_vars return get_vars
......
...@@ -17,6 +17,7 @@ All layers just related to the neural network. ...@@ -17,6 +17,7 @@ All layers just related to the neural network.
from __future__ import print_function from __future__ import print_function
import numpy as np
from ..layer_helper import LayerHelper from ..layer_helper import LayerHelper
from ..initializer import Normal, Constant from ..initializer import Normal, Constant
from ..framework import Variable from ..framework import Variable
...@@ -24,7 +25,6 @@ from ..param_attr import ParamAttr ...@@ -24,7 +25,6 @@ from ..param_attr import ParamAttr
from .layer_function_generator import autodoc, templatedoc from .layer_function_generator import autodoc, templatedoc
from .tensor import concat from .tensor import concat
from . import utils from . import utils
import random
from .. import unique_name from .. import unique_name
from functools import reduce from functools import reduce
...@@ -54,6 +54,7 @@ __all__ = [ ...@@ -54,6 +54,7 @@ __all__ = [
'conv2d_transpose', 'conv2d_transpose',
'conv3d_transpose', 'conv3d_transpose',
'sequence_expand', 'sequence_expand',
'sequence_pad',
'lstm_unit', 'lstm_unit',
'reduce_sum', 'reduce_sum',
'reduce_mean', 'reduce_mean',
...@@ -87,6 +88,7 @@ __all__ = [ ...@@ -87,6 +88,7 @@ __all__ = [
'lod_reset', 'lod_reset',
'lrn', 'lrn',
'pad', 'pad',
'pad_constant_like',
'label_smooth', 'label_smooth',
'roi_pool', 'roi_pool',
'dice_loss', 'dice_loss',
...@@ -105,6 +107,7 @@ __all__ = [ ...@@ -105,6 +107,7 @@ __all__ = [
'flatten', 'flatten',
'sequence_mask', 'sequence_mask',
'stack', 'stack',
'unstack',
'sequence_enumerate', 'sequence_enumerate',
] ]
...@@ -2656,6 +2659,51 @@ def sequence_expand(x, y, ref_level=-1, name=None): ...@@ -2656,6 +2659,51 @@ def sequence_expand(x, y, ref_level=-1, name=None):
return tmp return tmp
@templatedoc()
def sequence_pad(x, pad_value, maxlen=None):
"""
${comment}
Args:
x(Variable): Input variable which should contain lod information.
pad_value(Variable): The Variable that holds values that will be filled
into padded steps. It can be a scalar or a tensor whose shape equals
the time-step shape of the sequences. If it's a scalar, it will be
automatically broadcast to the time-step shape.
maxlen(int, default None): The length of padded sequences. It can be
None or any positive int. When it is None, all sequences will be
padded up to the length of the longest one among them; when it is a
positive value, it must be no less than the length of the longest
original sequence.
Returns:
Variable: The padded sequence batch. All sequences have the same length.
Examples:
.. code-block:: python
import numpy
x = fluid.layers.data(name='y', shape=[10, 5],
dtype='float32', lod_level=1)
pad_value = fluid.layers.assign(input=numpy.array([0]))
out = fluid.layers.sequence_pad(x=x, pad_value=pad_value)
"""
helper = LayerHelper('sequence_pad', input=x, **locals())
dtype = helper.input_dtype()
out = helper.create_tmp_variable(dtype)
if maxlen is None:
maxlen = -1
helper.append_op(
type='sequence_pad',
inputs={'X': x,
'PadValue': pad_value},
outputs={'Out': out},
attrs={'padded_length': maxlen})
return out
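A slightly fuller version of the docstring example above, showing the maxlen argument; this is only a sketch that builds the program (fluid.layers.data and fluid.layers.assign are used as in the docstring):

    import numpy
    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[10, 5], dtype='float32', lod_level=1)
    pad_value = fluid.layers.assign(
        input=numpy.array([0.0], dtype=numpy.float32))

    # pad every sequence in the batch up to length 10
    out = fluid.layers.sequence_pad(x=x, pad_value=pad_value, maxlen=10)
    # out holds a dense batch of shape [num_sequences, 10, 5]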
def beam_search(pre_ids, def beam_search(pre_ids,
pre_scores, pre_scores,
ids, ids,
...@@ -4709,6 +4757,86 @@ def pad(x, paddings, pad_value=0., name=None): ...@@ -4709,6 +4757,86 @@ def pad(x, paddings, pad_value=0., name=None):
return out return out
def pad_constant_like(x, y, pad_value=0., name=None):
"""
Pad input(Y) with :attr:`pad_value`. The number of values padded to
the edges of each axis is specified by the difference between the
shapes of X and Y, i.e. ((0, shape_x_0 - shape_y_0), ...,
(0, shape_x_n - shape_y_n)) gives the pad width for each axis. The
input should be a k-D tensor (0 < k < 7).
See below for an example.
.. code-block:: text
Given:
X = [[[[ 0, 1, 2],
[ 3, 4, 5]],
[[ 6, 7, 8],
[ 9, 10, 11]],
[[12, 13, 14],
[15, 16, 17]]],
[[[18, 19, 20],
[21, 22, 23]],
[[24, 25, 26],
[27, 28, 29]],
[[30, 31, 32],
[33, 34, 35]]]]
X.shape = (2, 3, 2, 3)
Y = [[[[35, 36, 37]],
[[38, 39, 40]],
[[41, 42, 43]]]]
Y.shape = (1, 3, 1, 3)
And
pad_value = -1,
Return:
Out = [[[[35, 36, 37],
[-1, -1, -1]],
[[38, 39, 40],
[-1, -1, -1]],
[[41, 42, 43],
[-1, -1, -1]]],
[[[-1, -1, -1],
[-1, -1, -1]],
[[-1, -1, -1],
[-1, -1, -1]],
[[-1, -1, -1],
[-1, -1, -1]]]]
Out.shape = (2, 3, 2, 3)
Args:
x (Variable): The input tensor variable.
y (Variable): The input tensor variable.
pad_value (float): The constant value used to pad.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: The padded tensor variable.
Examples:
.. code-block:: python
# x is a rank 4 tensor variable, x.shape = (2, 3, 2, 3)
# y is a rank 4 tensor variable, y.shape = (1, 3, 1, 3)
out = fluid.layers.pad_constant_like(x=x, y=y, pad_value=0.)
# out is a rank 4 tensor variable, and out.shape = [2, 3, 2, 3]
"""
helper = LayerHelper('pad_constant_like', input=x, **locals())
dtype = helper.input_dtype()
out = helper.create_tmp_variable(dtype)
helper.append_op(
type='pad_constant_like',
inputs={'X': x,
'Y': y},
outputs={'Out': out},
attrs={'pad_value': float(pad_value)})
return out
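For the forward value, the layer above matches a NumPy pad whose widths come from the shape difference between X and Y; a hedged sketch mirroring the docstring example:

    import numpy as np

    x_shape = (2, 3, 2, 3)                      # shape of X from the example
    y = np.arange(35, 44).reshape(1, 3, 1, 3)   # Y from the example

    pad_width = [(0, dx - dy) for dx, dy in zip(x_shape, y.shape)]
    out = np.pad(y, pad_width, mode='constant', constant_values=-1)
    print(out.shape)  # (2, 3, 2, 3), the same shape as X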
def label_smooth(label, def label_smooth(label,
prior_dist=None, prior_dist=None,
epsilon=0.1, epsilon=0.1,
...@@ -5103,7 +5231,7 @@ def random_crop(x, shape, seed=None): ...@@ -5103,7 +5231,7 @@ def random_crop(x, shape, seed=None):
dtype = x.dtype dtype = x.dtype
out = helper.create_tmp_variable(dtype) out = helper.create_tmp_variable(dtype)
if seed is None: if seed is None:
seed = random.randint(-65536, 65535) seed = np.random.randint(-65536, 65536)
op_attrs = {"shape": shape} op_attrs = {"shape": shape}
if isinstance(seed, int): if isinstance(seed, int):
op_attrs["startup_seed"] = seed op_attrs["startup_seed"] = seed
...@@ -5305,7 +5433,7 @@ def crop(x, shape=None, offsets=None, name=None): ...@@ -5305,7 +5433,7 @@ def crop(x, shape=None, offsets=None, name=None):
helper = LayerHelper('crop', **locals()) helper = LayerHelper('crop', **locals())
if not (isinstance(shape, list) or isinstance(shape, tuple) or \ if not (isinstance(shape, list) or isinstance(shape, tuple) or \
isinstance(shape, Variable)): isinstance(shape, Variable)):
raise ValueError("The shape should be a list, tuple or Variable.") raise ValueError("The shape should be a list, tuple or Variable.")
if offsets is None: if offsets is None:
...@@ -5417,7 +5545,7 @@ def prelu(x, mode, param_attr=None, name=None): ...@@ -5417,7 +5545,7 @@ def prelu(x, mode, param_attr=None, name=None):
channel:elements in a channel share same weight channel:elements in a channel share same weight
element:each element has a weight element:each element has a weight
name(str|None): A name for this layer(optional). If set None, the layer name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically. will be named automatically.
Returns: Returns:
Variable: The output tensor with the same shape as input. Variable: The output tensor with the same shape as input.
...@@ -5576,23 +5704,23 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None): ...@@ -5576,23 +5704,23 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None):
Supposing :code:`x` is a Tensor with shape [d_1, d_2, ..., d_n], the Supposing :code:`x` is a Tensor with shape [d_1, d_2, ..., d_n], the
:code:`y` is a mask with shape [d_1, d_2, ..., d_n, maxlen], where: :code:`y` is a mask with shape [d_1, d_2, ..., d_n, maxlen], where:
.. math:: .. math::
y(i_1, i_2,..., i_n, j) = (j < x(i_1, i_2,..., i_n)) y(i_1, i_2,..., i_n, j) = (j < x(i_1, i_2,..., i_n))
Args: Args:
x (Variable): Input tensor of sequence_mask layer, x (Variable): Input tensor of sequence_mask layer,
whose elements are integers less than :code:`maxlen`. whose elements are integers less than :code:`maxlen`.
maxlen (int|None): Maximum length of the sequence. If :code:`maxlen` maxlen (int|None): Maximum length of the sequence. If :code:`maxlen`
is None, it would be replace with :math:`max(x)`. is None, it would be replace with :math:`max(x)`.
dtype (np.dtype|core.VarDesc.VarType|str): Data type of the output. dtype (np.dtype|core.VarDesc.VarType|str): Data type of the output.
name (str|None): A name for this layer(optional). If set None, the name (str|None): A name for this layer(optional). If set None, the
layer will be named automatically. layer will be named automatically.
Returns: Returns:
Variable: The output sequence mask. Variable: The output sequence mask.
""" """
helper = LayerHelper('sequence_mask', **locals()) helper = LayerHelper('sequence_mask', **locals())
...@@ -5617,23 +5745,23 @@ def stack(x, axis=0): ...@@ -5617,23 +5745,23 @@ def stack(x, axis=0):
**Stack Layer** **Stack Layer**
This layer stacks all of the input :code:`x` along axis. This layer stacks all of the input :code:`x` along axis.
Input :code:`x` can be a single variable, a :code:`list` of variables, Input :code:`x` can be a single variable, a :code:`list` of variables,
or a :code:`tuple` of variables. If :code:`x` is a :code:`list` or or a :code:`tuple` of variables. If :code:`x` is a :code:`list` or
:code:`tuple`, the shapes of all these variables must be the same. :code:`tuple`, the shapes of all these variables must be the same.
Supposing the shape of each input is :math:`[d_0, d_1, ..., d_{n-1}]`, Supposing the shape of each input is :math:`[d_0, d_1, ..., d_{n-1}]`,
the shape of the output variable would be the shape of the output variable would be
:math:`[d_0, d_1, ..., d_{axis}=len(x), ..., d_{n-1}]`. :math:`[d_0, d_1, ..., d_{axis}=len(x), ..., d_{n-1}]`.
If :code:`axis` < 0, it would be replaced with :code:`axis+rank(x[0])+1`. If :code:`axis` < 0, it would be replaced with :code:`axis+rank(x[0])+1`.
If :code:`axis` is None, it would be replaced with 0. If :code:`axis` is None, it would be replaced with 0.
Args: Args:
x (Variable|list(Variable)|tuple(Variable)): Input variables. x (Variable|list(Variable)|tuple(Variable)): Input variables.
axis (int|None): The axis along which all inputs are stacked. axis (int|None): The axis along which all inputs are stacked.
Returns: Returns:
Variable: The stacked variable. Variable: The stacked variable.
""" """
helper = LayerHelper('stack', **locals()) helper = LayerHelper('stack', **locals())
...@@ -5648,3 +5776,44 @@ def stack(x, axis=0): ...@@ -5648,3 +5776,44 @@ def stack(x, axis=0):
attrs={'axis': axis}) attrs={'axis': axis})
return out return out
def unstack(x, axis=0, num=None):
"""
**UnStack Layer**
This layer unstacks input :code:`x` into several tensors along axis.
If :code:`axis` < 0, it would be replaced with :code:`axis+rank(x)`.
If :code:`num` is None, it would be inferred from :code:`x.shape[axis]`,
and if :code:`x.shape[axis]` <= 0 or is unknown, :code:`ValueError` is
raised.
Args:
x (Variable): Input variable.
axis (int): The axis along which the input is unstacked.
num (int|None): The number of output variables.
Returns:
list(Variable): The unstacked variables.
"""
helper = LayerHelper('unstack', **locals())
if num is None:
if axis is None or x.shape[axis] <= 0:
raise ValueError('unknown unstack number')
else:
num = x.shape[axis]
outs = []
for _ in range(num):
outs.append(helper.create_tmp_variable(x.dtype))
helper.append_op(
type='unstack',
inputs={'X': [x]},
outputs={'Y': outs},
attrs={'axis': axis,
'num': num})
return outs
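For context, a minimal usage sketch of the unstack layer added above (the tensor name and shapes are illustrative assumptions, not part of the patch):
```
import paddle.fluid as fluid

# A 3-D input; unstacking along axis 0 yields 5 variables of shape [3, 4].
x = fluid.layers.data(
    name='x', shape=[5, 3, 4], dtype='float32', append_batch_size=False)
ys = fluid.layers.unstack(x, axis=0, num=5)
# With num=None the count is inferred from x.shape[axis]; an unknown or
# non-positive dimension raises ValueError, as implemented above.
```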
...@@ -46,10 +46,12 @@ class Optimizer(object): ...@@ -46,10 +46,12 @@ class Optimizer(object):
def __init__(self, def __init__(self,
learning_rate, learning_rate,
regularization=None, regularization=None,
LARS_weight_decay=0.0): LARS_weight_decay=0.0,
name=None):
if not isinstance(learning_rate, float) and \ if not isinstance(learning_rate, float) and \
not isinstance(learning_rate, framework.Variable): not isinstance(learning_rate, framework.Variable):
raise TypeError("learning rate should be float or Variable") raise TypeError("learning rate should be float or Variable")
self._name = name
self.regularization = regularization self.regularization = regularization
self._learning_rate = learning_rate self._learning_rate = learning_rate
# the learning rate type should be inferenced from loss # the learning rate type should be inferenced from loss
...@@ -153,6 +155,8 @@ class Optimizer(object): ...@@ -153,6 +155,8 @@ class Optimizer(object):
dtype: data type of the accumulator variable dtype: data type of the accumulator variable
fill_value: value to initialize the accumulator variable fill_value: value to initialize the accumulator variable
""" """
if self._name is not None:
name = self._name + "_" + name
if (name in self._accumulators and if (name in self._accumulators and
param.name in self._accumulators[name]): param.name in self._accumulators[name]):
raise Exception("Accumulator {} already exists for parameter {}". raise Exception("Accumulator {} already exists for parameter {}".
...@@ -181,6 +185,8 @@ class Optimizer(object): ...@@ -181,6 +185,8 @@ class Optimizer(object):
Returns: Returns:
accumulator variable for the parameter accumulator variable for the parameter
""" """
if self._name is not None:
name = self._name + "_" + name
if (name not in self._accumulators or if (name not in self._accumulators or
param.name not in self._accumulators[name]): param.name not in self._accumulators[name]):
raise Exception("Accumulator {} does not exist for parameter {}". raise Exception("Accumulator {} does not exist for parameter {}".
......
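As a quick illustration of the accumulator naming introduced above, a pure-Python sketch (no fluid required) of how the optional optimizer name prefixes accumulator keys:
```
def accumulator_key(optimizer_name, acc_name):
    # Mirrors `name = self._name + "_" + name` in _add_accumulator/_get_accumulator.
    return acc_name if optimizer_name is None else optimizer_name + "_" + acc_name

assert accumulator_key(None, "moment1") == "moment1"
assert accumulator_key("adam_a", "moment1") == "adam_a_moment1"
```
Two optimizers constructed with different names can therefore keep separate accumulators for the same parameter within one program.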
...@@ -125,8 +125,8 @@ opts = optimizer.minimize(avg_cost) ...@@ -125,8 +125,8 @@ opts = optimizer.minimize(avg_cost)
batch_size = fluid.layers.create_tensor(dtype='int64') batch_size = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(input=predict, label=label, total=batch_size) batch_acc = fluid.layers.accuracy(input=predict, label=label, total=batch_size)
# fluid.memory_optimize(fluid.default_main_program(), level=0) fluid.memory_optimize(fluid.default_main_program(), level=0)
fluid.release_memory(fluid.default_main_program()) # fluid.release_memory(fluid.default_main_program())
BATCH_SIZE = 16 BATCH_SIZE = 16
PASS_NUM = 1 PASS_NUM = 1
......
...@@ -92,8 +92,8 @@ def main(): ...@@ -92,8 +92,8 @@ def main():
optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4) optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
optimizer.minimize(avg_cost) optimizer.minimize(avg_cost)
# fluid.memory_optimize(fluid.default_main_program()) fluid.memory_optimize(fluid.default_main_program())
fluid.release_memory(fluid.default_main_program()) # fluid.release_memory(fluid.default_main_program())
# fix the order of training data # fix the order of training data
train_data = paddle.batch( train_data = paddle.batch(
......
...@@ -201,5 +201,44 @@ class TestDetectionMAP(unittest.TestCase): ...@@ -201,5 +201,44 @@ class TestDetectionMAP(unittest.TestCase):
print(str(program)) print(str(program))
class TestGenerateProposals(unittest.TestCase):
def test_generate_proposals(self):
data_shape = [20, 64, 64]
images = fluid.layers.data(
name='images', shape=data_shape, dtype='float32')
im_info = fluid.layers.data(
name='im_info', shape=[1, 3], dtype='float32')
anchors, variances = fluid.layers.anchor_generator(
name='anchor_generator',
input=images,
anchor_sizes=[32, 64],
aspect_ratios=[1.0],
variance=[0.1, 0.1, 0.2, 0.2],
stride=[16.0, 16.0],
offset=0.5)
num_anchors = anchors.shape[2]
scores = fluid.layers.data(
name='scores', shape=[1, num_anchors, 8, 8], dtype='float32')
bbox_deltas = fluid.layers.data(
name='bbox_deltas',
shape=[1, num_anchors * 4, 8, 8],
dtype='float32')
rpn_rois, rpn_roi_probs = fluid.layers.generate_proposals(
name='generate_proposals',
scores=scores,
bbox_deltas=bbox_deltas,
im_info=im_info,
anchors=anchors,
variances=variances,
pre_nms_top_n=6000,
post_nms_top_n=1000,
nms_thresh=0.5,
min_size=0.1,
eta=1.0)
self.assertIsNotNone(rpn_rois)
self.assertIsNotNone(rpn_roi_probs)
print(rpn_rois.shape)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -134,7 +134,7 @@ class SE_ResNeXt(): ...@@ -134,7 +134,7 @@ class SE_ResNeXt():
size=class_dim, size=class_dim,
act='softmax', act='softmax',
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.2))) initializer=fluid.initializer.Constant(value=0.05)))
return out return out
def shortcut(self, input, ch_out, stride): def shortcut(self, input, ch_out, stride):
...@@ -184,7 +184,7 @@ class SE_ResNeXt(): ...@@ -184,7 +184,7 @@ class SE_ResNeXt():
act=None, act=None,
# avoid pserver CPU init differs from GPU # avoid pserver CPU init differs from GPU
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.2)), initializer=fluid.initializer.Constant(value=0.05)),
bias_attr=False) bias_attr=False)
return fluid.layers.batch_norm(input=conv, act=act) return fluid.layers.batch_norm(input=conv, act=act)
...@@ -192,13 +192,19 @@ class SE_ResNeXt(): ...@@ -192,13 +192,19 @@ class SE_ResNeXt():
pool = fluid.layers.pool2d( pool = fluid.layers.pool2d(
input=input, pool_size=0, pool_type='avg', global_pooling=True) input=input, pool_size=0, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0) stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
squeeze = fluid.layers.fc(input=pool, squeeze = fluid.layers.fc(
size=num_channels // reduction_ratio, input=pool,
act='relu') size=num_channels // reduction_ratio,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.05)),
act='relu')
stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0) stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0)
excitation = fluid.layers.fc(input=squeeze, excitation = fluid.layers.fc(
size=num_channels, input=squeeze,
act='sigmoid') size=num_channels,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.05)),
act='sigmoid')
scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0) scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
return scale return scale
......
...@@ -18,54 +18,129 @@ import numpy as np ...@@ -18,54 +18,129 @@ import numpy as np
import argparse import argparse
import time import time
import math import math
import os
import sys
import six
import argparse
import ast
import multiprocessing
import time
from functools import partial
from os.path import expanduser
import glob
import random
import tarfile
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid import core from paddle.fluid import core
import os from test_dist_base import TestDistRunnerBase, runtime_main
import sys from paddle.compat import long_type
import six
import transformer_model import hashlib
import paddle.dataset.wmt16 as wmt16
from paddle.fluid.transpiler.details import program_to_code
const_para_attr = fluid.ParamAttr(initializer=fluid.initializer.Constant(0.001))
const_bias_attr = const_para_attr
# Fix seed for test # Fix seed for test
fluid.default_startup_program().random_seed = 1 fluid.default_startup_program().random_seed = 1
fluid.default_main_program().random_seed = 1 fluid.default_main_program().random_seed = 1
WMT16_RECORDIO_FILE = "/tmp/wmt16.recordio"
#from transformer_config import ModelHyperParams, TrainTaskConfig, merge_cfg_from_list
class TrainTaskConfig(object):
# only support GPU currently
use_gpu = True
# the epoch number to train.
pass_num = 1
# the number of sequences contained in a mini-batch.
# deprecated, set batch_size in args.
batch_size = 20
# the hyper parameters for Adam optimizer.
# This static learning_rate will be multiplied by the LearningRateScheduler-
# derived learning rate to get the final learning rate.
learning_rate = 1
beta1 = 0.9
beta2 = 0.98
eps = 1e-9
# the parameters for learning rate scheduling.
warmup_steps = 4000
# the weight used to mix up the ground-truth distribution and the fixed
# uniform distribution in label smoothing when training.
# Set this as zero if label smoothing is not wanted.
label_smooth_eps = 0.1
# the directory for saving trained models.
model_dir = "trained_models"
# the directory for saving checkpoints.
ckpt_dir = "trained_ckpts"
# the directory for loading checkpoint.
# If provided, continue training from the checkpoint.
ckpt_path = None
# the parameter to initialize the learning rate scheduler.
# It should be provided when using checkpoints, since the checkpoint doesn't
# include the training step counter currently.
start_step = 0
class ModelHyperParams(object): check_acc = True
# Dictionary size for source and target language. This model directly uses
# paddle.dataset.wmt16 in which <bos>, <eos> and <unk> token has
# already been added, but the <pad> token is not added. Transformer requires
# sequences in a mini-batch are padded to have the same length. A <pad> token is
# added into the original dictionary in paddle.dataset.wmt16.
# size of source word dictionary. data_path = expanduser("~") + (
src_vocab_size = 10000 "/.cache/paddle/dataset/test_dist_transformer/")
# index for <pad> token in source language. src_vocab_fpath = data_path + "vocab.bpe.32000"
src_pad_idx = src_vocab_size trg_vocab_fpath = data_path + "vocab.bpe.32000"
train_file_pattern = data_path + "train.tok.clean.bpe.32000.en-de"
val_file_pattern = data_path + "newstest2013.tok.bpe.32000.en-de"
pool_size = 2000
sort_type = None
local = True
shuffle = False
shuffle_batch = False
special_token = ['<s>', '<e>', '<unk>']
token_delimiter = ' '
use_token_batch = False
# size of target word dictionary
trg_vocab_size = 10000
# index for <pad> token in target language.
trg_pad_idx = trg_vocab_size
# position value corresponding to the <pad> token. class InferTaskConfig(object):
pos_pad_idx = 0 use_gpu = True
# the number of examples in one run for sequence generation.
batch_size = 10
# the parameters for beam search.
beam_size = 5
max_out_len = 256
# the number of decoded sentences to output.
n_best = 1
# the flags indicating whether to output the special tokens.
output_bos = False
output_eos = False
output_unk = True
# the directory for loading the trained model.
model_path = "trained_models/pass_1.infer.model"
# max length of sequences. It should plus 1 to include position
# padding token for position encoding.
max_length = 50
class ModelHyperParams(object):
# The following vocabulary-related configurations will be set
# automatically according to the passed vocabulary path and special tokens.
# size of source word dictionary.
src_vocab_size = 10000
# size of target word dictionary
trg_vocab_size = 10000
# index for <bos> token
bos_idx = 0
# index for <eos> token
eos_idx = 1
# index for <unk> token
unk_idx = 2
# max length of sequences deciding the size of position encoding table.
# Start from 1 and count start and end tokens in.
max_length = 256
# the dimension for word embeddings, which is also the last dimension of # the dimension for word embeddings, which is also the last dimension of
# the input and output of multi-head attention, position-wise feed-forward # the input and output of multi-head attention, position-wise feed-forward
# networks, encoder and decoder. # networks, encoder and decoder.
d_model = 512 d_model = 512
# size of the hidden layer in position-wise feed-forward networks. # size of the hidden layer in position-wise feed-forward networks.
d_inner_hid = 1024 d_inner_hid = 2048
# the dimension that keys are projected to for dot-product attention. # the dimension that keys are projected to for dot-product attention.
d_key = 64 d_key = 64
# the dimension that values are projected to for dot-product attention. # the dimension that values are projected to for dot-product attention.
...@@ -75,95 +150,1521 @@ class ModelHyperParams(object): ...@@ -75,95 +150,1521 @@ class ModelHyperParams(object):
# number of sub-layers to be stacked in the encoder and decoder. # number of sub-layers to be stacked in the encoder and decoder.
n_layer = 6 n_layer = 6
# dropout rate used by all dropout layers. # dropout rate used by all dropout layers.
dropout = 0.1 dropout = 0.0 # no random
# random seed used in dropout for CE.
dropout_seed = None
# the flag indicating whether to share embedding and softmax weights.
# vocabularies in source and target should be same for weight sharing.
weight_sharing = True
def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head): def merge_cfg_from_list(cfg_list, g_cfgs):
"""
Set the above global configurations using the cfg_list.
"""
assert len(cfg_list) % 2 == 0
for key, value in zip(cfg_list[0::2], cfg_list[1::2]):
for g_cfg in g_cfgs:
if hasattr(g_cfg, key):
try:
value = eval(value)
except Exception: # for file path
pass
setattr(g_cfg, key, value)
break
# The placeholder for batch_size in compile time. Must be -1 currently to be
# consistent with some ops' infer-shape output in compile time, such as the
# sequence_expand op used in beamsearch decoder.
batch_size = -1
# The placeholder for sequence length in compile time.
seq_len = ModelHyperParams.max_length
# Here list the data shapes and data types of all inputs.
# The shapes here act as placeholder and are set to pass the infer-shape in
# compile time.
input_descs = {
# The actual data shape of src_word is:
# [batch_size * max_src_len_in_batch, 1]
"src_word": [(batch_size, seq_len, long_type(1)), "int64", 2],
# The actual data shape of src_pos is:
# [batch_size * max_src_len_in_batch, 1]
"src_pos": [(batch_size, seq_len, long_type(1)), "int64"],
# This input is used to remove attention weights on paddings in the
# encoder.
# The actual data shape of src_slf_attn_bias is:
# [batch_size, n_head, max_src_len_in_batch, max_src_len_in_batch]
"src_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
seq_len), "float32"],
# The actual data shape of trg_word is:
# [batch_size * max_trg_len_in_batch, 1]
"trg_word": [(batch_size, seq_len, long_type(1)), "int64",
2], # lod_level is only used in fast decoder.
# The actual data shape of trg_pos is:
# [batch_size * max_trg_len_in_batch, 1]
"trg_pos": [(batch_size, seq_len, long_type(1)), "int64"],
# This input is used to remove attention weights on paddings and
# subsequent words in the decoder.
# The actual data shape of trg_slf_attn_bias is:
# [batch_size, n_head, max_trg_len_in_batch, max_trg_len_in_batch]
"trg_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
seq_len), "float32"],
# This input is used to remove attention weights on paddings of the source
# input in the encoder-decoder attention.
# The actual data shape of trg_src_attn_bias is:
# [batch_size, n_head, max_trg_len_in_batch, max_src_len_in_batch]
"trg_src_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
seq_len), "float32"],
# This input is used in independent decoder program for inference.
# The actual data shape of enc_output is:
# [batch_size, max_src_len_in_batch, d_model]
"enc_output": [(batch_size, seq_len, ModelHyperParams.d_model), "float32"],
# The actual data shape of label_word is:
# [batch_size * max_trg_len_in_batch, 1]
"lbl_word": [(batch_size * seq_len, long_type(1)), "int64"],
# This input is used to mask out the loss of padding tokens.
# The actual data shape of label_weight is:
# [batch_size * max_trg_len_in_batch, 1]
"lbl_weight": [(batch_size * seq_len, long_type(1)), "float32"],
# These inputs are used to change the shape tensor in beam-search decoder.
"trg_slf_attn_pre_softmax_shape_delta": [(long_type(2), ), "int32"],
"trg_slf_attn_post_softmax_shape_delta": [(long_type(4), ), "int32"],
"init_score": [(batch_size, long_type(1)), "float32"],
}
# Names of word embedding table which might be reused for weight sharing.
word_emb_param_names = (
"src_word_emb_table",
"trg_word_emb_table", )
# Names of position encoding table which will be initialized externally.
pos_enc_param_names = (
"src_pos_enc_table",
"trg_pos_enc_table", )
# separated inputs for different usages.
encoder_data_input_fields = (
"src_word",
"src_pos",
"src_slf_attn_bias", )
decoder_data_input_fields = (
"trg_word",
"trg_pos",
"trg_slf_attn_bias",
"trg_src_attn_bias",
"enc_output", )
label_data_input_fields = (
"lbl_word",
"lbl_weight", )
# In fast decoder, trg_pos (only containing the current time step) is generated
# by ops and trg_slf_attn_bias is not needed.
fast_decoder_data_input_fields = (
"trg_word",
"init_score",
"trg_src_attn_bias", )
# fast_decoder_util_input_fields = (
# "trg_slf_attn_pre_softmax_shape_delta",
# "trg_slf_attn_post_softmax_shape_delta", )
#from optim import LearningRateScheduler
class LearningRateScheduler(object):
"""
Wrapper for learning rate scheduling as described in the Transformer paper.
LearningRateScheduler adapts the learning rate externally and the adapted
learning rate will be fed into the main_program as input data.
"""
def __init__(self,
d_model,
warmup_steps,
learning_rate=0.001,
current_steps=0,
name="learning_rate"):
self.current_steps = current_steps
self.warmup_steps = warmup_steps
self.d_model = d_model
self.static_lr = learning_rate
self.learning_rate = layers.create_global_var(
name=name,
shape=[1],
value=float(learning_rate),
dtype="float32",
persistable=True)
def update_learning_rate(self):
self.current_steps += 1
lr_value = np.power(self.d_model, -0.5) * np.min([
np.power(self.current_steps, -0.5),
np.power(self.warmup_steps, -1.5) * self.current_steps
]) * self.static_lr
return np.array([lr_value], dtype="float32")
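In formula form, update_learning_rate implements the warmup schedule from the Transformer paper:
.. math::

    lr(step) = d_{model}^{-0.5} \cdot \min\left(step^{-0.5},\ warmup\_steps^{-1.5} \cdot step\right) \cdot static\_lr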
#from transformer_train import train_loop
def pad_batch_data(insts,
pad_idx,
n_head,
is_target=False,
is_label=False,
return_attn_bias=True,
return_max_len=True,
return_num_token=False):
""" """
Pad the instances to the max sequence length in batch, and generate the Pad the instances to the max sequence length in batch, and generate the
corresponding position data and attention bias. Then, convert the numpy corresponding position data and attention bias.
data to tensors and return a dict mapping names to tensors.
""" """
return_list = []
max_len = max(len(inst) for inst in insts)
num_token = reduce(lambda x, y: x + y,
[len(inst) for inst in insts]) if return_num_token else 0
# Any token included in dict can be used to pad, since the paddings' loss
# will be masked out by weights and make no effect on parameter gradients.
inst_data = np.array(
[inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
return_list += [inst_data.astype("int64").reshape([-1, 1])]
if is_label: # label weight
inst_weight = np.array(
[[1.] * len(inst) + [0.] * (max_len - len(inst)) for inst in insts])
return_list += [inst_weight.astype("float32").reshape([-1, 1])]
else: # position data
inst_pos = np.array([
range(1, len(inst) + 1) + [0] * (max_len - len(inst))
for inst in insts
])
return_list += [inst_pos.astype("int64").reshape([-1, 1])]
if return_attn_bias:
if is_target:
# This is used to avoid attention on paddings and subsequent
# words.
slf_attn_bias_data = np.ones((inst_data.shape[0], max_len, max_len))
slf_attn_bias_data = np.triu(slf_attn_bias_data,
1).reshape([-1, 1, max_len, max_len])
slf_attn_bias_data = np.tile(slf_attn_bias_data,
[1, n_head, 1, 1]) * [-1e9]
else:
# This is used to avoid attention on paddings.
slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] *
(max_len - len(inst))
for inst in insts])
slf_attn_bias_data = np.tile(
slf_attn_bias_data.reshape([-1, 1, 1, max_len]),
[1, n_head, max_len, 1])
return_list += [slf_attn_bias_data.astype("float32")]
if return_max_len:
return_list += [max_len]
if return_num_token:
return_list += [num_token]
return return_list if len(return_list) > 1 else return_list[0]
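A tiny worked example of the padding and masking performed above (the instances and pad_idx are assumptions chosen for illustration):
```
import numpy as np

insts = [[3, 7], [5, 2, 9]]           # two token-id sequences
pad_idx, max_len = 0, 3
padded = np.array([inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
# padded -> [[3, 7, 0], [5, 2, 9]]
# For the decoder (is_target=True), attention on subsequent words is blocked
# with an upper-triangular bias of large negative values:
subsequent_mask = np.triu(np.ones((max_len, max_len)), 1) * -1e9
```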
def prepare_batch_input(insts, data_input_names, src_pad_idx, trg_pad_idx,
n_head, d_model):
"""
Put all padded data needed by training into a dict.
"""
src_word, src_pos, src_slf_attn_bias, src_max_len = pad_batch_data(
[inst[0] for inst in insts], src_pad_idx, n_head, is_target=False)
src_word = src_word.reshape(-1, src_max_len, 1)
src_pos = src_pos.reshape(-1, src_max_len, 1)
trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = pad_batch_data(
[inst[1] for inst in insts], trg_pad_idx, n_head, is_target=True)
trg_word = trg_word.reshape(-1, trg_max_len, 1)
trg_pos = trg_pos.reshape(-1, trg_max_len, 1)
def __pad_batch_data(insts,
pad_idx,
is_target=False,
return_pos=True,
return_attn_bias=True,
return_max_len=True):
"""
Pad the instances to the max sequence length in batch, and generate the
corresponding position data and attention bias.
"""
return_list = []
max_len = max(len(inst) for inst in insts)
inst_data = np.array(
[inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
return_list += [inst_data.astype("int64").reshape([-1, 1])]
if return_pos:
inst_pos = np.array([[
pos_i + 1 if w_i != pad_idx else 0
for pos_i, w_i in enumerate(inst)
] for inst in inst_data])
return_list += [inst_pos.astype("int64").reshape([-1, 1])]
if return_attn_bias:
if is_target:
# This is used to avoid attention on paddings and subsequent
# words.
slf_attn_bias_data = np.ones((inst_data.shape[0], max_len,
max_len))
slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape(
[-1, 1, max_len, max_len])
slf_attn_bias_data = np.tile(slf_attn_bias_data,
[1, n_head, 1, 1]) * [-1e9]
else:
# This is used to avoid attention on paddings.
slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] *
(max_len - len(inst))
for inst in insts])
slf_attn_bias_data = np.tile(
slf_attn_bias_data.reshape([-1, 1, 1, max_len]),
[1, n_head, max_len, 1])
return_list += [slf_attn_bias_data.astype("float32")]
if return_max_len:
return_list += [max_len]
return return_list if len(return_list) > 1 else return_list[0]
src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data(
[inst[0] for inst in insts], src_pad_idx, is_target=False)
trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data(
[inst[1] for inst in insts], trg_pad_idx, is_target=True)
trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :], trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
[1, 1, trg_max_len, 1]).astype("float32") [1, 1, trg_max_len, 1]).astype("float32")
lbl_word = __pad_batch_data([inst[2] for inst in insts], trg_pad_idx, False,
False, False, False)
lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1])
lbl_word, lbl_weight, num_token = pad_batch_data(
[inst[2] for inst in insts],
trg_pad_idx,
n_head,
is_target=False,
is_label=True,
return_attn_bias=False,
return_max_len=False,
return_num_token=True)
data_input_dict = dict(
zip(data_input_names, [
src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos,
trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
]))
return data_input_dict, np.asarray([num_token], dtype="float32")
def read_multiple(reader, count, clip_last=True):
"""
Stack data from reader for multi-devices.
"""
def __impl__():
res = []
for item in reader():
res.append(item)
if len(res) == count:
yield res
res = []
if len(res) == count:
yield res
elif not clip_last:
data = []
for item in res:
data += item
if len(data) > count:
inst_num_per_part = len(data) // count
yield [
data[inst_num_per_part * i:inst_num_per_part * (i + 1)]
for i in range(count)
]
return __impl__
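A small usage sketch of read_multiple (the toy reader is an assumption for illustration): it groups count consecutive batches per yield, one for each device.
```
def toy_reader():
    for i in range(5):
        yield [i]

grouped = read_multiple(toy_reader, count=2)
print(list(grouped()))  # [[[0], [1]], [[2], [3]]] -- the incomplete tail is clipped
```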
def split_data(data, num_part):
"""
Split data for each device.
"""
if len(data) == num_part:
return data
data = data[0]
inst_num_per_part = len(data) // num_part
return [ return [
src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias, data[inst_num_per_part * i:inst_num_per_part * (i + 1)]
trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight for i in range(num_part)
] ]
def transformer(use_feed): def test_context(train_progm, avg_cost, train_exe, dev_count, data_input_names,
assert not use_feed, "transfomer doesn't support feed yet" sum_cost, token_num):
return transformer_model.transformer( # Context to do validation.
ModelHyperParams.src_vocab_size + 1, test_program = train_progm.clone()
ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1, with fluid.program_guard(test_program):
ModelHyperParams.n_layer, ModelHyperParams.n_head, test_program = fluid.io.get_inference_program([avg_cost])
ModelHyperParams.d_key, ModelHyperParams.d_value,
ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, val_data = DataReader(
ModelHyperParams.dropout, ModelHyperParams.src_pad_idx, src_vocab_fpath=TrainTaskConfig.src_vocab_fpath,
ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx) trg_vocab_fpath=TrainTaskConfig.trg_vocab_fpath,
fpattern=TrainTaskConfig.val_file_pattern,
token_delimiter=TrainTaskConfig.token_delimiter,
use_token_batch=TrainTaskConfig.use_token_batch,
batch_size=TrainTaskConfig.batch_size *
(1 if TrainTaskConfig.use_token_batch else dev_count),
pool_size=TrainTaskConfig.pool_size,
sort_type=TrainTaskConfig.sort_type,
start_mark=TrainTaskConfig.special_token[0],
end_mark=TrainTaskConfig.special_token[1],
unk_mark=TrainTaskConfig.special_token[2],
# count start and end tokens out
max_length=ModelHyperParams.max_length - 2,
clip_last_batch=False,
shuffle=False,
shuffle_batch=False)
build_strategy = fluid.BuildStrategy()
strategy = fluid.ExecutionStrategy()
strategy.num_threads = 1
test_exe = fluid.ParallelExecutor(
use_cuda=TrainTaskConfig.use_gpu,
main_program=test_program,
share_vars_from=train_exe,
build_strategy=build_strategy,
exec_strategy=strategy)
def test(exe=test_exe):
test_total_cost = 0
test_total_token = 0
test_data = read_multiple(
reader=val_data.batch_generator,
count=dev_count if TrainTaskConfig.use_token_batch else 1)
for batch_id, data in enumerate(test_data()):
feed_list = []
for place_id, data_buffer in enumerate(
split_data(
data, num_part=dev_count)):
data_input_dict, _ = prepare_batch_input(
data_buffer, data_input_names, ModelHyperParams.eos_idx,
ModelHyperParams.eos_idx, ModelHyperParams.n_head,
ModelHyperParams.d_model)
feed_list.append(data_input_dict)
outs = exe.run(feed=feed_list,
fetch_list=[sum_cost.name, token_num.name])
sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
test_total_cost += sum_cost_val.sum()
test_total_token += token_num_val.sum()
test_avg_cost = test_total_cost / test_total_token
test_ppl = np.exp([min(test_avg_cost, 100)])
return test_avg_cost, test_ppl
return test
def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
token_num, predict):
# Initialize the parameters.
if TrainTaskConfig.ckpt_path:
lr_scheduler.current_steps = TrainTaskConfig.start_step
else:
exe.run(fluid.framework.default_startup_program())
train_data = DataReader(
src_vocab_fpath=TrainTaskConfig.src_vocab_fpath,
trg_vocab_fpath=TrainTaskConfig.trg_vocab_fpath,
fpattern=TrainTaskConfig.train_file_pattern,
token_delimiter=TrainTaskConfig.token_delimiter,
use_token_batch=TrainTaskConfig.use_token_batch,
batch_size=TrainTaskConfig.batch_size *
(1 if TrainTaskConfig.use_token_batch else dev_count),
pool_size=TrainTaskConfig.pool_size,
sort_type=TrainTaskConfig.sort_type,
shuffle=TrainTaskConfig.shuffle,
shuffle_batch=TrainTaskConfig.shuffle_batch,
start_mark=TrainTaskConfig.special_token[0],
end_mark=TrainTaskConfig.special_token[1],
unk_mark=TrainTaskConfig.special_token[2],
# count start and end tokens out
max_length=ModelHyperParams.max_length - 2,
clip_last_batch=False)
train_data = read_multiple(
reader=train_data.batch_generator,
count=dev_count if TrainTaskConfig.use_token_batch else 1)
build_strategy = fluid.BuildStrategy()
# Since the token number differs among devices, customize the gradient scale to
# use the token-averaged cost across devices; the gradient scale is
# `1 / token_number` for the average cost.
build_strategy.gradient_scale_strategy = fluid.BuildStrategy.GradientScaleStrategy.Customized
strategy = fluid.ExecutionStrategy()
strategy.num_threads = 1
train_exe = fluid.ParallelExecutor(
use_cuda=TrainTaskConfig.use_gpu,
loss_name=sum_cost.name,
main_program=train_progm,
build_strategy=build_strategy,
exec_strategy=strategy)
data_input_names = encoder_data_input_fields + decoder_data_input_fields[:
-1] + label_data_input_fields
if TrainTaskConfig.val_file_pattern is not None:
test = test_context(train_progm, avg_cost, train_exe, dev_count,
data_input_names, sum_cost, token_num)
# the best cross-entropy value with label smoothing
loss_normalizer = -((1. - TrainTaskConfig.label_smooth_eps) * np.log(
(1. - TrainTaskConfig.label_smooth_eps
)) + TrainTaskConfig.label_smooth_eps *
np.log(TrainTaskConfig.label_smooth_eps / (
ModelHyperParams.trg_vocab_size - 1) + 1e-20))
init = False
for pass_id in xrange(TrainTaskConfig.pass_num):
pass_start_time = time.time()
for batch_id, data in enumerate(train_data()):
if batch_id >= 5:
break
feed_list = []
total_num_token = 0
#if TrainTaskConfig.local:
# lr_rate = lr_scheduler.update_learning_rate()
#for place_id, data_buffer in enumerate(
# split_data(
# data, num_part=dev_count)):
if TrainTaskConfig.local:
lr_rate = lr_scheduler.update_learning_rate()
for place_id, data_buffer in enumerate(
split_data(
data, num_part=dev_count)):
data_input_dict, num_token = prepare_batch_input(
data_buffer, data_input_names, ModelHyperParams.eos_idx,
ModelHyperParams.eos_idx, ModelHyperParams.n_head,
ModelHyperParams.d_model)
total_num_token += num_token
feed_kv_pairs = data_input_dict.items()
if TrainTaskConfig.local:
feed_kv_pairs += {
lr_scheduler.learning_rate.name: lr_rate
}.items()
feed_list.append(dict(feed_kv_pairs))
if not init:
for pos_enc_param_name in pos_enc_param_names:
pos_enc = position_encoding_init(
ModelHyperParams.max_length + 1,
ModelHyperParams.d_model)
feed_list[place_id][pos_enc_param_name] = pos_enc
if not TrainTaskConfig.check_acc:
for feed_dict in feed_list:
feed_dict[sum_cost.name + "@GRAD"] = 1. / total_num_token
else:
b = 100 * TrainTaskConfig.batch_size
a = np.asarray([b], dtype="float32")
for feed_dict in feed_list:
feed_dict[sum_cost.name + "@GRAD"] = 1. / a
outs = train_exe.run(fetch_list=[sum_cost.name, token_num.name],
feed=feed_list)
sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
total_sum_cost = sum_cost_val.sum()
total_token_num = token_num_val.sum()
total_avg_cost = total_sum_cost / total_token_num
init = True
# Validate and save the model for inference.
if TrainTaskConfig.val_file_pattern is not None:
val_avg_cost, val_ppl = test()
print("[%f]" % val_avg_cost)
else:
assert (False)
#import transformer_reader as reader
class SortType(object):
GLOBAL = 'global'
POOL = 'pool'
NONE = "none"
class Converter(object):
def __init__(self, vocab, beg, end, unk, delimiter):
self._vocab = vocab
self._beg = beg
self._end = end
self._unk = unk
self._delimiter = delimiter
def __call__(self, sentence):
return [self._beg] + [
self._vocab.get(w, self._unk)
for w in sentence.split(self._delimiter)
] + [self._end]
class ComposedConverter(object):
def __init__(self, converters):
self._converters = converters
def __call__(self, parallel_sentence):
return [
self._converters[i](parallel_sentence[i])
for i in range(len(self._converters))
]
class SentenceBatchCreator(object):
def __init__(self, batch_size):
self.batch = []
self._batch_size = batch_size
def append(self, info):
self.batch.append(info)
if len(self.batch) == self._batch_size:
tmp = self.batch
self.batch = []
return tmp
class TokenBatchCreator(object):
def __init__(self, batch_size):
self.batch = []
self.max_len = -1
self._batch_size = batch_size
def append(self, info):
cur_len = info.max_len
max_len = max(self.max_len, cur_len)
if max_len * (len(self.batch) + 1) > self._batch_size:
result = self.batch
self.batch = [info]
self.max_len = cur_len
return result
else:
self.max_len = max_len
self.batch.append(info)
class SampleInfo(object):
def __init__(self, i, max_len, min_len):
self.i = i
self.min_len = min_len
self.max_len = max_len
class MinMaxFilter(object):
def __init__(self, max_len, min_len, underlying_creator):
self._min_len = min_len
self._max_len = max_len
self._creator = underlying_creator
def append(self, info):
if info.max_len > self._max_len or info.min_len < self._min_len:
return
else:
return self._creator.append(info)
@property
def batch(self):
return self._creator.batch
class DataReader(object):
"""
The data reader loads all data from files and produces batches of data
according to the given settings.
An example of returning a generator producing data batches whose data
is shuffled in each pass and sorted in each pool:
```
train_data = DataReader(
src_vocab_fpath='data/src_vocab_file',
trg_vocab_fpath='data/trg_vocab_file',
fpattern='data/part-*',
use_token_batch=True,
batch_size=2000,
pool_size=10000,
sort_type=SortType.POOL,
shuffle=True,
shuffle_batch=True,
start_mark='<s>',
end_mark='<e>',
unk_mark='<unk>',
clip_last_batch=False).batch_generator
```
:param src_vocab_fpath: The path of vocabulary file of source language.
:type src_vocab_fpath: basestring
:param trg_vocab_fpath: The path of vocabulary file of target language.
:type trg_vocab_fpath: basestring
:param fpattern: The pattern to match data files.
:type fpattern: basestring
:param batch_size: The number of sequences contained in a mini-batch.
or the maximum number of tokens (include paddings) contained in a
mini-batch.
:type batch_size: int
:param pool_size: The size of pool buffer.
:type pool_size: int
:param sort_type: The granularity of sorting by length: 'global' for all
instances; 'pool' for instances within a pool; 'none' for no sort.
:type sort_type: basestring
:param clip_last_batch: Whether to clip the last incomplete batch.
:type clip_last_batch: bool
:param tar_fname: The data file in tar if fpattern matches a tar file.
:type tar_fname: basestring
:param min_length: The minimum length used to filter sequences.
:type min_length: int
:param max_length: The maximum length used to filter sequences.
:type max_length: int
:param shuffle: Whether to shuffle all instances.
:type shuffle: bool
:param shuffle_batch: Whether to shuffle the generated batches.
:type shuffle_batch: bool
:param use_token_batch: Whether to produce batch data according to
token number.
:type use_token_batch: bool
:param field_delimiter: The delimiter used to split source and target in
each line of data file.
:type field_delimiter: basestring
:param token_delimiter: The delimiter used to split tokens in source or
target sentences.
:type token_delimiter: basestring
:param start_mark: The token representing the beginning of
sentences in the dictionary.
:type start_mark: basestring
:param end_mark: The token representing the end of sentences
in the dictionary.
:type end_mark: basestring
:param unk_mark: The token representing an unknown word in the dictionary.
:type unk_mark: basestring
:param seed: The seed for random.
:type seed: int
"""
def __init__(self,
src_vocab_fpath,
trg_vocab_fpath,
fpattern,
batch_size,
pool_size,
sort_type=SortType.GLOBAL,
clip_last_batch=True,
tar_fname=None,
min_length=0,
max_length=100,
shuffle=True,
shuffle_batch=False,
use_token_batch=False,
field_delimiter="\t",
token_delimiter=" ",
start_mark="<s>",
end_mark="<e>",
unk_mark="<unk>",
seed=0):
self._src_vocab = self.load_dict(src_vocab_fpath)
self._only_src = True
if trg_vocab_fpath is not None:
self._trg_vocab = self.load_dict(trg_vocab_fpath)
self._only_src = False
self._pool_size = pool_size
self._batch_size = batch_size
self._use_token_batch = use_token_batch
self._sort_type = sort_type
self._clip_last_batch = clip_last_batch
self._shuffle = shuffle
self._shuffle_batch = shuffle_batch
self._min_length = min_length
self._max_length = max_length
self._field_delimiter = field_delimiter
self._token_delimiter = token_delimiter
self.load_src_trg_ids(end_mark, fpattern, start_mark, tar_fname,
unk_mark)
self._random = random.Random(x=seed)
def load_src_trg_ids(self, end_mark, fpattern, start_mark, tar_fname,
unk_mark):
converters = [
Converter(
vocab=self._src_vocab,
beg=self._src_vocab[start_mark],
end=self._src_vocab[end_mark],
unk=self._src_vocab[unk_mark],
delimiter=self._token_delimiter)
]
if not self._only_src:
converters.append(
Converter(
vocab=self._trg_vocab,
beg=self._trg_vocab[start_mark],
end=self._trg_vocab[end_mark],
unk=self._trg_vocab[unk_mark],
delimiter=self._token_delimiter))
converters = ComposedConverter(converters)
self._src_seq_ids = []
self._trg_seq_ids = None if self._only_src else []
self._sample_infos = []
for i, line in enumerate(self._load_lines(fpattern, tar_fname)):
src_trg_ids = converters(line)
self._src_seq_ids.append(src_trg_ids[0])
lens = [len(src_trg_ids[0])]
if not self._only_src:
self._trg_seq_ids.append(src_trg_ids[1])
lens.append(len(src_trg_ids[1]))
self._sample_infos.append(SampleInfo(i, max(lens), min(lens)))
def _load_lines(self, fpattern, tar_fname):
fpaths = glob.glob(fpattern)
if len(fpaths) == 1 and tarfile.is_tarfile(fpaths[0]):
if tar_fname is None:
raise Exception("If tar file provided, please set tar_fname.")
f = tarfile.open(fpaths[0], "r")
for line in f.extractfile(tar_fname):
fields = line.strip("\n").split(self._field_delimiter)
if (not self._only_src and len(fields) == 2) or (
self._only_src and len(fields) == 1):
yield fields
else:
for fpath in fpaths:
if not os.path.isfile(fpath):
raise IOError("Invalid file: %s" % fpath)
with open(fpath, "r") as f:
for line in f:
fields = line.strip("\n").split(self._field_delimiter)
if (not self._only_src and len(fields) == 2) or (
self._only_src and len(fields) == 1):
yield fields
@staticmethod
def load_dict(dict_path, reverse=False):
word_dict = {}
with open(dict_path, "r") as fdict:
for idx, line in enumerate(fdict):
if reverse:
word_dict[idx] = line.strip("\n")
else:
word_dict[line.strip("\n")] = idx
return word_dict
def batch_generator(self):
# global sort or global shuffle
if self._sort_type == SortType.GLOBAL:
infos = sorted(
self._sample_infos, key=lambda x: x.max_len, reverse=True)
else:
if self._shuffle:
infos = self._sample_infos
self._random.shuffle(infos)
else:
infos = self._sample_infos
if self._sort_type == SortType.POOL:
for i in range(0, len(infos), self._pool_size):
infos[i:i + self._pool_size] = sorted(
infos[i:i + self._pool_size], key=lambda x: x.max_len)
# concat batch
batches = []
batch_creator = TokenBatchCreator(
self._batch_size
) if self._use_token_batch else SentenceBatchCreator(self._batch_size)
batch_creator = MinMaxFilter(self._max_length, self._min_length,
batch_creator)
for info in infos:
batch = batch_creator.append(info)
if batch is not None:
batches.append(batch)
if not self._clip_last_batch and len(batch_creator.batch) != 0:
batches.append(batch_creator.batch)
if self._shuffle_batch:
self._random.shuffle(batches)
for batch in batches:
batch_ids = [info.i for info in batch]
if self._only_src:
yield [[self._src_seq_ids[idx]] for idx in batch_ids]
else:
yield [(self._src_seq_ids[idx], self._trg_seq_ids[idx][:-1],
self._trg_seq_ids[idx][1:]) for idx in batch_ids]
#from transformer_model import transformer
def position_encoding_init(n_position, d_pos_vec):
"""
Generate the initial values for the sinusoid position encoding table.
"""
position_enc = np.array([[
pos / np.power(10000, 2 * (j // 2) / d_pos_vec)
for j in range(d_pos_vec)
] if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)])
position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i
position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1
return position_enc.astype("float32")
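Written out, the table generated above is the standard sinusoidal encoding, with row 0 (the padding position) left as zeros:
.. math::

    PE(pos, 2i) = \sin\left(pos / 10000^{2i/d}\right), \qquad
    PE(pos, 2i+1) = \cos\left(pos / 10000^{2i/d}\right)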
def multi_head_attention(queries,
keys,
values,
attn_bias,
d_key,
d_value,
d_model,
n_head=1,
dropout_rate=0.,
cache=None):
"""
Multi-Head Attention. Note that attn_bias is added to the logits before
computing the softmax activation to mask certain selected positions so that
they will not be considered in the attention weights.
"""
if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
raise ValueError(
"Inputs: quries, keys and values should all be 3-D tensors.")
def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
"""
Add linear projection to queries, keys, and values.
"""
q = layers.fc(input=queries,
size=d_key * n_head,
num_flatten_dims=2,
param_attr=const_para_attr,
bias_attr=const_bias_attr)
k = layers.fc(input=keys,
size=d_key * n_head,
num_flatten_dims=2,
param_attr=const_para_attr,
bias_attr=const_bias_attr)
v = layers.fc(input=values,
size=d_value * n_head,
num_flatten_dims=2,
param_attr=const_para_attr,
bias_attr=const_bias_attr)
return q, k, v
def __split_heads(x, n_head):
"""
Reshape the last dimension of input tensor x so that it becomes two
dimensions and then transpose. Specifically, input a tensor with shape
[bs, max_sequence_length, n_head * hidden_dim] then output a tensor
with shape [bs, n_head, max_sequence_length, hidden_dim].
"""
if n_head == 1:
return x
hidden_size = x.shape[-1]
# The value 0 in shape attr means copying the corresponding dimension
# size of the input as the output dimension size.
reshaped = layers.reshape(
x=x, shape=[0, 0, n_head, hidden_size // n_head])
# permute the dimensions into:
# [batch_size, n_head, max_sequence_len, hidden_size_per_head]
return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
def __combine_heads(x):
"""
Transpose and then reshape the last two dimensions of input tensor x
so that it becomes one dimension, which is reverse to __split_heads.
"""
if len(x.shape) == 3: return x
if len(x.shape) != 4:
raise ValueError("Input(x) should be a 4-D Tensor.")
trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
# The value 0 in shape attr means copying the corresponding dimension
# size of the input as the output dimension size.
return layers.reshape(
x=trans_x,
shape=map(int, [0, 0, trans_x.shape[2] * trans_x.shape[3]]))
def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
"""
Scaled Dot-Product Attention
"""
scaled_q = layers.scale(x=q, scale=d_model**-0.5)
product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
if attn_bias:
product += attn_bias
weights = layers.softmax(product)
if dropout_rate:
weights = layers.dropout(
weights,
dropout_prob=dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False)
out = layers.matmul(weights, v)
return out
q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
if cache is not None: # use cache and concat time steps
k = cache["k"] = layers.concat([cache["k"], k], axis=1)
v = cache["v"] = layers.concat([cache["v"], v], axis=1)
q = __split_heads(q, n_head)
k = __split_heads(k, n_head)
v = __split_heads(v, n_head)
ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_model,
dropout_rate)
out = __combine_heads(ctx_multiheads)
# Project back to the model size.
proj_out = layers.fc(input=out,
size=d_model,
num_flatten_dims=2,
param_attr=const_para_attr,
bias_attr=const_bias_attr)
return proj_out
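In equation form, the inner scaled_dot_product_attention above computes (note that this test scales by the d_model value passed in, rather than d_key):
.. math::

    \mathrm{Attention}(Q, K, V) = \mathrm{softmax}\left(\frac{QK^{\top}}{\sqrt{d}} + attn\_bias\right)V

The per-head results are then concatenated by __combine_heads and projected back to d_model.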
def positionwise_feed_forward(x, d_inner_hid, d_hid):
"""
Position-wise Feed-Forward Networks.
This module consists of two linear transformations with a ReLU activation
in between, which is applied to each position separately and identically.
"""
hidden = layers.fc(input=x,
size=d_inner_hid,
num_flatten_dims=2,
act="relu",
param_attr=const_para_attr,
bias_attr=const_bias_attr)
out = layers.fc(input=hidden,
size=d_hid,
num_flatten_dims=2,
param_attr=const_para_attr,
bias_attr=const_bias_attr)
return out
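Equivalently, with W_1 mapping d_model to d_inner_hid and W_2 mapping back to d_hid:
.. math::

    \mathrm{FFN}(x) = \max(0,\ xW_1 + b_1)\,W_2 + b_2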
def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.):
"""
Add residual connection, layer normalization and dropout to the out tensor
optionally according to the value of process_cmd.
This will be used before or after multi-head attention and position-wise
feed-forward networks.
"""
for cmd in process_cmd:
if cmd == "a": # add residual connection
out = out + prev_out if prev_out else out
elif cmd == "n": # add layer normalization
out = layers.layer_norm(
out,
begin_norm_axis=len(out.shape) - 1,
param_attr=fluid.initializer.Constant(1.),
bias_attr=fluid.initializer.Constant(0.))
elif cmd == "d": # add dropout
if dropout_rate:
out = layers.dropout(
out,
dropout_prob=dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False)
return out
pre_process_layer = partial(pre_post_process_layer, None)
post_process_layer = pre_post_process_layer
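For reference, a pure-Python sketch of how the command string is interpreted (the lambdas are stand-ins for the fluid dropout and layer_norm ops used above):
```
def apply_cmds(prev_out, out, cmds, dropout=lambda t: t, layer_norm=lambda t: t):
    for cmd in cmds:
        if cmd == "d":      # dropout
            out = dropout(out)
        elif cmd == "a":    # residual connection
            out = out + prev_out if prev_out is not None else out
        elif cmd == "n":    # layer normalization
            out = layer_norm(out)
    return out

print(apply_cmds(1.0, 2.0, "dan"))  # 3.0: dropout, then residual add, then layer norm
```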
def prepare_encoder(src_word,
src_pos,
src_vocab_size,
src_emb_dim,
src_max_len,
dropout_rate=0.,
word_emb_param_name=None,
pos_enc_param_name=None):
"""Add word embeddings and position encodings.
The output tensor has a shape of:
[batch_size, max_src_length_in_batch, d_model].
This module is used at the bottom of the encoder stacks.
"""
if TrainTaskConfig.check_acc:
src_word_emb = layers.embedding(
src_word,
size=[src_vocab_size, src_emb_dim],
param_attr=fluid.ParamAttr(
name=word_emb_param_name,
initializer=fluid.initializer.ConstantInitializer(0.001)))
else:
src_word_emb = layers.embedding(
src_word,
size=[src_vocab_size, src_emb_dim],
param_attr=fluid.ParamAttr(
name=word_emb_param_name,
initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5)))
src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim**0.5)
src_pos_enc = layers.embedding(
src_pos,
size=[src_max_len, src_emb_dim],
param_attr=fluid.ParamAttr(
name=pos_enc_param_name,
trainable=False,
initializer=fluid.initializer.ConstantInitializer(0.001)))
enc_input = src_word_emb + src_pos_enc
return layers.dropout(
enc_input,
dropout_prob=dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False) if dropout_rate else enc_input
prepare_encoder = partial(
prepare_encoder, pos_enc_param_name=pos_enc_param_names[0])
prepare_decoder = partial(
prepare_encoder, pos_enc_param_name=pos_enc_param_names[1])
def encoder_layer(enc_input,
attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate=0.):
"""The encoder layers that can be stacked to form a deep encoder.
This module consists of a multi-head (self) attention followed by
position-wise feed-forward networks, with both components wrapped by
post_process_layer to add residual connection, layer normalization
and dropout.
"""
attn_output = multi_head_attention(enc_input, enc_input, enc_input,
attn_bias, d_key, d_value, d_model,
n_head, dropout_rate)
attn_output = post_process_layer(enc_input, attn_output, "dan",
dropout_rate)
ffd_output = positionwise_feed_forward(attn_output, d_inner_hid, d_model)
return post_process_layer(attn_output, ffd_output, "dan", dropout_rate)
def encoder(enc_input,
attn_bias,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate=0.):
"""
The encoder is composed of a stack of identical layers returned by calling
encoder_layer.
"""
for i in range(n_layer):
enc_output = encoder_layer(enc_input, attn_bias, n_head, d_key, d_value,
d_model, d_inner_hid, dropout_rate)
enc_input = enc_output
return enc_output
def decoder_layer(dec_input,
enc_output,
slf_attn_bias,
dec_enc_attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate=0.,
cache=None):
""" The layer to be stacked in decoder part.
The structure of this module is similar to that in the encoder part except
a multi-head attention is added to implement encoder-decoder attention.
"""
slf_attn_output = multi_head_attention(
dec_input,
dec_input,
dec_input,
slf_attn_bias,
d_key,
d_value,
d_model,
n_head,
dropout_rate,
cache, )
slf_attn_output = post_process_layer(
dec_input,
slf_attn_output,
"dan", # residual connection + dropout + layer normalization
dropout_rate, )
enc_attn_output = multi_head_attention(
slf_attn_output,
enc_output,
enc_output,
dec_enc_attn_bias,
d_key,
d_value,
d_model,
n_head,
dropout_rate, )
enc_attn_output = post_process_layer(
slf_attn_output,
enc_attn_output,
"dan", # residual connection + dropout + layer normalization
dropout_rate, )
ffd_output = positionwise_feed_forward(
enc_attn_output,
d_inner_hid,
d_model, )
dec_output = post_process_layer(
enc_attn_output,
ffd_output,
"dan", # residual connection + dropout + layer normalization
dropout_rate, )
return dec_output
def get_model(): def decoder(dec_input,
avg_cost = transformer(use_feed=False) enc_output,
optimizer = fluid.optimizer.Adam() dec_slf_attn_bias,
optimizer.minimize(avg_cost) dec_enc_attn_bias,
fluid.memory_optimize(fluid.default_main_program()) n_layer,
return avg_cost n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate=0.,
caches=None):
"""
The decoder is composed of a stack of identical decoder_layer layers.
"""
for i in range(n_layer):
cache = None
if caches is not None:
cache = caches[i]
dec_output = decoder_layer(
dec_input,
enc_output,
dec_slf_attn_bias,
dec_enc_attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate,
cache=cache)
dec_input = dec_output
return dec_output
def make_all_inputs(input_fields):
"""
Define the input data layers for the transformer model.
"""
inputs = []
for input_field in input_fields:
input_var = layers.data(
name=input_field,
shape=input_descs[input_field][0],
dtype=input_descs[input_field][1],
lod_level=input_descs[input_field][2]
if len(input_descs[input_field]) == 3 else 0,
append_batch_size=False)
inputs.append(input_var)
return inputs
def transformer(
src_vocab_size,
trg_vocab_size,
max_length,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate,
weight_sharing,
label_smooth_eps, ):
if weight_sharing:
assert src_vocab_size == trg_vocab_size, (
"Vocabularies in source and target should be same for weight sharing."
)
enc_inputs = make_all_inputs(encoder_data_input_fields)
enc_output = wrap_encoder(
src_vocab_size,
max_length,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate,
weight_sharing,
enc_inputs, )
dec_inputs = make_all_inputs(decoder_data_input_fields[:-1])
predict = wrap_decoder(
trg_vocab_size,
max_length,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate,
weight_sharing,
dec_inputs,
enc_output, )
# Padding indices do not contribute to the total loss. The weights are used to
# cancel out padding indices when calculating the loss.
label, weights = make_all_inputs(label_data_input_fields)
if label_smooth_eps:
label = layers.label_smooth(
label=layers.one_hot(
input=label, depth=trg_vocab_size),
epsilon=label_smooth_eps)
cost = layers.softmax_with_cross_entropy(
logits=layers.reshape(
predict, shape=[-1, trg_vocab_size]),
label=label,
soft_label=True if label_smooth_eps else False)
weighted_cost = cost * weights
sum_cost = layers.reduce_sum(weighted_cost)
token_num = layers.reduce_sum(weights)
avg_cost = sum_cost / token_num
avg_cost.stop_gradient = True
return sum_cost, avg_cost, predict, token_num
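The cost computed above is the padding-weighted cross entropy, with the targets optionally label-smoothed toward a uniform distribution when label_smooth_eps > 0:
.. math::

    sum\_cost = \sum_{t} w_t \cdot \mathrm{CE}\big(\mathrm{softmax}(logits_t),\ \tilde{y}_t\big), \qquad
    avg\_cost = \frac{sum\_cost}{\sum_t w_t}

where :math:`w_t` is 1 for real target tokens and 0 for padding.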
def wrap_encoder(src_vocab_size,
max_length,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate,
weight_sharing,
enc_inputs=None):
"""
The wrapper assembles together all needed layers for the encoder.
"""
if enc_inputs is None:
# This is used to implement independent encoder program in inference.
src_word, src_pos, src_slf_attn_bias = \
make_all_inputs(encoder_data_input_fields)
else:
src_word, src_pos, src_slf_attn_bias = \
enc_inputs
enc_input = prepare_encoder(
src_word,
src_pos,
src_vocab_size,
d_model,
max_length,
dropout_rate,
word_emb_param_name=word_emb_param_names[0])
enc_output = encoder(enc_input, src_slf_attn_bias, n_layer, n_head, d_key,
d_value, d_model, d_inner_hid, dropout_rate)
return enc_output
def wrap_decoder(trg_vocab_size,
max_length,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate,
weight_sharing,
dec_inputs=None,
enc_output=None,
caches=None):
"""
The wrapper assembles together all needed layers for the decoder.
"""
if dec_inputs is None:
# This is used to implement independent decoder program in inference.
trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
enc_output = make_all_inputs(
decoder_data_input_fields)
else:
trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs
dec_input = prepare_decoder(
trg_word,
trg_pos,
trg_vocab_size,
d_model,
max_length,
dropout_rate,
word_emb_param_name=word_emb_param_names[0]
if weight_sharing else word_emb_param_names[1])
dec_output = decoder(
dec_input,
enc_output,
trg_slf_attn_bias,
trg_src_attn_bias,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate,
caches=caches)
# Return logits for training and probs for inference.
if weight_sharing:
predict = layers.matmul(
x=dec_output,
y=fluid.get_var(word_emb_param_names[0]),
transpose_y=True)
else:
predict = layers.fc(input=dec_output,
size=trg_vocab_size,
num_flatten_dims=2,
param_attr=const_para_attr,
bias_attr=const_bias_attr)
if dec_inputs is None:
predict = layers.softmax(predict)
return predict
def fast_decode(
src_vocab_size,
trg_vocab_size,
max_in_len,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate,
weight_sharing,
beam_size,
max_out_len,
eos_idx, ):
"""
Use beam search to decode. Caches will be used to store the states of
previous steps, which makes decoding faster.
"""
enc_output = wrap_encoder(src_vocab_size, max_in_len, n_layer, n_head,
d_key, d_value, d_model, d_inner_hid,
dropout_rate, weight_sharing)
start_tokens, init_scores, trg_src_attn_bias = \
make_all_inputs(fast_decoder_data_input_fields)
def beam_search():
max_len = layers.fill_constant(
shape=[1], dtype=start_tokens.dtype, value=max_out_len)
step_idx = layers.fill_constant(
shape=[1], dtype=start_tokens.dtype, value=0)
cond = layers.less_than(x=step_idx, y=max_len)
while_op = layers.While(cond)
# array states will be stored for each step.
ids = layers.array_write(
layers.reshape(start_tokens, (-1, 1)), step_idx)
scores = layers.array_write(init_scores, step_idx)
# cell states will be overwritten at each step.
# caches contains states of history steps to reduce redundant
# computation in decoder.
caches = [{
"k": layers.fill_constant_batch_size_like(
input=start_tokens,
shape=[-1, 0, d_model],
dtype=enc_output.dtype,
value=0),
"v": layers.fill_constant_batch_size_like(
input=start_tokens,
shape=[-1, 0, d_model],
dtype=enc_output.dtype,
value=0)
} for i in range(n_layer)]
with while_op.block():
pre_ids = layers.array_read(array=ids, i=step_idx)
pre_ids = layers.reshape(pre_ids, (-1, 1, 1))
pre_scores = layers.array_read(array=scores, i=step_idx)
            # sequence_expand can gather sequences according to lod, so it can be
            # used in beam search to select the states of the chosen candidate ids.
pre_src_attn_bias = layers.sequence_expand(
x=trg_src_attn_bias, y=pre_scores)
pre_enc_output = layers.sequence_expand(x=enc_output, y=pre_scores)
pre_caches = [{
"k": layers.sequence_expand(
x=cache["k"], y=pre_scores),
"v": layers.sequence_expand(
x=cache["v"], y=pre_scores),
} for cache in caches]
pre_pos = layers.elementwise_mul(
x=layers.fill_constant_batch_size_like(
                    input=pre_enc_output,  # can't use pre_ids here since it has lod
value=1,
shape=[-1, 1, 1],
dtype=pre_ids.dtype),
y=layers.increment(
x=step_idx, value=1.0, in_place=False),
axis=0)
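            # pre_pos broadcasts the current position (step_idx + 1) to every beam entry by
            # scaling a batch-size-like tensor of ones.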
logits = wrap_decoder(
trg_vocab_size,
max_in_len,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate,
weight_sharing,
dec_inputs=(pre_ids, pre_pos, None, pre_src_attn_bias),
enc_output=pre_enc_output,
caches=pre_caches)
logits = layers.reshape(logits, (-1, trg_vocab_size))
topk_scores, topk_indices = layers.topk(
input=layers.softmax(logits), k=beam_size)
accu_scores = layers.elementwise_add(
x=layers.log(topk_scores),
y=layers.reshape(
pre_scores, shape=[-1]),
axis=0)
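            # Accumulated beam score: log-probability of each top-k token plus the score of
            # its parent beam.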
# beam_search op uses lod to distinguish branches.
topk_indices = layers.lod_reset(topk_indices, pre_ids)
selected_ids, selected_scores = layers.beam_search(
pre_ids=pre_ids,
pre_scores=pre_scores,
ids=topk_indices,
scores=accu_scores,
beam_size=beam_size,
end_id=eos_idx)
layers.increment(x=step_idx, value=1.0, in_place=True)
# update states
layers.array_write(selected_ids, i=step_idx, array=ids)
layers.array_write(selected_scores, i=step_idx, array=scores)
layers.assign(pre_src_attn_bias, trg_src_attn_bias)
layers.assign(pre_enc_output, enc_output)
for i in range(n_layer):
layers.assign(pre_caches[i]["k"], caches[i]["k"])
layers.assign(pre_caches[i]["v"], caches[i]["v"])
length_cond = layers.less_than(x=step_idx, y=max_len)
finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
layers.logical_and(x=length_cond, y=finish_cond, out=cond)
finished_ids, finished_scores = layers.beam_search_decode(
ids, scores, beam_size=beam_size, end_id=eos_idx)
return finished_ids, finished_scores
finished_ids, finished_scores = beam_search()
return finished_ids, finished_scores
def get_model(is_dist, is_async):
sum_cost, avg_cost, predict, token_num = transformer(
ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
ModelHyperParams.n_head, ModelHyperParams.d_key,
ModelHyperParams.d_value, ModelHyperParams.d_model,
ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
ModelHyperParams.weight_sharing, TrainTaskConfig.label_smooth_eps)
local_lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model,
TrainTaskConfig.warmup_steps,
TrainTaskConfig.learning_rate)
if not is_dist:
optimizer = fluid.optimizer.Adam(
learning_rate=local_lr_scheduler.learning_rate,
beta1=TrainTaskConfig.beta1,
beta2=TrainTaskConfig.beta2,
epsilon=TrainTaskConfig.eps)
optimizer.minimize(sum_cost)
elif is_async:
optimizer = fluid.optimizer.SGD(0.003)
optimizer.minimize(sum_cost)
else:
lr_decay = fluid.layers\
.learning_rate_scheduler\
.noam_decay(ModelHyperParams.d_model,
TrainTaskConfig.warmup_steps)
optimizer = fluid.optimizer.Adam(
learning_rate=lr_decay,
beta1=TrainTaskConfig.beta1,
beta2=TrainTaskConfig.beta2,
epsilon=TrainTaskConfig.eps)
optimizer.minimize(sum_cost)
return sum_cost, avg_cost, predict, token_num, local_lr_scheduler
def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers): def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
...@@ -176,10 +1677,23 @@ def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers): ...@@ -176,10 +1677,23 @@ def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
return t return t
class DistTransformer2x2(object): def update_args():
src_dict = DataReader.load_dict(TrainTaskConfig.src_vocab_fpath)
trg_dict = DataReader.load_dict(TrainTaskConfig.trg_vocab_fpath)
dict_args = [
"src_vocab_size", str(len(src_dict)), "trg_vocab_size",
str(len(trg_dict)), "bos_idx",
str(src_dict[TrainTaskConfig.special_token[0]]), "eos_idx",
str(src_dict[TrainTaskConfig.special_token[1]]), "unk_idx",
str(src_dict[TrainTaskConfig.special_token[2]])
]
merge_cfg_from_list(dict_args, [TrainTaskConfig, ModelHyperParams])
class DistTransformer2x2(TestDistRunnerBase):
def run_pserver(self, pserver_endpoints, trainers, current_endpoint, def run_pserver(self, pserver_endpoints, trainers, current_endpoint,
trainer_id): trainer_id, sync_mode):
get_model() get_model(True, not sync_mode)
t = get_transpiler(trainer_id, t = get_transpiler(trainer_id,
fluid.default_main_program(), pserver_endpoints, fluid.default_main_program(), pserver_endpoints,
trainers) trainers)
...@@ -196,7 +1710,6 @@ class DistTransformer2x2(object): ...@@ -196,7 +1710,6 @@ class DistTransformer2x2(object):
while True: while True:
assert retry_times >= 0, "wait ps ready failed" assert retry_times >= 0, "wait ps ready failed"
time.sleep(3) time.sleep(3)
print("waiting ps ready: ", pid)
try: try:
# the listen_and_serv_op would touch a file which contains the listen port # the listen_and_serv_op would touch a file which contains the listen port
# on the /tmp directory until it was ready to process all the RPC call. # on the /tmp directory until it was ready to process all the RPC call.
...@@ -205,63 +1718,35 @@ class DistTransformer2x2(object): ...@@ -205,63 +1718,35 @@ class DistTransformer2x2(object):
except os.error: except os.error:
retry_times -= 1 retry_times -= 1
def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True): def run_trainer(self,
avg_cost = get_model() place,
endpoints,
trainer_id,
trainers,
is_dist=True,
sync_mode=True):
sum_cost, avg_cost, predict, token_num, local_lr_scheduler = get_model(
is_dist, not sync_mode)
if is_dist: if is_dist:
t = get_transpiler(trainer_id, t = get_transpiler(trainer_id,
fluid.default_main_program(), endpoints, fluid.default_main_program(), endpoints,
trainers) trainers)
trainer_prog = t.get_trainer_program() trainer_prog = t.get_trainer_program()
TrainTaskConfig.batch_size = 10
TrainTaskConfig.train_file_pattern = TrainTaskConfig.data_path + "train.tok.clean.bpe.32000.en-de.train_{}".format(
trainer_id)
else: else:
TrainTaskConfig.batch_size = 20
trainer_prog = fluid.default_main_program() trainer_prog = fluid.default_main_program()
startup_exe = fluid.Executor(place) startup_exe = fluid.Executor(place)
startup_exe.run(fluid.default_startup_program())
TrainTaskConfig.local = not is_dist
strategy = fluid.ExecutionStrategy()
strategy.num_threads = 1 train_loop(startup_exe, trainer_prog, 1, sum_cost, avg_cost,
strategy.allow_op_delay = False local_lr_scheduler, token_num, predict)
exe = fluid.ParallelExecutor(
True, loss_name=avg_cost.name, exec_strategy=strategy)
first_loss, = exe.run(fetch_list=[avg_cost.name])
print(first_loss)
for i in six.moves.xrange(5):
_ = exe.run(fetch_list=[avg_cost.name])
last_loss, = exe.run(fetch_list=[avg_cost.name])
print(last_loss)
def main(role="pserver",
endpoints="127.0.0.1:9123",
trainer_id=0,
current_endpoint="127.0.0.1:9123",
trainers=1,
is_dist=True):
reader = paddle.batch(
wmt16.train(ModelHyperParams.src_vocab_size,
ModelHyperParams.trg_vocab_size),
batch_size=transformer_model.batch_size)
with fluid.recordio_writer.create_recordio_writer(
WMT16_RECORDIO_FILE) as writer:
for batch in reader():
for tensor in prepare_batch_input(
batch, ModelHyperParams.src_pad_idx,
ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head):
t = fluid.LoDTensor()
t.set(tensor, fluid.CPUPlace())
writer.append_tensor(t)
writer.complete_append_tensor()
model = DistTransformer2x2()
if role == "pserver":
model.run_pserver(endpoints, trainers, current_endpoint, trainer_id)
else:
p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
) else fluid.CPUPlace()
model.run_trainer(p, endpoints, trainer_id, trainers, is_dist)
if __name__ == "__main__": if __name__ == "__main__":
...@@ -269,18 +1754,6 @@ if __name__ == "__main__": ...@@ -269,18 +1754,6 @@ if __name__ == "__main__":
print( print(
"Usage: python dist_transformer.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist] [sync_mode]" "Usage: python dist_transformer.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist] [sync_mode]"
) )
role = sys.argv[1]
endpoints = sys.argv[2] update_args()
trainer_id = int(sys.argv[3]) runtime_main(DistTransformer2x2)
current_endpoint = sys.argv[4]
trainers = int(sys.argv[5])
is_dist = True if sys.argv[6] == "TRUE" else False
# FIXME(typhoonzero): refine this test.
is_async = True if sys.argv[7] == "TRUE" else False
main(
role=role,
endpoints=endpoints,
trainer_id=trainer_id,
current_endpoint=current_endpoint,
trainers=trainers,
is_dist=is_dist)
...@@ -49,28 +49,32 @@ class TestDistWord2vec2x2(TestDistRunnerBase): ...@@ -49,28 +49,32 @@ class TestDistWord2vec2x2(TestDistRunnerBase):
dtype='float32', dtype='float32',
is_sparse=IS_SPARSE, is_sparse=IS_SPARSE,
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
name='shared_w', initializer=fluid.initializer.Constant())) name='shared_w',
initializer=fluid.initializer.Constant(value=0.1)))
embed_second = fluid.layers.embedding( embed_second = fluid.layers.embedding(
input=words[1], input=words[1],
size=[dict_size, EMBED_SIZE], size=[dict_size, EMBED_SIZE],
dtype='float32', dtype='float32',
is_sparse=IS_SPARSE, is_sparse=IS_SPARSE,
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
name='shared_w', initializer=fluid.initializer.Constant())) name='shared_w',
initializer=fluid.initializer.Constant(value=0.1)))
embed_third = fluid.layers.embedding( embed_third = fluid.layers.embedding(
input=words[2], input=words[2],
size=[dict_size, EMBED_SIZE], size=[dict_size, EMBED_SIZE],
dtype='float32', dtype='float32',
is_sparse=IS_SPARSE, is_sparse=IS_SPARSE,
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
name='shared_w', initializer=fluid.initializer.Constant())) name='shared_w',
initializer=fluid.initializer.Constant(value=0.1)))
embed_forth = fluid.layers.embedding( embed_forth = fluid.layers.embedding(
input=words[3], input=words[3],
size=[dict_size, EMBED_SIZE], size=[dict_size, EMBED_SIZE],
dtype='float32', dtype='float32',
is_sparse=IS_SPARSE, is_sparse=IS_SPARSE,
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
name='shared_w', initializer=fluid.initializer.Constant())) name='shared_w',
initializer=fluid.initializer.Constant(value=0.1)))
concat_embed = fluid.layers.concat( concat_embed = fluid.layers.concat(
input=[embed_first, embed_second, embed_third, embed_forth], input=[embed_first, embed_second, embed_third, embed_forth],
...@@ -80,13 +84,13 @@ class TestDistWord2vec2x2(TestDistRunnerBase): ...@@ -80,13 +84,13 @@ class TestDistWord2vec2x2(TestDistRunnerBase):
size=HIDDEN_SIZE, size=HIDDEN_SIZE,
act='sigmoid', act='sigmoid',
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant())) initializer=fluid.initializer.Constant(value=0.1)))
predict_word = fluid.layers.fc( predict_word = fluid.layers.fc(
input=hidden1, input=hidden1,
size=dict_size, size=dict_size,
act='softmax', act='softmax',
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant())) initializer=fluid.initializer.Constant(value=0.1)))
cost = fluid.layers.cross_entropy( cost = fluid.layers.cross_entropy(
input=predict_word, label=words[4]) input=predict_word, label=words[4])
avg_cost = fluid.layers.mean(cost) avg_cost = fluid.layers.mean(cost)
......
...@@ -100,7 +100,7 @@ class TestSendOp(unittest.TestCase): ...@@ -100,7 +100,7 @@ class TestSendOp(unittest.TestCase):
main.global_block().append_op( main.global_block().append_op(
type="fetch_barrier", type="fetch_barrier",
inputs={}, inputs={},
outputs={}, outputs={"Out": []},
attrs={ attrs={
"endpoints": ["127.0.0.1:{0}".format(port)], "endpoints": ["127.0.0.1:{0}".format(port)],
RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
......
...@@ -15,17 +15,55 @@ ...@@ -15,17 +15,55 @@
from __future__ import print_function from __future__ import print_function
import unittest import unittest
import paddle
from test_dist_base import TestDistBase from test_dist_base import TestDistBase
class TestDistTransformer2x2(TestDistBase): def download_files():
url_prefix = 'http://paddle-unittest-data.cdn.bcebos.com/dist_transformer/'
vocab_url = url_prefix + 'vocab.bpe.32000'
vocab_md5 = 'a86d345ca6e27f6591d0dccb1b9be853'
paddle.dataset.common.download(vocab_url, 'test_dist_transformer',
vocab_md5)
local_train_url = url_prefix + 'train.tok.clean.bpe.32000.en-de'
local_train_md5 = '033eb02b9449e6dd823f050782ac8914'
paddle.dataset.common.download(local_train_url, 'test_dist_transformer',
local_train_md5)
train0_url = url_prefix + 'train.tok.clean.bpe.32000.en-de.train_0'
train0_md5 = 'ddce7f602f352a0405267285379a38b1'
paddle.dataset.common.download(train0_url, 'test_dist_transformer',
train0_md5)
train1_url = url_prefix + 'train.tok.clean.bpe.32000.en-de.train_1'
train1_md5 = '8757798200180285b1a619cd7f408747'
paddle.dataset.common.download(train1_url, 'test_dist_transformer',
train1_md5)
test_url = url_prefix + 'newstest2013.tok.bpe.32000.en-de'
test_md5 = '9dd74a266dbdb25314183899f269b4a2'
paddle.dataset.common.download(test_url, 'test_dist_transformer', test_md5)
class TestDistTransformer2x2Sync(TestDistBase):
def _setup_config(self): def _setup_config(self):
self._sync_mode = True self._sync_mode = True
def test_transformer(self): def test_transformer(self):
# TODO(paddle-dev): check if the delta is OK. download_files()
# Usually start around ~8000 and converge to ~5000 #Note: loss on test dataset of the first 5 batch are:
self.check_with_place("dist_transformer.py", delta=400) # 10.518872, 10.518871, 10.518868, 10.518862, 10.518855
self.check_with_place("dist_transformer.py", delta=1e-7)
class TestDistTransformer2x2Async(TestDistBase):
def _setup_config(self):
self._sync_mode = False
def test_transformer(self):
download_files()
self.check_with_place("dist_transformer.py", delta=1.0)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -22,7 +22,7 @@ class TestDistSeResneXt2x2(TestDistBase): ...@@ -22,7 +22,7 @@ class TestDistSeResneXt2x2(TestDistBase):
self._sync_mode = True self._sync_mode = True
def test_se_resnext(self): def test_se_resnext(self):
self.check_with_place("dist_word2vec.py", delta=1e-7) self.check_with_place("dist_word2vec.py", delta=1e-4)
class TestDistSeResneXt2x2Async(TestDistBase): class TestDistSeResneXt2x2Async(TestDistBase):
......
...@@ -20,41 +20,50 @@ import math ...@@ -20,41 +20,50 @@ import math
from op_test import OpTest from op_test import OpTest
def quantize_max_abs(x, num_bits): def quantize_max_abs(x, max_range):
range = math.pow(2, num_bits) - 1
scale = np.max(np.abs(x).flatten()) scale = np.max(np.abs(x).flatten())
y = np.round(x / scale * range) y = np.round(x / scale * max_range)
return y, scale return y, scale
def dequantize_max_abs(x, num_bits, scale): def dequantize_max_abs(x, scale, max_range):
range = math.pow(2, num_bits) - 1 y = (scale / max_range) * x
y = (scale / range) * x
return y return y
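# For example, with num_bits = 8 the tests below use max_range = 2**7 - 1 = 127: quantization
# maps x to round(x / max(|x|) * 127) and dequantization multiplies back by max(|x|) / 127.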
class TestFakeDequantizeMaxAbsOp(OpTest): class TestFakeDequantizeMaxAbsOp(OpTest):
def set_args(self): def set_args(self):
self.num_bits = 8 self.num_bits = 8
self.max_range = math.pow(2, self.num_bits - 1) - 1
self.data_type = "float32"
def setUp(self): def setUp(self):
self.set_args() self.set_args()
self.op_type = "fake_dequantize_max_abs" self.op_type = "fake_dequantize_max_abs"
x = np.random.randn(31, 65).astype("float32") x = np.random.randn(31, 65).astype(self.data_type)
yq, scale = quantize_max_abs(x, self.num_bits) yq, scale = quantize_max_abs(x, self.max_range)
ydq = dequantize_max_abs(yq, self.num_bits, scale) ydq = dequantize_max_abs(yq, scale, self.max_range)
self.inputs = {'X': yq} self.inputs = {'X': yq, 'Scale': np.array(scale).astype(self.data_type)}
self.attrs = {'num_bits': self.num_bits, 'scale': float(scale)} self.attrs = {'max_range': self.max_range}
self.outputs = {'Out': ydq} self.outputs = {'Out': ydq}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
class TestFakeDequantizeMaxAbsOp5Bits(OpTest): class TestFakeDequantizeMaxAbsOpDouble(TestFakeDequantizeMaxAbsOp):
def set_args(self):
self.num_bits = 8
self.max_range = math.pow(2, self.num_bits - 1) - 1
self.data_type = "float64"
class TestFakeDequantizeMaxAbsOp5Bits(TestFakeDequantizeMaxAbsOp):
def set_args(self): def set_args(self):
self.num_bits = 5 self.num_bits = 5
self.max_range = math.pow(2, self.num_bits - 1) - 1
self.data_type = "float32"
if __name__ == "__main__": if __name__ == "__main__":
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import math
from op_test import OpTest
from test_gru_op import gru
from test_fusion_lstm_op import fc, ACTIVATION
def fusion_gru(
x, # T x M
lod, # 1 x N
h0, # N x D
wx, # M x 3D
wh, # D x 3D
bias, # 1 x 3D
is_reverse,
act_state,
act_gate):
return gru(fc(x, wx, bias),
lod,
h0,
wh,
np.zeros(
(1, wh.shape[1]), dtype='float64'),
is_reverse,
act_state,
act_gate)
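# fusion_gru composes an affine projection fc(x, wx, bias) with the reference gru; the GRU's
# own bias is passed as zeros because the projection has already applied the bias.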
class TestFusionGRUOp(OpTest):
def set_confs(self):
pass
def setUp(self):
self.op_type = "fusion_gru"
self.lod = [[2, 4, 3]]
self.M = 3
self.D = 5
self.is_reverse = False
self.with_h0 = True
self.with_bias = True
self.act_state = 'tanh'
self.act_gate = 'sigmoid'
self.set_confs()
T = sum(self.lod[0])
N = len(self.lod[0])
x = np.random.rand(T, self.M).astype('float64')
wx = np.random.rand(self.M, 3 * self.D).astype('float64')
wh = np.random.rand(self.D, 3 * self.D).astype('float64')
bias = np.random.rand(
1, 3 * self.D).astype('float64') if self.with_bias else np.zeros(
(1, 3 * self.D), dtype='float64')
h0 = np.random.rand(
N, self.D).astype('float64') if self.with_h0 else np.zeros(
(N, self.D), dtype='float64')
_, _, _, hidden = fusion_gru(
x, self.lod, h0, wx, wh, bias, self.is_reverse,
ACTIVATION[self.act_state], ACTIVATION[self.act_gate])
self.inputs = {'X': (x, self.lod), 'WeightX': wx, 'WeightH': wh}
if self.with_bias:
self.inputs['Bias'] = bias
if self.with_h0:
self.inputs['H0'] = h0
self.outputs = {'Hidden': (hidden, self.lod)}
self.attrs = {
'activation': self.act_state,
'gate_activation': self.act_gate,
'is_reverse': self.is_reverse
}
def test_check_output(self):
self.check_output(atol=1e-8)
class TestFusionGRUOpNoInitial(TestFusionGRUOp):
def set_confs(self):
self.with_h0 = False
class TestFusionGRUOpNoBias(TestFusionGRUOp):
def set_confs(self):
self.with_bias = False
class TestFusionGRUOpReverse(TestFusionGRUOp):
def set_confs(self):
self.is_reverse = True
class TestFusionGRUOpMD1(TestFusionGRUOp):
def set_confs(self):
self.M = 36
self.D = 8
class TestFusionGRUOpMD2(TestFusionGRUOp):
def set_confs(self):
self.M = 8
self.D = 8
class TestFusionGRUOpBS1(TestFusionGRUOp):
def set_confs(self):
self.lod = [[3]]
self.D = 16
if __name__ == "__main__":
unittest.main()
...@@ -43,13 +43,13 @@ def fusion_lstm( ...@@ -43,13 +43,13 @@ def fusion_lstm(
act_cell, act_cand) act_cell, act_cand)
class TestLstmOp(OpTest): class TestFusionLSTMOp(OpTest):
def set_argument(self): def set_conf(self):
self.lod = [[2, 3, 2]] pass
def setUp(self): def setUp(self):
self.op_type = 'fusion_lstm' self.op_type = 'fusion_lstm'
self.lod = [[2, 3, 2]] self.lod = [[2, 3, 5, 4]]
self.M = 8 self.M = 8
self.D = 16 self.D = 16
self.has_initial_state = False self.has_initial_state = False
...@@ -58,33 +58,33 @@ class TestLstmOp(OpTest): ...@@ -58,33 +58,33 @@ class TestLstmOp(OpTest):
self.act_cell = 'tanh' self.act_cell = 'tanh'
self.act_cand = 'tanh' self.act_cand = 'tanh'
self.use_peepholes = False self.use_peepholes = False
self.set_argument() self.set_conf()
T = sum(self.lod[0]) T = sum(self.lod[0])
bs = len(self.lod[0]) bs = len(self.lod[0])
x = np.random.normal(size=(T, self.M)).astype('float64') x = np.random.normal(size=(T, self.M)).astype('float32')
if self.has_initial_state: if self.has_initial_state:
h0 = np.random.normal(size=(bs, self.D)).astype('float64') h0 = np.random.normal(size=(bs, self.D)).astype('float32')
c0 = np.random.normal(size=(bs, self.D)).astype('float64') c0 = np.random.normal(size=(bs, self.D)).astype('float32')
else: else:
h0 = np.zeros((bs, self.D)).astype('float64') h0 = np.zeros((bs, self.D)).astype('float32')
c0 = np.zeros((bs, self.D)).astype('float64') c0 = np.zeros((bs, self.D)).astype('float32')
wh = np.random.normal(size=(self.D, 4 * self.D)).astype('float64') wh = np.random.normal(size=(self.D, 4 * self.D)).astype('float32')
if self.use_peepholes: if self.use_peepholes:
b = np.random.normal(size=(1, 7 * self.D)).astype('float64') b = np.random.normal(size=(1, 7 * self.D)).astype('float32')
else: else:
b = np.random.normal(size=(1, 4 * self.D)).astype('float64') b = np.random.normal(size=(1, 4 * self.D)).astype('float32')
w_b = np.copy(b[:, 0:4 * self.D]) w_b = np.copy(b[:, 0:4 * self.D])
w_c = b[:, 4 * self.D:] if self.use_peepholes else None w_c = b[:, 4 * self.D:] if self.use_peepholes else None
# this is the weight of fc # this is the weight of fc
wx = np.random.normal(size=(self.M, 4 * self.D)).astype('float64') wx = np.random.normal(size=(self.M, 4 * self.D)).astype('float32')
# this is the bias of fc # this is the bias of fc
# and it should be manually added into the bias of this fusion LSTM # and it should be manually added into the bias of this fusion LSTM
bx = np.random.normal(size=(1, 4 * self.D)).astype('float64') bx = np.random.normal(size=(1, 4 * self.D)).astype('float32')
b[0, 0:4 * self.D] += bx[0, :] b[0, 0:4 * self.D] += bx[0, :]
h, c = fusion_lstm(x, self.lod, wx, bx, h0, c0, wh, w_b, w_c, h, c = fusion_lstm(x, self.lod, wx, bx, h0, c0, wh, w_b, w_c,
self.is_reverse, ACTIVATION[self.act_gate], self.is_reverse, ACTIVATION[self.act_gate],
...@@ -114,35 +114,45 @@ class TestLstmOp(OpTest): ...@@ -114,35 +114,45 @@ class TestLstmOp(OpTest):
} }
def test_check_output(self): def test_check_output(self):
self.check_output(atol=1e-8) self.check_output()
class TestLstmOpInitReverse(TestLstmOp): class TestFusionLSTMOpInit(TestFusionLSTMOp):
def set_argument(self): def set_conf(self):
self.has_initial_state = True
class TestFusionLSTMOpReverse(TestFusionLSTMOp):
def set_conf(self):
self.is_reverse = True
class TestFusionLSTMOpInitReverse(TestFusionLSTMOp):
def set_conf(self):
self.has_initial_state = True self.has_initial_state = True
self.is_reverse = True self.is_reverse = True
class TestLstmOpMD1(TestLstmOp): class TestFusionLSTMOpMD1(TestFusionLSTMOp):
def set_argument(self): def set_conf(self):
self.M = 36 self.M = 36
self.D = 8 self.D = 8
class TestLstmOpMD2(TestLstmOp): class TestFusionLSTMOpMD2(TestFusionLSTMOp):
def set_argument(self): def set_conf(self):
self.M = 8 self.M = 8
self.D = 8 self.D = 8
class TestLstmOpMD3(TestLstmOp): class TestFusionLSTMOpMD3(TestFusionLSTMOp):
def set_argument(self): def set_conf(self):
self.M = 15 self.M = 15
self.D = 3 self.D = 3
class TestLstmOpBS1(TestLstmOp): class TestFusionLSTMOpBS1(TestFusionLSTMOp):
def set_argument(self): def set_conf(self):
self.lod = [[3]] self.lod = [[3]]
self.D = 16 self.D = 16
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
from test_fusion_lstm_op import fc, ACTIVATION
def fusion_seqexpand_concat_fc(xs, lod, w, b, fc_act):
T = sum(lod[0])
N = len(lod[0])
num_inputs = len(xs)
D = w.shape[1]
expanded_inputs = [xs[0]]
for i in range(num_inputs - 1):
x = xs[i + 1]
assert x.shape[0] == N
expanded = np.repeat(x, lod[0], axis=0)
assert expanded.shape[0] == T
assert expanded.shape[1] == x.shape[1]
expanded_inputs.append(expanded)
fc_input = np.concatenate(expanded_inputs, axis=1)
assert fc_input.shape[0] == T
assert fc_input.shape[1] == w.shape[0]
fc_out = fc(fc_input, w, b)
fc_out = fc_act(fc_out)
assert fc_out.shape[0] == T
assert fc_out.shape[1] == D
return fc_out
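# Only the first input already has one row per time step; the remaining inputs have one row
# per sequence and are repeated along the time axis according to lod before the concat and FC.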
class TestFusionSeqExpandConcatFCOp(OpTest):
def set_conf(self):
pass
def setUp(self):
self.op_type = 'fusion_seqexpand_concat_fc'
self.lod = [[3, 5, 8, 2]]
self.inputs_M = [15, 10, 10]
self.D = 20
self.with_bias = True
self.fc_act = 'relu'
self.set_conf()
T = sum(self.lod[0])
bs = len(self.lod[0])
num_inputs = len(self.inputs_M)
x0 = np.random.normal(size=(T, self.inputs_M[0])).astype('float32')
xs = [x0]
for i in range(num_inputs - 1):
xi = np.random.normal(size=(bs,
self.inputs_M[i + 1])).astype('float32')
xs.append(xi)
# fc weight and bias
w = np.random.normal(size=(sum(self.inputs_M),
self.D)).astype('float32')
b = np.random.normal(size=(
1, self.D)).astype('float32') if self.with_bias else np.zeros(
(1, self.D)).astype('float32')
out = fusion_seqexpand_concat_fc(xs, self.lod, w, b,
ACTIVATION[self.fc_act])
self.inputs = {'X': [('x0', (x0, self.lod))], 'FCWeight': w}
normal_lod = [[1] * bs]
for i in range(num_inputs - 1):
self.inputs['X'].append(('x%d' % (i + 1), (xs[i + 1], normal_lod)))
if self.with_bias:
self.inputs['FCBias'] = b
self.outputs = {'Out': (out, self.lod)}
self.attrs = {'fc_activation': self.fc_act}
def test_check_output(self):
self.check_output()
class TestFusionSECFCOpNonBias(TestFusionSeqExpandConcatFCOp):
def set_conf(self):
self.with_bias = False
class TestFusionSECFCOpNonAct(TestFusionSeqExpandConcatFCOp):
def set_conf(self):
self.fc_act = 'identity'
class TestFusionSECFCOpMD1(TestFusionSeqExpandConcatFCOp):
def set_conf(self):
self.inputs_M = [3, 4, 2, 1, 5]
self.D = 8
class TestFusionSECFCOpMD2(TestFusionSeqExpandConcatFCOp):
def set_conf(self):
self.lod = [[5, 6]]
self.inputs_M = [1, 1]
class TestFusionSECFCOpBS1_1(TestFusionSeqExpandConcatFCOp):
def set_conf(self):
self.lod = [[1]]
self.inputs_M = [3, 4, 2]
class TestFusionSECFCOpBS1_2(TestFusionSeqExpandConcatFCOp):
def set_conf(self):
self.lod = [[1]]
self.inputs_M = [3, 4]
class TestFusionSECFCOpBS1_3(TestFusionSeqExpandConcatFCOp):
def set_conf(self):
self.lod = [[5]]
self.inputs_M = [6, 3]
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import sys
import math
import paddle.fluid as fluid
from op_test import OpTest
from test_multiclass_nms_op import nms
from test_anchor_generator_op import anchor_generator_in_python
import copy
def generate_proposals_in_python(scores, bbox_deltas, im_info, anchors,
variances, pre_nms_topN, post_nms_topN,
nms_thresh, min_size, eta):
all_anchors = anchors.reshape(-1, 4)
rois = np.empty((0, 5), dtype=np.float32)
roi_probs = np.empty((0, 1), dtype=np.float32)
rpn_rois = []
rpn_roi_probs = []
lod = []
num_images = scores.shape[0]
for img_idx in range(num_images):
img_i_boxes, img_i_probs = proposal_for_one_image(
im_info[img_idx, :], all_anchors, variances,
bbox_deltas[img_idx, :, :, :], scores[img_idx, :, :, :],
pre_nms_topN, post_nms_topN, nms_thresh, min_size, eta)
lod.append(img_i_probs.shape[0])
rpn_rois.append(img_i_boxes)
rpn_roi_probs.append(img_i_probs)
return rpn_rois, rpn_roi_probs, lod
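# lod records how many proposals are kept for each image; the generate_proposals op is
# expected to return the concatenated per-image proposals with this LoD (see the test below).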
def proposal_for_one_image(im_info, all_anchors, variances, bbox_deltas, scores,
pre_nms_topN, post_nms_topN, nms_thresh, min_size,
eta):
# Transpose and reshape predicted bbox transformations to get them
# into the same order as the anchors:
# - bbox deltas will be (4 * A, H, W) format from conv output
# - transpose to (H, W, 4 * A)
# - reshape to (H * W * A, 4) where rows are ordered by (H, W, A)
# in slowest to fastest order to match the enumerated anchors
bbox_deltas = bbox_deltas.transpose((1, 2, 0)).reshape(-1, 4)
all_anchors = all_anchors.reshape(-1, 4)
variances = variances.reshape(-1, 4)
# Same story for the scores:
# - scores are (A, H, W) format from conv output
# - transpose to (H, W, A)
# - reshape to (H * W * A, 1) where rows are ordered by (H, W, A)
# to match the order of anchors and bbox_deltas
scores = scores.transpose((1, 2, 0)).reshape(-1, 1)
# sort all (proposal, score) pairs by score from highest to lowest
# take top pre_nms_topN (e.g. 6000)
if pre_nms_topN <= 0 or pre_nms_topN >= len(scores):
order = np.argsort(-scores.squeeze())
else:
# Avoid sorting possibly large arrays;
# First partition to get top K unsorted
        # and then sort just those
inds = np.argpartition(-scores.squeeze(), pre_nms_topN)[:pre_nms_topN]
order = np.argsort(-scores[inds].squeeze())
order = inds[order]
scores = scores[order, :]
bbox_deltas = bbox_deltas[order, :]
all_anchors = all_anchors[order, :]
proposals = box_coder(all_anchors, bbox_deltas, variances)
# clip proposals to image (may result in proposals with zero area
# that will be removed in the next step)
proposals = clip_tiled_boxes(proposals, im_info[:2])
# remove predicted boxes with height or width < min_size
keep = filter_boxes(proposals, min_size, im_info)
proposals = proposals[keep, :]
scores = scores[keep, :]
# apply loose nms (e.g. threshold = 0.7)
# take post_nms_topN (e.g. 1000)
# return the top proposals
if nms_thresh > 0:
keep = nms(boxes=proposals,
scores=scores,
nms_threshold=nms_thresh,
eta=eta)
if post_nms_topN > 0 and post_nms_topN < len(keep):
keep = keep[:post_nms_topN]
proposals = proposals[keep, :]
scores = scores[keep, :]
return proposals, scores
def box_coder(all_anchors, bbox_deltas, variances):
"""
    Decode proposals from anchors and the bbox_deltas predicted by the RPN
"""
    # proposals: xmin, ymin, xmax, ymax
    proposals = np.zeros_like(bbox_deltas, dtype=np.float32)
    # anchor_loc: width, height, center_x, center_y
anchor_loc = np.zeros_like(bbox_deltas, dtype=np.float32)
anchor_loc[:, 0] = all_anchors[:, 2] - all_anchors[:, 0]
anchor_loc[:, 1] = all_anchors[:, 3] - all_anchors[:, 1]
anchor_loc[:, 2] = (all_anchors[:, 2] + all_anchors[:, 0]) / 2
anchor_loc[:, 3] = (all_anchors[:, 3] + all_anchors[:, 1]) / 2
    # predicted bbox: bbox_center_x, bbox_center_y, bbox_width, bbox_height
pred_bbox = np.zeros_like(bbox_deltas, dtype=np.float32)
if variances is not None:
for i in range(bbox_deltas.shape[0]):
pred_bbox[i, 0] = variances[i, 0] * bbox_deltas[i, 0] * anchor_loc[
i, 0] + anchor_loc[i, 2]
pred_bbox[i, 1] = variances[i, 1] * bbox_deltas[i, 1] * anchor_loc[
i, 1] + anchor_loc[i, 3]
pred_bbox[i, 2] = math.exp(variances[i, 2] *
bbox_deltas[i, 2]) * anchor_loc[i, 0]
pred_bbox[i, 3] = math.exp(variances[i, 3] *
bbox_deltas[i, 3]) * anchor_loc[i, 1]
else:
for i in range(bbox_deltas.shape[0]):
pred_bbox[i, 0] = bbox_deltas[i, 0] * anchor_loc[i, 0] + anchor_loc[
i, 2]
pred_bbox[i, 1] = bbox_deltas[i, 1] * anchor_loc[i, 1] + anchor_loc[
i, 3]
pred_bbox[i, 2] = math.exp(bbox_deltas[i, 2]) * anchor_loc[i, 0]
pred_bbox[i, 3] = math.exp(bbox_deltas[i, 3]) * anchor_loc[i, 1]
proposals[:, 0] = pred_bbox[:, 0] - pred_bbox[:, 2] / 2
proposals[:, 1] = pred_bbox[:, 1] - pred_bbox[:, 3] / 2
proposals[:, 2] = pred_bbox[:, 0] + pred_bbox[:, 2] / 2
proposals[:, 3] = pred_bbox[:, 1] + pred_bbox[:, 3] / 2
return proposals
def clip_tiled_boxes(boxes, im_shape):
"""Clip boxes to image boundaries. im_shape is [height, width] and boxes
has shape (N, 4 * num_tiled_boxes)."""
assert boxes.shape[1] % 4 == 0, \
'boxes.shape[1] is {:d}, but must be divisible by 4.'.format(
boxes.shape[1]
)
# x1 >= 0
boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0)
# y1 >= 0
boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0)
# x2 < im_shape[1]
boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0)
# y2 < im_shape[0]
boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0)
return boxes
def filter_boxes(boxes, min_size, im_info):
"""Only keep boxes with both sides >= min_size and center within the image.
"""
# Scale min_size to match image scale
min_size *= im_info[2]
ws = boxes[:, 2] - boxes[:, 0] + 1
hs = boxes[:, 3] - boxes[:, 1] + 1
x_ctr = boxes[:, 0] + ws / 2.
y_ctr = boxes[:, 1] + hs / 2.
keep = np.where((ws >= min_size) & (hs >= min_size) & (x_ctr < im_info[1]) &
(y_ctr < im_info[0]))[0]
return keep
def iou(box_a, box_b):
"""
    Compute the intersection-over-union overlap between box_a and box_b
"""
xmin_a = min(box_a[0], box_a[2])
ymin_a = min(box_a[1], box_a[3])
xmax_a = max(box_a[0], box_a[2])
ymax_a = max(box_a[1], box_a[3])
xmin_b = min(box_b[0], box_b[2])
ymin_b = min(box_b[1], box_b[3])
xmax_b = max(box_b[0], box_b[2])
ymax_b = max(box_b[1], box_b[3])
area_a = (ymax_a - ymin_a + 1) * (xmax_a - xmin_a + 1)
area_b = (ymax_b - ymin_b + 1) * (xmax_b - xmin_b + 1)
if area_a <= 0 and area_b <= 0:
return 0.0
xa = max(xmin_a, xmin_b)
ya = max(ymin_a, ymin_b)
xb = min(xmax_a, xmax_b)
yb = min(ymax_a, ymax_b)
inter_area = max(xb - xa, 0.0) * max(yb - ya, 0.0)
iou_ratio = inter_area / (area_a + area_b - inter_area)
return iou_ratio
def nms(boxes, scores, nms_threshold, eta=1.0):
"""Apply non-maximum suppression at test time to avoid detecting too many
overlapping bounding boxes for a given object.
Args:
        boxes: (tensor) The location predictions for the image, Shape: [num_priors, 4].
        scores: (tensor) The class prediction scores for the image, Shape: [num_priors].
nms_threshold: (float) The overlap thresh for suppressing unnecessary
boxes.
eta: (float) The parameter for adaptive NMS.
Return:
The indices of the kept boxes with respect to num_priors.
"""
all_scores = copy.deepcopy(scores)
all_scores = all_scores.flatten()
sorted_indices = np.argsort(-all_scores, axis=0, kind='mergesort')
sorted_scores = all_scores[sorted_indices]
selected_indices = []
adaptive_threshold = nms_threshold
for i in range(sorted_scores.shape[0]):
idx = sorted_indices[i]
keep = True
for k in range(len(selected_indices)):
if keep:
kept_idx = selected_indices[k]
overlap = iou(boxes[idx], boxes[kept_idx])
keep = True if overlap <= adaptive_threshold else False
else:
break
if keep:
selected_indices.append(idx)
if keep and eta < 1 and adaptive_threshold > 0.5:
adaptive_threshold *= eta
return selected_indices
class TestGenerateProposalsOp(OpTest):
def set_data(self):
self.init_test_params()
self.init_test_input()
self.init_test_output()
self.inputs = {
'Scores': self.scores,
'BboxDeltas': self.bbox_deltas,
'ImInfo': self.im_info.astype(np.float32),
'Anchors': self.anchors,
'Variances': self.variances
}
self.attrs = {
'pre_nms_topN': self.pre_nms_topN,
'post_nms_topN': self.post_nms_topN,
'nms_thresh': self.nms_thresh,
'min_size': self.min_size,
'eta': self.eta
}
print("lod = ", self.lod)
self.outputs = {
'RpnRois': (self.rpn_rois[0], [self.lod]),
'RpnRoiProbs': (self.rpn_roi_probs[0], [self.lod])
}
def test_check_output(self):
self.check_output()
def setUp(self):
self.op_type = "generate_proposals"
self.set_data()
def init_test_params(self):
self.pre_nms_topN = 12000 # train 12000, test 2000
self.post_nms_topN = 5000 # train 6000, test 1000
self.nms_thresh = 0.7
self.min_size = 3.0
self.eta = 0.8
def init_test_input(self):
batch_size = 1
input_channels = 20
layer_h = 16
layer_w = 16
input_feat = np.random.random(
(batch_size, input_channels, layer_h, layer_w)).astype('float32')
self.anchors, self.variances = anchor_generator_in_python(
input_feat=input_feat,
anchor_sizes=[16., 32.],
aspect_ratios=[0.5, 1.0],
variances=[1.0, 1.0, 1.0, 1.0],
stride=[16.0, 16.0],
offset=0.5)
self.im_info = np.array([[64., 64., 8.]]) #im_height, im_width, scale
num_anchors = self.anchors.shape[2]
self.scores = np.random.random(
(batch_size, num_anchors, layer_h, layer_w)).astype('float32')
self.bbox_deltas = np.random.random(
(batch_size, num_anchors * 4, layer_h, layer_w)).astype('float32')
def init_test_output(self):
self.rpn_rois, self.rpn_roi_probs, self.lod = generate_proposals_in_python(
self.scores, self.bbox_deltas, self.im_info, self.anchors,
self.variances, self.pre_nms_topN, self.post_nms_topN,
self.nms_thresh, self.min_size, self.eta)
if __name__ == '__main__':
unittest.main()
...@@ -19,22 +19,19 @@ import numpy as np ...@@ -19,22 +19,19 @@ import numpy as np
import math import math
import functools import functools
from op_test import OpTest from op_test import OpTest
from test_lstm_op import identity, sigmoid, tanh, relu from test_lstm_op import ACTIVATION
class TestGRUOp(OpTest): def gru(
lod = [[2, 4, 3]] input, # T x 3D
batch_size = sum(lod[0]) lod, # 1 x N
frame_size = 5 h0, # N x D
activate = { weight, # D x 3D
'identity': identity, bias, # 1 x 3D
'sigmoid': sigmoid, is_reverse,
'tanh': tanh, act_state,
'relu': relu act_gate):
} def _seq_to_batch(lod, is_reverse):
@staticmethod
def seq_to_batch(lod, is_reverse):
idx_in_seq_list = [] idx_in_seq_list = []
seq_lens = lod[0] seq_lens = lod[0]
seq_starts = [0] seq_starts = [0]
...@@ -56,121 +53,125 @@ class TestGRUOp(OpTest): ...@@ -56,121 +53,125 @@ class TestGRUOp(OpTest):
idx_in_seq_list.append(idx_in_seq) idx_in_seq_list.append(idx_in_seq)
return idx_in_seq_list, sorted_seqs return idx_in_seq_list, sorted_seqs
def gru_step(self, x, h_p, w, b): def _step(x, h_p, w, b, act_state, act_gate):
batch_size = x.shape[0] T = x.shape[0]
frame_size = w.shape[0] D = w.shape[0]
g = x + np.tile(b, (batch_size, 1)) g = x + np.tile(b, (T, 1))
w_u_r = w.flatten()[:frame_size * frame_size * 2].reshape( w_u_r = w.flatten()[:D * D * 2].reshape((D, D * 2))
(frame_size, frame_size * 2)) u_r = act_gate(np.dot(h_p, w_u_r) + g[:, :D * 2])
u_r = self.activate[self.attrs['gate_activation']](np.dot( u = u_r[:, :D]
h_p, w_u_r) + g[:, :frame_size * 2]) r = u_r[:, D:D * 2]
u = u_r[:, :frame_size]
r = u_r[:, frame_size:frame_size * 2]
r_h_p = r * h_p r_h_p = r * h_p
w_c = w.flatten()[frame_size * frame_size * 2:].reshape( w_c = w.flatten()[D * D * 2:].reshape((D, D))
(frame_size, frame_size)) c = act_state(np.dot(r_h_p, w_c) + g[:, D * 2:])
c = self.activate[self.attrs['activation']](np.dot(r_h_p, w_c) +
g[:, frame_size * 2:])
g = np.hstack((u_r, c)) g = np.hstack((u_r, c))
h = u * c + (1 - u) * h_p h = u * c + (1 - u) * h_p
return g, r_h_p, h return g, r_h_p, h
def gru(self): T = sum(lod[0])
input, lod = self.inputs['Input'] N = len(lod[0])
w = self.inputs['Weight'] D = weight.shape[0]
b = self.inputs['Bias'] if 'Bias' in self.inputs else np.zeros( batch_gate = np.zeros((T, 3 * D), dtype='float64')
(1, self.frame_size * 3)) batch_reset_hidden_prev = np.zeros((T, D), dtype='float64')
batch_gate = self.outputs['BatchGate'] batch_hidden = np.zeros((T, D), dtype='float64')
batch_reset_hidden_prev = self.outputs['BatchResetHiddenPrev'] hidden = np.zeros((T, D), dtype='float64')
batch_hidden = self.outputs['BatchHidden']
hidden = self.outputs['Hidden'] idx_in_seq_list, sorted_seqs = _seq_to_batch(lod, is_reverse)
idx_in_seq_list = self.idx_in_seq_list h_p = h0[sorted_seqs]
h_p = self.inputs['H0'][ max_seq_len = len(idx_in_seq_list)
self.sorted_seqs] if 'H0' in self.inputs else np.zeros( assert len(idx_in_seq_list[0]) == N
(len(idx_in_seq_list[0]), self.frame_size)) end_idx = 0
num_batch = len(idx_in_seq_list) for batch_idx in range(max_seq_len):
end_idx = 0 x = input[idx_in_seq_list[batch_idx]]
for batch_idx in range(num_batch): g, r_h_p, h = _step(x, h_p, weight, bias, act_state, act_gate)
x = input[idx_in_seq_list[batch_idx]] if batch_idx < (max_seq_len - 1):
g, r_h_p, h = self.gru_step(x, h_p, w, b) h_p = h[:len(idx_in_seq_list[batch_idx + 1])]
if batch_idx < (num_batch - 1): start_idx = end_idx
h_p = h[:len(idx_in_seq_list[batch_idx + 1])] end_idx = start_idx + len(idx_in_seq_list[batch_idx])
start_idx = end_idx batch_gate[start_idx:end_idx] = g
end_idx = start_idx + len(idx_in_seq_list[batch_idx]) batch_reset_hidden_prev[start_idx:end_idx] = r_h_p
batch_gate[start_idx:end_idx] = g batch_hidden[start_idx:end_idx] = h
batch_reset_hidden_prev[start_idx:end_idx] = r_h_p hidden[idx_in_seq_list[batch_idx]] = h
batch_hidden[start_idx:end_idx] = h return batch_gate, batch_reset_hidden_prev, batch_hidden, hidden
hidden[idx_in_seq_list[batch_idx]] = h
return batch_gate, batch_reset_hidden_prev, hidden
def set_data(self):
lod = self.lod
self.idx_in_seq_list, self.sorted_seqs = self.seq_to_batch(
lod, self.is_reverse)
batch_size = self.batch_size
frame_size = self.frame_size
input = np.random.rand(batch_size, frame_size * 3).astype('float64')
h0 = np.random.rand(len(self.idx_in_seq_list[0]),
frame_size).astype('float64')
weight = np.random.rand(frame_size, frame_size * 3).astype('float64')
bias = np.random.rand(1, frame_size * 3).astype('float64')
self.inputs = {
'Input': (input, lod),
'H0': h0,
'Weight': weight,
'Bias': bias
}
self.outputs = {
'BatchGate': np.zeros(
(batch_size, frame_size * 3), dtype='float64'),
'BatchResetHiddenPrev': np.zeros(
(batch_size, frame_size), dtype='float64'),
'BatchHidden': np.zeros(
(batch_size, frame_size), dtype='float64'),
'Hidden': np.zeros(
(batch_size, frame_size), dtype='float64')
}
class TestGRUOp(OpTest):
def set_confs(self): def set_confs(self):
self.is_reverse = False pass
self.attrs = {
'activation': 'tanh',
'gate_activation': 'sigmoid',
'is_reverse': self.is_reverse
}
def setUp(self): def setUp(self):
self.op_type = "gru" self.op_type = "gru"
self.lod = [[2, 4, 3]]
self.D = 5
self.is_reverse = False
self.with_h0 = True
self.with_bias = True
self.act_state = 'tanh'
self.act_gate = 'sigmoid'
self.set_confs() self.set_confs()
self.set_data()
self.gru() T = sum(self.lod[0])
N = len(self.lod[0])
input = np.random.rand(T, 3 * self.D).astype('float64')
weight = np.random.rand(self.D, 3 * self.D).astype('float64')
bias = np.random.rand(
1, 3 * self.D).astype('float64') if self.with_bias else np.zeros(
(1, 3 * self.D), dtype='float64')
h0 = np.random.rand(
N, self.D).astype('float64') if self.with_h0 else np.zeros(
(N, self.D), dtype='float64')
batch_gate, batch_reset_hidden_prev, batch_hidden, hidden = gru(
input, self.lod, h0, weight, bias, self.is_reverse,
ACTIVATION[self.act_state], ACTIVATION[self.act_gate])
self.inputs = {'Input': (input, self.lod), 'Weight': weight}
if self.with_bias:
self.inputs['Bias'] = bias
if self.with_h0:
self.inputs['H0'] = h0
self.outputs = {
'Hidden': (hidden, self.lod),
'BatchGate': batch_gate,
'BatchResetHiddenPrev': batch_reset_hidden_prev,
'BatchHidden': batch_hidden,
}
self.attrs = {
'activation': self.act_state,
'gate_activation': self.act_gate,
'is_reverse': self.is_reverse
}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output(atol=1e-8)
def test_check_grad(self): def test_check_grad(self):
self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden']) self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden'])
class TestGRUOpNoInitial(TestGRUOp): class TestGRUOpNoInitial(TestGRUOp):
def set_data(self): def set_confs(self):
super(TestGRUOpNoInitial, self).set_data() self.with_h0 = False
self.inputs.pop('H0')
def test_check_grad(self): def test_check_grad(self):
self.check_grad(['Input', 'Weight', 'Bias'], ['Hidden']) self.check_grad(['Input', 'Weight', 'Bias'], ['Hidden'])
class TestGRUOpNoBias(TestGRUOp):
def set_confs(self):
self.with_bias = False
def test_check_grad(self):
self.check_grad(['Input', 'H0', 'Weight'], ['Hidden'])
class TestGRUOpReverse(TestGRUOp): class TestGRUOpReverse(TestGRUOp):
def set_confs(self): def set_confs(self):
self.is_reverse = True self.is_reverse = True
self.attrs = {
'activation': 'tanh',
'gate_activation': 'sigmoid',
'is_reverse': self.is_reverse
}
if __name__ == "__main__": if __name__ == "__main__":
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
class TestPadOp(OpTest):
def setUp(self):
self.initTestCase()
self.op_type = "pad_constant_like"
self.inputs = {
'X': np.random.random(self.x_shape).astype("float32"),
'Y': np.random.random(self.y_shape).astype("float32")
}
self.attrs = {}
self.attrs['pad_value'] = self.pad_value
self.outputs = {
'Out': np.pad(self.inputs['Y'],
self.paddings,
mode='constant',
constant_values=self.pad_value)
}
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(['Y'], 'Out', max_relative_error=0.006)
def initTestCase(self):
self.x_shape = (16, 16)
self.y_shape = (3, 16)
self.pad_value = 0.1
self.paddings = [(0, 13), (0, 0)]
class TestCase1(TestPadOp):
def initTestCase(self):
self.x_shape = (4, 3, 4, 4)
self.y_shape = (2, 3, 4, 4)
self.paddings = [(0, 2), (0, 0), (0, 0), (0, 0)]
self.pad_value = 0.5
class TestCase2(TestPadOp):
def initTestCase(self):
self.x_shape = (4, 3, 4, 4)
self.y_shape = (2, 3, 2, 4)
self.paddings = [(0, 2), (0, 0), (0, 2), (0, 0)]
self.pad_value = 0.5
if __name__ == '__main__':
unittest.main()
...@@ -35,9 +35,8 @@ class TestPrintOpCPU(unittest.TestCase): ...@@ -35,9 +35,8 @@ class TestPrintOpCPU(unittest.TestCase):
def build_network(self, only_forward, **kargs): def build_network(self, only_forward, **kargs):
x = layers.data('x', shape=[3], dtype='float32', lod_level=1) x = layers.data('x', shape=[3], dtype='float32', lod_level=1)
x.stop_gradient = False x.stop_gradient = False
printed = layers.Print(input=x, **kargs) layers.Print(input=x, **kargs)
if only_forward: return printed loss = layers.mean(x)
loss = layers.mean(printed)
append_backward(loss=loss) append_backward(loss=loss)
return loss return loss
......
...@@ -17,6 +17,8 @@ from __future__ import print_function ...@@ -17,6 +17,8 @@ from __future__ import print_function
import unittest import unittest
import numpy as np import numpy as np
from op_test import OpTest from op_test import OpTest
import paddle.fluid.core as core
from paddle.fluid.op import Operator
class TestScaleOp(OpTest): class TestScaleOp(OpTest):
...@@ -33,5 +35,57 @@ class TestScaleOp(OpTest): ...@@ -33,5 +35,57 @@ class TestScaleOp(OpTest):
self.check_grad(['X'], 'Out') self.check_grad(['X'], 'Out')
class TestScaleOpSelectedRows(unittest.TestCase):
def check_with_place(self, place, in_name, out_name):
scope = core.Scope()
        # create and initialize the input SelectedRows variable
in_height = 10
in_rows = [0, 4, 7]
in_row_numel = 12
scale = 2.0
in_selected_rows = scope.var(in_name).get_selected_rows()
in_selected_rows.set_height(in_height)
in_selected_rows.set_rows(in_rows)
in_array = np.random.random(
(len(in_rows), in_row_numel)).astype("float32")
in_tensor = in_selected_rows.get_tensor()
in_tensor.set(in_array, place)
        # create the output SelectedRows variable
out_selected_rows = scope.var(out_name).get_selected_rows()
out_tensor = out_selected_rows.get_tensor()
out_tensor._set_dims(in_tensor._get_dims())
        # create and run the scale operator
scale_op = Operator("scale", X=in_name, Out=out_name, scale=scale)
scale_op.run(scope, place)
# get and compare result
out_height = out_selected_rows.height()
out_rows = out_selected_rows.rows()
result_array = np.array(out_tensor)
assert (in_array * scale == result_array).all()
assert in_height == out_height
assert in_rows == out_rows
def test_scale_selected_rows(self):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
for place in places:
self.check_with_place(place, 'in', 'out')
def test_scale_selected_rows_inplace(self):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
for place in places:
self.check_with_place(place, 'in', 'in')
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
class TestSequencePadOp(OpTest):
def set_attr(self):
self.x_shape = [12, 4]
self.x_len_lod = [[2, 3, 4, 3]]
self.pad_value = [1.0]
self.padded_length = -1
self.dtype = 'float32'
def set_data(self):
x_data = np.random.uniform(0.1, 0.5, self.x_shape).astype(self.dtype)
pad_value_data = np.array(self.pad_value).astype(self.dtype)
self.inputs = {
'X': (x_data, self.x_len_lod),
'PadValue': pad_value_data
}
self.attrs = {'padded_length': self.padded_length}
def compute(self):
# get padded length
padded_length = self.padded_length
x_len_lod_0 = self.x_len_lod[0]
if padded_length == -1:
max_seq_len = 0
for l in x_len_lod_0:
max_seq_len = max(max_seq_len, l)
padded_length = max_seq_len
# do padding
x_data = self.inputs['X'][0]
pad_value_data = self.inputs['PadValue']
if pad_value_data.shape == (1, ):
pad_value_data = np.broadcast_to(
pad_value_data, shape=x_data.shape[1:])
padded_sequences = []
start_idx = 0
for l in x_len_lod_0:
end_idx = start_idx + l
seq = x_data[start_idx:end_idx]
to_pad_len = padded_length - l
for _ in range(to_pad_len):
seq = np.append(seq, pad_value_data[np.newaxis, :], axis=0)
padded_sequences.append(seq)
start_idx = end_idx
out_data = np.array(padded_sequences)
self.outputs = {'Out': out_data}
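    # With the default settings above (x of shape [12, 4], lod [[2, 3, 4, 3]], padded_length -1),
    # every sequence is padded to the longest length 4, so 'Out' has shape [4, 4, 4].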
def setUp(self):
self.op_type = 'sequence_pad'
self.set_attr()
self.set_data()
self.compute()
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Out")
class TestSequencePadOp2(TestSequencePadOp):
def set_attr(self):
self.x_shape = [12, 4]
self.x_len_lod = [[2, 3, 4, 3]]
self.pad_value = [1.0, 2.0, 3.0, 4.0]
self.padded_length = -1
self.dtype = 'float32'
class TestSequencePadOp3(TestSequencePadOp):
def set_attr(self):
self.x_shape = [12, 4]
self.x_len_lod = [[2, 3, 4, 3]]
self.pad_value = [1.0]
self.padded_length = 7
self.dtype = 'float32'
class TestSequencePadOp4(TestSequencePadOp):
def set_attr(self):
self.x_shape = [12, 4]
self.x_len_lod = [[2, 3, 4, 3]]
self.pad_value = [1.0, 2.0, 3.0, 4.0]
self.padded_length = 7
self.dtype = 'float32'
class TestSequencePadOp5(TestSequencePadOp):
def set_attr(self):
self.x_shape = [12, 2, 2]
self.x_len_lod = [[2, 3, 4, 3]]
self.pad_value = [1.0]
self.padded_length = -1
self.dtype = 'float32'
class TestSequencePadOp6(TestSequencePadOp):
def set_attr(self):
self.x_shape = [12, 2, 2]
self.x_len_lod = [[2, 3, 4, 3]]
self.pad_value = [[1.0, 2.0], [3.0, 4.0]]
self.padded_length = -1
self.dtype = 'float32'
class TestSequencePadOp7(TestSequencePadOp):
def set_attr(self):
self.x_shape = [12, 2, 2]
self.x_len_lod = [[2, 3, 4, 3]]
self.pad_value = [1.0]
self.padded_length = 7
self.dtype = 'float32'
...@@ -59,6 +59,27 @@ class TestTensor(unittest.TestCase): ...@@ -59,6 +59,27 @@ class TestTensor(unittest.TestCase):
self.assertAlmostEqual(1.0, tensor_array_2[3, 9]) self.assertAlmostEqual(1.0, tensor_array_2[3, 9])
self.assertAlmostEqual(2.0, tensor_array_2[19, 11]) self.assertAlmostEqual(2.0, tensor_array_2[19, 11])
def test_int8_tensor(self):
scope = core.Scope()
var = scope.var("int8_tensor")
cpu_tensor = var.get_tensor()
tensor_array = numpy.random.randint(
-127, high=128, size=[100, 200], dtype=numpy.int8)
place = core.CPUPlace()
cpu_tensor.set(tensor_array, place)
cpu_tensor_array_2 = numpy.array(cpu_tensor)
self.assertAlmostEqual(cpu_tensor_array_2.all(), tensor_array.all())
if core.is_compiled_with_cuda():
cuda_tensor = var.get_tensor()
tensor_array = numpy.random.randint(
-127, high=128, size=[100, 200], dtype=numpy.int8)
place = core.CUDAPlace(0)
cuda_tensor.set(tensor_array, place)
cuda_tensor_array_2 = numpy.array(cuda_tensor)
self.assertAlmostEqual(cuda_tensor_array_2.all(),
tensor_array.all())
def test_int_lod_tensor(self): def test_int_lod_tensor(self):
place = core.CPUPlace() place = core.CPUPlace()
scope = core.Scope() scope = core.Scope()
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from op_test import OpTest
import numpy as np
import unittest
class TestUnStackOpBase(OpTest):
def initDefaultParameters(self):
self.input_dim = (5, 6, 7)
self.axis = 0
self.dtype = 'float32'
def initParameters(self):
pass
def get_y_names(self):
y_names = []
for i in range(self.input_dim[self.axis]):
y_names.append('y{}'.format(i))
return y_names
def setUp(self):
self.initDefaultParameters()
self.initParameters()
self.op_type = 'unstack'
self.x = np.random.random(size=self.input_dim).astype(self.dtype)
outs = np.split(self.x, self.input_dim[self.axis], self.axis)
new_shape = list(self.input_dim)
del new_shape[self.axis]
y_names = self.get_y_names()
tmp = []
for i in range(self.input_dim[self.axis]):
tmp.append((y_names[i], np.reshape(outs[i], new_shape)))
self.inputs = {'X': self.x}
self.outputs = {'Y': tmp}
self.attrs = {'axis': self.axis, 'num': self.input_dim[self.axis]}
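    # The reference outputs split X along `axis` into input_dim[axis] slices and drop that
    # axis from each slice, which is what the unstack op is expected to produce.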
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad('X', self.get_y_names())
class TestStackOp3(TestUnStackOpBase):
def initParameters(self):
self.axis = -1
class TestStackOp4(TestUnStackOpBase):
def initParameters(self):
self.axis = -3
class TestStackOp5(TestUnStackOpBase):
def initParameters(self):
self.axis = 1
class TestStackOp6(TestUnStackOpBase):
def initParameters(self):
self.axis = 2
if __name__ == '__main__':
unittest.main()
...@@ -31,7 +31,8 @@ class TestVariable(unittest.TestCase): ...@@ -31,7 +31,8 @@ class TestVariable(unittest.TestCase):
self.assertEqual(DT.INT16, convert("int16")) self.assertEqual(DT.INT16, convert("int16"))
self.assertEqual(DT.INT64, convert("int64")) self.assertEqual(DT.INT64, convert("int64"))
self.assertEqual(DT.BOOL, convert("bool")) self.assertEqual(DT.BOOL, convert("bool"))
self.assertRaises(ValueError, lambda: convert("int8")) self.assertEqual(DT.INT8, convert("int8"))
self.assertEqual(DT.UINT8, convert("uint8"))
def test_var(self): def test_var(self):
b = default_main_program().current_block() b = default_main_program().current_block()
......
...@@ -31,7 +31,6 @@ Steps to transpile pserver: ...@@ -31,7 +31,6 @@ Steps to transpile pserver:
""" """
import math import math
import random
import numpy as np import numpy as np
import collections import collections
import six import six
@@ -239,8 +238,8 @@ class DistributeTranspiler(object):
         grad_var_mapping_items = list(six.iteritems(self.grad_var_mapping))

         if not self.config.slice_var_up:
-            random.seed(self.origin_program.random_seed)
-            random.shuffle(grad_var_mapping_items)
+            np.random.seed(self.origin_program.random_seed)
+            np.random.shuffle(grad_var_mapping_items)

         grad_name_to_send_dummy_out = dict()
         for grad_varname, splited_vars in grad_var_mapping_items:
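The hunk above switches the shuffle from Python's random module to numpy's. A short sketch of the property being relied on: seeding before the in-place shuffle makes the order deterministic for a given random_seed, so every process transpiling the same program ends up with the same gradient ordering (the item list below is illustrative only):

import numpy as np

grad_var_mapping_items = [('w0@GRAD', ['w0@GRAD.block0']),
                          ('w1@GRAD', ['w1@GRAD.block0']),
                          ('b0@GRAD', ['b0@GRAD.block0'])]
np.random.seed(1)                          # e.g. origin_program.random_seed
np.random.shuffle(grad_var_mapping_items)  # in-place; same seed gives the same order everywhere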
@@ -284,10 +283,13 @@ class DistributeTranspiler(object):
             send_vars.append(var)

         if self.sync_mode:
+            send_barrier_out = program.global_block().create_var(
+                name=framework.generate_control_dev_var_name())
+            input_deps = grad_name_to_send_dummy_out.values()
             program.global_block().append_op(
                 type="send_barrier",
-                inputs={},
-                outputs={},
+                inputs={"X": input_deps},
+                outputs={"Out": send_barrier_out},
                 attrs={
                     "endpoints": pserver_endpoints,
                     RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
@@ -305,16 +307,22 @@ class DistributeTranspiler(object):
             self.param_grad_ep_mapping[ep]["grads"].append(send_vars[i])

         # step4: Concat the parameters splits together after recv.
+        all_recv_outputs = []
         for param_varname, splited_var in six.iteritems(self.param_var_mapping):
             eps = []
             for var in splited_var:
                 index = [v.name for v in recv_vars].index(var.name)
                 eps.append(eplist[index])
-            grad_send_dummy_out = grad_name_to_send_dummy_out[
-                self.param_name_to_grad_name[param_varname]]
+            if self.sync_mode:
+                recv_dep_in = send_barrier_out
+            else:
+                # connect deps to send op in async mode
+                recv_dep_in = grad_name_to_send_dummy_out[
+                    self.param_name_to_grad_name[param_varname]]
+            all_recv_outputs.extend(splited_var)
             program.global_block().append_op(
                 type="recv",
-                inputs={"X": [grad_send_dummy_out]},
+                inputs={"X": [recv_dep_in]},
                 outputs={"Out": splited_var},
                 attrs={
                     "epmap": eps,
@@ -327,10 +335,11 @@ class DistributeTranspiler(object):
                 })

         if self.sync_mode:
+            # form a WAW dependency
             program.global_block().append_op(
                 type="fetch_barrier",
                 inputs={},
-                outputs={},
+                outputs={"Out": all_recv_outputs},
                 attrs={
                     "endpoints": pserver_endpoints,
                     RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
@@ -414,10 +423,12 @@ class DistributeTranspiler(object):
                 RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
             })

+        fetch_barrier_out = startup_program.global_block().create_var(
+            name=framework.generate_control_dev_var_name())
         startup_program.global_block().append_op(
             type="fetch_barrier",
             inputs={},
-            outputs={},
+            outputs={"Out": fetch_barrier_out},
             attrs={
                 "endpoints": self.pserver_endpoints,
                 RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
......
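Taken together, the hunks above stop passing empty input/output maps to the RPC barrier ops: send_barrier now consumes the dummy outputs of every send and produces a control variable, recv depends on that variable (or on the matching send's dummy output in async mode), and fetch_barrier rewrites all recv outputs to form a WAW dependency. A self-contained toy sketch (plain Python, not fluid) of why those explicit variables matter to a dependency-driven executor:

# Each op only stays ordered after its predecessor if it reads or rewrites
# something the predecessor produced; the variable names are illustrative.
ops = [
    {"type": "send",          "in": ["w0@GRAD"],      "out": ["send.dummy"]},
    {"type": "send_barrier",  "in": ["send.dummy"],   "out": ["ctrl.barrier"]},
    {"type": "recv",          "in": ["ctrl.barrier"], "out": ["w0.block0"]},
    {"type": "fetch_barrier", "in": [],               "out": ["w0.block0"]},  # WAW on recv outputs
]

def ordered(a, b):
    # b must wait for a if a produces something b reads or overwrites
    return bool(set(a["out"]) & set(b["in"] + b["out"]))

assert all(ordered(ops[i], ops[i + 1]) for i in range(len(ops) - 1))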