Merge remote-tracking branch 'ups/develop' into refine/ut/lac

06180779 · tensor-tang · 6b104c90 · 6de0a18d · 06180779 · 06180779
63 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -213,9 +213,11 @@ include(configure)          # add paddle env configuration
 if(WITH_GPU)
    include(cuda)
    include(tensorrt)
+endif()
+if(WITH_MKL OR WITH_MKLML)
    include(external/anakin)
 elseif()
-    set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in GPU only now." FORCE)
+    set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in MKL only now." FORCE)
 endif()
 include(generic)            # simplify cmake module

--- a/cmake/external/anakin.cmake
+++ b/cmake/external/anakin.cmake
@@ -16,16 +16,6 @@ set(ANAKIN_LIBRARY     ${ANAKIN_INSTALL_DIR})
 set(ANAKIN_SHARED_LIB  ${ANAKIN_LIBRARY}/libanakin.so)
 set(ANAKIN_SABER_LIB   ${ANAKIN_LIBRARY}/libanakin_saber_common.so)
-# TODO(luotao): ANAKIN_MODLE_URL etc will move to demo ci later.
-set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com")
-set(ANAKIN_MODLE_URL "${INFERENCE_URL}/mobilenet_v2.anakin.bin")
-set(ANAKIN_RNN_MODLE_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn.anakin2.model.bin")
-set(ANAKIN_RNN_DATA_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn_data.txt")
-execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}")
-execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL} -N")
-execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_MODLE_URL} -N")
-execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_DATA_URL} -N")
 include_directories(${ANAKIN_INCLUDE})
 include_directories(${ANAKIN_INCLUDE}/saber/)
 include_directories(${ANAKIN_INCLUDE}/saber/core/)
@@ -48,6 +38,11 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
    -Wno-reorder
    -Wno-error=cpp)
+if(WITH_GPU)
+    set(CMAKE_ARGS_PREFIX -DUSE_GPU_PLACE=YES -DCUDNN_ROOT=${CUDNN_ROOT} -DCUDNN_INCLUDE_DIR=${CUDNN_INCLUDE_DIR})
+else()
+    set(CMAKE_ARGS_PREFIX -DUSE_GPU_PLACE=NO)
+endif()
 ExternalProject_Add(
    extern_anakin
    ${EXTERNAL_PROJECT_LOG_ARGS}
@@ -56,13 +51,11 @@ ExternalProject_Add(
    GIT_TAG             "9424277cf9ae180a14aff09560d3cd60a49c76d2"
    PREFIX              ${ANAKIN_SOURCE_DIR}
    UPDATE_COMMAND      ""
-    CMAKE_ARGS          -DUSE_GPU_PLACE=YES
+    CMAKE_ARGS          ${CMAKE_ARGS_PREFIX}
                        -DUSE_X86_PLACE=YES
                        -DBUILD_WITH_UNIT_TEST=NO
                        -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
                        -DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml
-                        -DCUDNN_ROOT=${CUDNN_ROOT}
-                        -DCUDNN_INCLUDE_DIR=${CUDNN_INCLUDE_DIR}
                        -DENABLE_OP_TIMER=${ANAKIN_ENABLE_OP_TIMER}
                        ${EXTERNAL_OPTIONAL_ARGS}
    CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR}

--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -145,7 +145,7 @@ copy(memory_lib
 set(inference_deps paddle_fluid_shared paddle_fluid)
 set(module "inference/api")
-if (WITH_ANAKIN AND WITH_GPU)
+if (WITH_ANAKIN AND WITH_MKL)
    copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
        SRCS
        ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api

--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -822,6 +822,14 @@ pad
 ..  autofunction:: paddle.fluid.layers.pad
    :noindex:
+.. _api_fluid_layers_pad_constant_like:
+pad_constant_like
+---
+..  autofunction:: paddle.fluid.layers.pad_constant_like
+    :noindex:
 .. _api_fluid_layers_label_smooth:
 label_smooth
@@ -1145,6 +1153,14 @@ sigmoid
 ..  autofunction:: paddle.fluid.layers.sigmoid
    :noindex:
+.. _api_fluid_layers_hsigmoid:
+hsigmoid
+-------
+..  autofunction:: paddle.fluid.layers.hsigmoid
+    :noindex:
 .. _api_fluid_layers_logsigmoid:
 logsigmoid

--- a/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md
+++ b/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md
@@ -104,6 +104,7 @@ visualDL --logdir=scratch_log --port=8080
 # 访问 http://127.0.0.1:8080
 ```
+如果出现`TypeError: __init__() got an unexpected keyword argument 'file'`, 是因为protobuf不是3.5以上，运行`pip install --upgrade protobuf`就能解决。
 如果在虚拟环境下仍然遇到安装问题，请尝试以下方法。

--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -43,6 +43,7 @@ paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list',
 paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
 paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.Trainer.__init__ ArgSpec(args=['self', 'train_func', 'optimizer_func', 'param_path', 'place', 'parallel', 'checkpoint_config'], varargs=None, keywords=None, defaults=(None, None, False, None))
+paddle.fluid.Trainer.save_inference_model ArgSpec(args=['self', 'param_path', 'feeded_var_names', 'target_var_indexes'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Trainer.save_params ArgSpec(args=['self', 'param_path'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Trainer.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Trainer.test ArgSpec(args=['self', 'reader', 'feed_order'], varargs=None, keywords=None, defaults=None)

--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
@@ -13,13 +13,10 @@
 // limitations under the License.
 #include "paddle/fluid/framework/ir/attention_lstm_fuse_pass.h"
 #include <string>
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/inference/api/helper.h"
 namespace paddle {
 namespace framework {

--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
@@ -11,6 +11,7 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h"
 #include <string>
 #include "paddle/fluid/framework/lod_tensor.h"

--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -85,7 +85,7 @@ void GraphPatternDetector::operator()(Graph* graph,
  LOG(INFO) << "detect " << subgraphs.size() << " subgraph matches the pattern";
  int id = 0;
  for (auto& g : subgraphs) {
-    LOG(INFO) << "optimizing #" << id++ << " subgraph";
+    VLOG(3) << "optimizing #" << id++ << " subgraph";
    handler(g, graph);
  }
 }

--- a/paddle/fluid/framework/ir/graph_viz_pass.cc
+++ b/paddle/fluid/framework/ir/graph_viz_pass.cc
@@ -50,20 +50,37 @@ std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
  Dot dot;
-  std::vector<Dot::Attr> op_attrs({Dot::Attr("style", "filled"),
+  const std::vector<Dot::Attr> op_attrs({
-                                   Dot::Attr("shape", "box"),
+      Dot::Attr("style", "rounded,filled,bold"),  //
-                                   Dot::Attr("fillcolor", "red")});
+      Dot::Attr("shape", "box"),                  //
-  std::vector<Dot::Attr> var_attrs({Dot::Attr("style", "filled,rounded"),
+      Dot::Attr("color", "#303A3A"),              //
-                                    // Dot::Attr("shape", "diamond"),
+      Dot::Attr("fontcolor", "#ffffff"),          //
-                                    Dot::Attr("fillcolor", "yellow")});
+      Dot::Attr("width", "1.3"),                  //
+      Dot::Attr("height", "0.84"),                //
-  std::vector<Dot::Attr> marked_op_attrs({Dot::Attr("style", "filled"),
+      Dot::Attr("fontname", "Arial"),             //
-                                          Dot::Attr("shape", "box"),
+  });
-                                          Dot::Attr("fillcolor", "lightgray")});
+  const std::vector<Dot::Attr> arg_attrs({
-  std::vector<Dot::Attr> marked_var_attrs(
+      Dot::Attr("shape", "box"),                  //
-      {Dot::Attr("style", "filled,rounded"),
+      Dot::Attr("style", "rounded,filled,bold"),  //
-       // Dot::Attr("shape", "diamond"),
+      Dot::Attr("fontname", "Arial"),             //
-       Dot::Attr("fillcolor", "lightgray")});
+      Dot::Attr("fillcolor", "#999999"),          //
+      Dot::Attr("color", "#dddddd"),              //
+  });
+  const std::vector<Dot::Attr> param_attrs({
+      Dot::Attr("shape", "box"),                  //
+      Dot::Attr("style", "rounded,filled,bold"),  //
+      Dot::Attr("fontname", "Arial"),             //
+      Dot::Attr("color", "#148b97"),              //
+      Dot::Attr("fontcolor", "#ffffff"),          //
+  });
+  const std::vector<Dot::Attr> marked_op_attrs(
+      {Dot::Attr("style", "rounded,filled,bold"), Dot::Attr("shape", "box"),
+       Dot::Attr("fillcolor", "yellow")});
+  const std::vector<Dot::Attr> marked_var_attrs(
+      {Dot::Attr("style", "filled,rounded"), Dot::Attr("shape", "box"),
+       Dot::Attr("fillcolor", "yellow")});
  auto marked_nodes = ConsumeMarkedNodes(graph.get());
  // Create nodes
@@ -74,9 +91,17 @@ std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
          marked_nodes.count(n) ? marked_op_attrs : op_attrs;
      dot.AddNode(node_id, attr, node_id);
    } else if (n->IsVar()) {
-      decltype(op_attrs) attr =
+      decltype(op_attrs)* attr;
-          marked_nodes.count(n) ? marked_var_attrs : var_attrs;
+      if (marked_nodes.count(n)) {
-      dot.AddNode(node_id, attr, node_id);
+        attr = &marked_var_attrs;
+      } else if (const_cast<Node*>(n)->Var() &&
+                 const_cast<Node*>(n)->Var()->Persistable()) {
+        attr = &param_attrs;
+      } else {
+        attr = &arg_attrs;
+      }
+      dot.AddNode(node_id, *attr, node_id);
    }
    node2dot[n] = node_id;
  }

--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -6,6 +6,7 @@ cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits
  analyzer.cc
  helper.cc
  # passes
+  analysis_pass.cc
  fluid_to_data_flow_graph_pass.cc
  data_flow_graph_to_fluid_pass.cc
  dfg_graphviz_draw_pass.cc
@@ -99,12 +100,17 @@ inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
 set(TEXT_CLASSIFICATION_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/text-classification-Senta.tar.gz")
+set(TEXT_CLASSIFICATION_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/text_classification_data.txt.tar.gz")
 set(TEXT_CLASSIFICATION_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/text_classification" CACHE PATH "Text Classification model and data root." FORCE)
 if (NOT EXISTS ${TEXT_CLASSIFICATION_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
  inference_download_and_uncompress(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} "text-classification-Senta.tar.gz")
+  inference_download_and_uncompress(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_DATA_URL} "text_classification_data.txt.tar.gz")
 endif()
-inference_analysis_test(test_text_classification SRCS test_text_classification.cc
+inference_analysis_test(test_text_classification SRCS analyzer_text_classification_tester.cc
    EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
-    ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta)
+    ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta
+         --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt
+         --topn=1 # Just run top 1 batch.
+    )
--- a/paddle/fluid/inference/analysis/pass.cc
+++ b/paddle/fluid/inference/analysis/pass.cc
@@ -12,4 +12,4 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/inference/analysis/pass.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
--- a/paddle/fluid/inference/analysis/pass.h
+++ b/paddle/fluid/inference/analysis/pass.h
@@ -28,10 +28,10 @@ namespace paddle {
 namespace inference {
 namespace analysis {
-class Pass {
+class AnalysisPass {
 public:
-  Pass() = default;
+  AnalysisPass() = default;
-  virtual ~Pass() = default;
+  virtual ~AnalysisPass() = default;
  // Mutable Pass.
  virtual bool Initialize(Argument *argument) { return false; }
  // Readonly Pass.
@@ -42,23 +42,16 @@ class Pass {
  virtual bool Finalize() { return false; }
  // Get a Pass appropriate to print the Node this pass operates on.
-  virtual Pass *CreatePrinterPass(std::ostream &os,
+  virtual AnalysisPass *CreatePrinterPass(std::ostream &os,
-                                  const std::string &banner) const {
+                                          const std::string &banner) const {
    return nullptr;
  }
  // Create a debugger Pass that draw the DFG by graphviz toolkit.
-  virtual Pass *CreateGraphvizDebugerPass() const { return nullptr; }
+  virtual AnalysisPass *CreateGraphvizDebugerPass() const { return nullptr; }
-  virtual void Run() { LOG(FATAL) << "not valid"; }
-  // Run on a single Node.
-  virtual void Run(Node *x) { LOG(FATAL) << "not valid"; }
-  // Run on a single Function.
-  virtual void Run(Function *x) { LOG(FATAL) << "not valid"; }
-  // Run on a single FunctionBlock.
-  virtual void Run(FunctionBlock *x) { LOG(FATAL) << "not valid"; }
  // Run on a single DataFlowGraph.
-  virtual void Run(DataFlowGraph *x) { LOG(FATAL) << "not valid"; }
+  virtual void Run(DataFlowGraph *x) = 0;
  // Human-readable short representation.
  virtual std::string repr() const = 0;
@@ -66,29 +59,8 @@ class Pass {
  virtual std::string description() const { return "No DOC"; }
 };
-// NodePass process on any Node types.
-class NodePass : public Pass {
- public:
-  virtual void Run(Node *node) = 0;
-};
-// NodePass process on any Function node types.
-class FunctionPass : public Pass {
- public:
-  virtual void Run(Function *node) = 0;
-};
-// NodePass process on any FunctionBlock node types.
-class FunctionBlockPass : public Pass {
- public:
-  virtual void Run(FunctionBlock *node) = 0;
-};
 // GraphPass processes on any GraphType.
-class DataFlowGraphPass : public Pass {
+class DataFlowGraphPass : public AnalysisPass {};
- public:
-  virtual void Run(DataFlowGraph *graph) = 0;
-};
 }  // namespace analysis
 }  // namespace inference

--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include <string>
 #include <vector>
 #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
 #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
 #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
@@ -58,7 +59,7 @@ class DfgPassManagerImpl final : public DfgPassManager {
  std::string description() const override { return "DFG pass manager."; }
 private:
-  void AddPass(const std::string& name, Pass* pass) {
+  void AddPass(const std::string& name, AnalysisPass* pass) {
    VLOG(3) << "Adding pass " << name;
    Register(name, pass);
    AddGraphvizDebugerPass(pass);
@@ -87,7 +88,7 @@ class DfgPassManagerImpl final : public DfgPassManager {
  }
  // Add the graphviz debuger pass if the parent pass has one.
-  void AddGraphvizDebugerPass(Pass* pass) {
+  void AddGraphvizDebugerPass(AnalysisPass* pass) {
    auto* debuger_pass = pass->CreateGraphvizDebugerPass();
    if (debuger_pass) {
      Register(debuger_pass->repr(), debuger_pass);
@@ -106,7 +107,6 @@ void Analyzer::Run(Argument* argument) {
    }
  }
  passes.push_back("graph_viz_pass");
-  // Ugly support fluid-to-ir-pass
  argument->Set(kFluidToIrPassesAttr, new std::vector<std::string>(passes));
  for (auto& x : data_) {

--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -38,8 +38,9 @@ limitations under the License. */
 #include <gflags/gflags.h>
 #include <string>
 #include <vector>
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/inference/analysis/flags.h"
-#include "paddle/fluid/inference/analysis/pass.h"
 #include "paddle/fluid/inference/analysis/pass_manager.h"
 namespace paddle {

--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -16,6 +16,7 @@
 #include <google/protobuf/text_format.h>
 #include <gtest/gtest.h>
+#include <thread>  // NOLINT
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
@@ -24,12 +25,12 @@
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/inference/utils/singleton.h"
-#include "paddle/fluid/platform/profiler.h"
 DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN");
 DEFINE_string(infer_ditu_rnn_data, "", "data path for ditu RNN");
 DEFINE_int32(batch_size, 10, "batch size.");
 DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
+DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
 namespace paddle {
 namespace inference {
@@ -220,39 +221,6 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
  }
 }
-std::string DescribeTensor(const PaddleTensor &tensor) {
-  std::stringstream os;
-  os << "Tensor [" << tensor.name << "]\n";
-  os << " - type: ";
-  switch (tensor.dtype) {
-    case PaddleDType::FLOAT32:
-      os << "float32";
-      break;
-    case PaddleDType::INT64:
-      os << "int64";
-      break;
-    default:
-      os << "unset";
-  }
-  os << '\n';
-  os << " - shape: " << to_string(tensor.shape) << '\n';
-  os << " - lod: ";
-  for (auto &l : tensor.lod) {
-    os << to_string(l) << "; ";
-  }
-  os << "\n";
-  os << " - data: ";
-  int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1,
-                            [](int a, int b) { return a * b; });
-  for (int i = 0; i < dim; i++) {
-    os << static_cast<float *>(tensor.data.data())[i] << " ";
-  }
-  os << '\n';
-  return os.str();
-}
 }  // namespace
 const float ditu_rnn_target_data[] = {
@@ -266,11 +234,29 @@ const float ditu_rnn_target_data[] = {
    10.7286, 12.0595, 10.6672, 0,       0,       0,       0,       0,
    93.5771, 3.84641, 0,       0,       0,       0,       0,       0,
    169.426, 0,       0,       0,       0,       0,       0,       0};
+void CompareResult(const std::vector<PaddleTensor> &outputs,
+                   const std::vector<PaddleTensor> &base_outputs) {
+  PADDLE_ENFORCE_GT(outputs.size(), 0);
+  PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
+  for (size_t i = 0; i < outputs.size(); i++) {
+    auto &out = outputs[i];
+    auto &base_out = base_outputs[i];
+    size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
+                                  [](int a, int b) { return a * b; });
+    size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
+                                   1, [](int a, int b) { return a * b; });
+    PADDLE_ENFORCE_EQ(size, size1);
+    PADDLE_ENFORCE_GT(size, 0);
+    float *data = static_cast<float *>(out.data.data());
+    float *base_data = static_cast<float *>(base_out.data.data());
+    for (size_t i = 0; i < size; i++) {
+      EXPECT_NEAR(data[i], base_data[i], 1e-3);
+    }
+  }
+}
 // Test with a really complicate model.
-void TestDituRNNPrediction(const std::string &model_path,
+void TestDituRNNPrediction(bool use_analysis, bool activate_ir,
-                           const std::string &data_path, int batch_size,
+                           int num_threads) {
-                           bool use_analysis, bool activate_ir,
-                           int num_times = 1) {
  AnalysisConfig config;
  config.prog_file = FLAGS_infer_ditu_rnn_model + "/__model__";
  config.param_file = FLAGS_infer_ditu_rnn_model + "/param";
@@ -281,6 +267,8 @@ void TestDituRNNPrediction(const std::string &model_path,
  PADDLE_ENFORCE(config.ir_mode ==
                 AnalysisConfig::IrPassMode::kExclude);  // default
  config.ir_passes.clear();  // Do not exclude any pass.
+  int batch_size = FLAGS_batch_size;
+  int num_times = FLAGS_repeat;
  auto base_predictor =
      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
@@ -288,40 +276,55 @@ void TestDituRNNPrediction(const std::string &model_path,
      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
          config);
  std::vector<PaddleTensor> input_slots;
-  DataRecord data(data_path, batch_size);
+  DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size);
  // Prepare inputs.
  PrepareInputs(&input_slots, &data, batch_size);
  std::vector<PaddleTensor> outputs, base_outputs;
  base_predictor->Run(input_slots, &base_outputs);
-  Timer timer;
-  timer.tic();
-  for (int i = 0; i < num_times; i++) {
-    predictor->Run(input_slots, &outputs);
-  }
  LOG(INFO) << "===========profile result===========";
-  LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << num_times
+  if (num_threads == 1) {
-            << ", latency: " << timer.toc() / num_times << "ms";
+    // Prepare inputs.
-  LOG(INFO) << "=====================================";
+    Timer timer;
+    timer.tic();
-  PADDLE_ENFORCE_GT(outputs.size(), 0);
+    for (int i = 0; i < num_times; i++) {
-  PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
+      predictor->Run(input_slots, &outputs);
-  for (size_t i = 0; i < outputs.size(); i++) {
+    }
-    auto &out = outputs[i];
+    PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times);
-    auto &base_out = base_outputs[i];
+    CompareResult(outputs, base_outputs);
-    size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
+  } else {
-                                  [](int a, int b) { return a * b; });
+    std::vector<std::thread> threads;
-    size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
+    std::vector<std::unique_ptr<PaddlePredictor>> predictors;
-                                   1, [](int a, int b) { return a * b; });
+    // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled
-    PADDLE_ENFORCE_EQ(size, size1);
+    // because AttentionLSTM's hard code nodeid will be damanged.
-    PADDLE_ENFORCE_GT(size, 0);
+    for (int tid = 0; tid < num_threads; ++tid) {
-    float *data = static_cast<float *>(out.data.data());
+      predictors.emplace_back(
-    float *base_data = static_cast<float *>(base_out.data.data());
+          CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-    for (size_t j = 0; j < size; j++) {
+              config));
-      EXPECT_NEAR(data[j], base_data[j], 1e-3);
+    }
+    for (int tid = 0; tid < num_threads; ++tid) {
+      threads.emplace_back([&, tid]() {
+        // Each thread should have local input_slots and outputs.
+        std::vector<PaddleTensor> input_slots;
+        DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size);
+        PrepareInputs(&input_slots, &data, batch_size);
+        std::vector<PaddleTensor> outputs;
+        Timer timer;
+        timer.tic();
+        for (int i = 0; i < num_times; i++) {
+          predictors[tid]->Run(input_slots, &outputs);
+        }
+        PrintTime(batch_size, num_times, num_threads, tid,
+                  timer.toc() / num_times);
+        CompareResult(outputs, base_outputs);
+      });
+    }
+    for (int i = 0; i < num_threads; ++i) {
+      threads[i].join();
    }
  }
+  LOG(INFO) << "=====================================";
  if (use_analysis && activate_ir) {
    AnalysisPredictor *analysis_predictor =
@@ -350,25 +353,26 @@ void TestDituRNNPrediction(const std::string &model_path,
  }
 }
-// Directly infer with the original model.
+// Inference with analysis and IR, easy for profiling independently.
-TEST(Analyzer, DituRNN_without_analysis) {
+TEST(Analyzer, DituRNN) {
-  TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
+  TestDituRNNPrediction(true, true, FLAGS_num_threads);
-                        FLAGS_batch_size, false, false, FLAGS_repeat);
 }
-// Inference with the original model with the analysis turned on, the analysis
+// Other unit-tests of DituRNN, test different options of use_analysis,
-// module will transform the program to a data flow graph.
+// activate_ir and multi-threads.
-TEST(Analyzer, DituRNN_with_analysis) {
+TEST(Analyzer, DituRNN_tests) {
-  LOG(INFO) << "ditu rnn with analysis";
+  int num_threads[2] = {1, 4};
-  TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
+  for (auto i : num_threads) {
-                        FLAGS_batch_size, true, false, FLAGS_repeat);
+    // Directly infer with the original model.
-}
+    TestDituRNNPrediction(false, false, i);
+    // Inference with the original model with the analysis turned on, the
-// Inference with analysis and IR. The IR module will fuse some large kernels.
+    // analysis
-TEST(Analyzer, DituRNN_with_analysis_with_IR) {
+    // module will transform the program to a data flow graph.
-  LOG(INFO) << "ditu rnn with analysis and IR fuse";
+    TestDituRNNPrediction(true, false, i);
-  TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
+    // Inference with analysis and IR. The IR module will fuse some large
-                        FLAGS_batch_size, true, true, FLAGS_repeat);
+    // kernels.
+    TestDituRNNPrediction(true, true, i);
+  }
 }
 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/test_text_classification.cc
+++ b/paddle/fluid/inference/analysis/test_text_classification.cc
@@ -12,19 +12,23 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include "paddle/fluid/inference/analysis/analyzer.h"
 #include <gflags/gflags.h>
 #include <glog/logging.h>  // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
 #include <gtest/gtest.h>
+#include <fstream>
 #include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/inference/api/timer.h"
 DEFINE_string(infer_model, "", "Directory of the inference model.");
 DEFINE_string(infer_data, "", "Path of the dataset.");
 DEFINE_int32(batch_size, 1, "batch size.");
 DEFINE_int32(repeat, 1, "How many times to repeat run.");
+DEFINE_int32(topn, -1, "Run top n batches of data to save time");
 namespace paddle {
@@ -44,41 +48,67 @@ void PrintTime(const double latency, const int bs, const int repeat) {
  LOG(INFO) << "=====================================";
 }
-void Main(int batch_size) {
+struct DataReader {
-  // Three sequence inputs.
+  DataReader(const std::string &path) : file(new std::ifstream(path)) {}
-  std::vector<PaddleTensor> input_slots(1);
-  // one batch starts
+  bool NextBatch(PaddleTensor *tensor, int batch_size) {
-  // data --
+    PADDLE_ENFORCE_EQ(batch_size, 1);
-  int64_t data0[] = {0, 1, 2};
+    std::string line;
-  for (auto &input : input_slots) {
+    tensor->lod.clear();
-    input.data.Reset(data0, sizeof(data0));
+    tensor->lod.emplace_back(std::vector<size_t>({0}));
-    input.shape = std::vector<int>({3, 1});
+    std::vector<int64_t> data;
-    // dtype --
-    input.dtype = PaddleDType::INT64;
+    for (int i = 0; i < batch_size; i++) {
-    // LoD --
+      if (!std::getline(*file, line)) return false;
-    input.lod = std::vector<std::vector<size_t>>({{0, 3}});
+      inference::split_to_int64(line, ' ', &data);
+    }
+    tensor->lod.front().push_back(data.size());
+    tensor->data.Resize(data.size() * sizeof(int64_t));
+    memcpy(tensor->data.data(), data.data(), data.size() * sizeof(int64_t));
+    tensor->shape.clear();
+    tensor->shape.push_back(data.size());
+    tensor->shape.push_back(1);
+    return true;
  }
+  std::unique_ptr<std::ifstream> file;
+};
+void Main(int batch_size) {
  // shape --
  // Create Predictor --
  AnalysisConfig config;
  config.model_dir = FLAGS_infer_model;
  config.use_gpu = false;
  config.enable_ir_optim = true;
-  config.ir_passes.push_back("fc_lstm_fuse_pass");
  auto predictor =
      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
          config);
+  std::vector<PaddleTensor> input_slots(1);
+  // one batch starts
+  // data --
+  auto &input = input_slots[0];
+  input.dtype = PaddleDType::INT64;
  inference::Timer timer;
  double sum = 0;
  std::vector<PaddleTensor> output_slots;
-  for (int i = 0; i < FLAGS_repeat; i++) {
-    timer.tic();
+  int num_batches = 0;
-    CHECK(predictor->Run(input_slots, &output_slots));
+  for (int t = 0; t < FLAGS_repeat; t++) {
-    sum += timer.toc();
+    DataReader reader(FLAGS_infer_data);
+    while (reader.NextBatch(&input, FLAGS_batch_size)) {
+      if (FLAGS_topn > 0 && num_batches > FLAGS_topn) break;
+      timer.tic();
+      CHECK(predictor->Run(input_slots, &output_slots));
+      sum += timer.toc();
+      ++num_batches;
+    }
  }
-  PrintTime(sum, batch_size, FLAGS_repeat);
+  PrintTime(sum, batch_size, num_batches);
  // Get output
  LOG(INFO) << "get outputs " << output_slots.size();
@@ -100,10 +130,3 @@ void Main(int batch_size) {
 TEST(text_classification, basic) { Main(FLAGS_batch_size); }
 }  // namespace paddle
-USE_PASS(fc_fuse_pass);
-USE_PASS(seq_concat_fc_fuse_pass);
-USE_PASS(fc_lstm_fuse_pass);
-USE_PASS(graph_viz_pass);
-USE_PASS(infer_clean_graph_pass);
-USE_PASS(attention_lstm_fuse_pass);
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
@@ -263,7 +263,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
 };
 }  // namespace
-Pass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
+AnalysisPass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
  return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
      FLAGS_IA_graphviz_log_root,
      "data_flow_graph_to_fluid_graphviz_debugger"));

--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
@@ -21,8 +21,8 @@
 #include <string>
 #include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
-#include "paddle/fluid/inference/analysis/pass.h"
 namespace paddle {
 namespace inference {
@@ -42,7 +42,7 @@ class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
    return "Transform a DFG to a Fluid ProgramDesc";
  }
-  Pass *CreateGraphvizDebugerPass() const override;
+  AnalysisPass *CreateGraphvizDebugerPass() const override;
 protected:
  // Add a Fluid Op into the ProgramDesc.

--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
@@ -21,8 +21,8 @@ limitations under the License. */
 #include <fstream>
 #include <string>
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/inference/analysis/dot.h"
-#include "paddle/fluid/inference/analysis/pass.h"
 namespace paddle {
 namespace inference {

--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
@@ -66,7 +66,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
 };
 }
-Pass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const {
+AnalysisPass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const {
  return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
      FLAGS_IA_graphviz_log_root, "fluid-to-dfg-debuger"));
 }

--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
@@ -22,8 +22,8 @@
 #include <string>
 #include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
-#include "paddle/fluid/inference/analysis/pass.h"
 namespace paddle {
 namespace inference {
@@ -46,7 +46,7 @@ class FluidToDataFlowGraphPass final : public DataFlowGraphPass {
    return "transform a fluid ProgramDesc to a data flow graph.";
  }
-  Pass *CreateGraphvizDebugerPass() const override;
+  AnalysisPass *CreateGraphvizDebugerPass() const override;
 private:
  framework::proto::ProgramDesc const *desc_;

--- a/paddle/fluid/inference/analysis/fluid_to_ir_pass.h
+++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass.h
@@ -14,15 +14,17 @@
 #pragma once
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/inference/analysis/flags.h"
 #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
-#include "paddle/fluid/inference/analysis/pass.h"
 namespace paddle {
 namespace inference {
 namespace analysis {
-using namespace framework;
 static const char kFluidToIrPassesAttr[] = "__fluid_to_ir_passes__";
@@ -48,7 +50,8 @@ class FluidToIrPass final : public DataFlowGraphPass {
    ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path);
    // Load program.
    auto program = LoadProgramDesc(*argument->fluid_model_program_path);
-    argument->origin_program_desc.reset(new proto::ProgramDesc(program));
+    argument->origin_program_desc.reset(
+        new framework::proto::ProgramDesc(program));
    // Create main data flow graph.
    if (!argument->main_dfg) {
      argument->main_dfg.reset(new DataFlowGraph);
@@ -78,12 +81,13 @@ class FluidToIrPass final : public DataFlowGraphPass {
    IRPassManager ir_passes(argument_->Get<ProgramDesc>("ir_program_desc"),
                            nullptr);
    // Pass the scope from analysis to IR if needed.
-    if (argument_->Has(ir::kParamScopeAttr)) {
+    if (argument_->Has(framework::ir::kParamScopeAttr)) {
      // Here the address is passed, attention that IR doesn't own the scope, so
      // the real scope in analysis should live during the IR phase.
      ir_passes.graph().Set(
-          ir::kParamScopeAttr,
+          framework::ir::kParamScopeAttr,
-          new Scope *(&argument_->Get<Scope>(ir::kParamScopeAttr)));
+          new framework::Scope *(&argument_->Get<framework::Scope>(
+              framework::ir::kParamScopeAttr)));
    }
    if (FLAGS_IA_enable_ir) {
@@ -95,12 +99,12 @@ class FluidToIrPass final : public DataFlowGraphPass {
    PADDLE_ENFORCE(argument_->main_dfg.get());
    argument_->main_dfg->Build(ir_passes.graph());
    // inherit the arguments from ir.
-    if (ir_passes.graph().Has(ir::kFuseStatisAttr)) {
+    if (ir_passes.graph().Has(framework::ir::kFuseStatisAttr)) {
      argument_->Set(
-          ir::kFuseStatisAttr,
+          framework::ir::kFuseStatisAttr,
          new std::unordered_map<std::string, int>(
              ir_passes.graph().Get<std::unordered_map<std::string, int>>(
-                  ir::kFuseStatisAttr)));
+                  framework::ir::kFuseStatisAttr)));
    }
  }
@@ -112,7 +116,7 @@ class FluidToIrPass final : public DataFlowGraphPass {
 private:
  // Load parameters from a single file or from a directory.
-  bool LoadParams(Scope *scope, const std::string &dir,
+  bool LoadParams(framework::Scope *scope, const std::string &dir,
                  const std::string &prog_file, const std::string &param_file);
 private:

--- a/paddle/fluid/inference/analysis/model_store_pass.h
+++ b/paddle/fluid/inference/analysis/model_store_pass.h
@@ -19,7 +19,7 @@
 #pragma once
 #include <string>
-#include "paddle/fluid/inference/analysis/pass.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
 namespace paddle {
 namespace inference {

--- a/paddle/fluid/inference/analysis/pass_manager.cc
+++ b/paddle/fluid/inference/analysis/pass_manager.cc
@@ -40,17 +40,6 @@ void DfgPassManager::RunAll() {
  }
 }
-void NodePassManager::RunAll() {
-  PADDLE_ENFORCE(argument_);
-  PADDLE_ENFORCE(argument_->main_dfg.get());
-  auto trait = GraphTraits<DataFlowGraph>(*argument_->main_dfg).nodes_in_DFS();
-  for (auto& node : trait) {
-    for (auto& pass : data_) {
-      pass->Run(&node);
-    }
-  }
-}
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/analysis/pass_manager.h
+++ b/paddle/fluid/inference/analysis/pass_manager.h
@@ -33,7 +33,7 @@ limitations under the License. */
 #include <string>
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/inference/analysis/pass.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
 namespace paddle {
 namespace inference {
@@ -43,7 +43,7 @@ namespace analysis {
 * PassManager is the base class for all pass managers, a pass manager has
 * several Pass-es registered, and execute them in the linear order.
 */
-class PassManager : public OrderedRegistry<Pass> {
+class PassManager : public OrderedRegistry<AnalysisPass> {
 public:
  PassManager() = default;
  // Call all the passes' Initialize methods. The desc and data_flow_graph are
@@ -89,18 +89,6 @@ class DfgPassManager : public PassManager {
  virtual ~DfgPassManager() = default;
 };
-/*
- * A pass manager that process a Node each time.
- */
-class NodePassManager : public PassManager {
- public:
-  NodePassManager() = default;
-  void RunAll() override;
-  virtual ~NodePassManager() = default;
-};
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/analysis/pass_manager_tester.cc
+++ b/paddle/fluid/inference/analysis/pass_manager_tester.cc
@@ -34,28 +34,6 @@ class TestDfgPassManager final : public DfgPassManager {
  std::string description() const override { return "test doc"; }
 };
-class TestNodePassManager final : public NodePassManager {
- public:
-  virtual ~TestNodePassManager() = default;
-  std::string repr() const override { return "test-node-pass-manager"; }
-  std::string description() const override { return "test doc"; }
-};
-class TestNodePass final : public NodePass {
- public:
-  virtual ~TestNodePass() = default;
-  bool Initialize(Argument* argument) override { return true; }
-  void Run(Node* node) override {
-    LOG(INFO) << "- Processing node " << node->repr();
-  }
-  std::string repr() const override { return "test-node"; }
-  std::string description() const override { return "some doc"; }
-};
 TEST(PassManager, DFG_pass_manager) {
  TestDfgPassManager manager;
  DFG_GraphvizDrawPass::Config config("./", "dfg.dot");
@@ -71,19 +49,6 @@ TEST(PassManager, DFG_pass_manager) {
  manager.RunAll();
 }
-TEST(PassManager, Node_pass_manager) {
-  Argument argument(FLAGS_inference_model_dir);
-  // Pre-process: initialize the DFG with the ProgramDesc first.
-  FluidToDataFlowGraphPass pass0;
-  pass0.Initialize(&argument);
-  pass0.Run(argument.main_dfg.get());
-  TestNodePassManager manager;
-  manager.Register("test-node-pass", new TestNodePass);
-  ASSERT_TRUE(manager.Initialize(&argument));
-  manager.RunAll();
-}
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc
@@ -68,7 +68,7 @@ class DfgDebuggerPass : public DFG_GraphvizDrawPass {
  }
 };
-Pass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const {
+AnalysisPass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const {
  DFG_GraphvizDrawPass::Config config(FLAGS_IA_graphviz_log_root,
                                      "tensorrt_marked_node");
  return new DfgDebuggerPass(config);

--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h
@@ -20,7 +20,7 @@
 #pragma once
 #include <string>
-#include "paddle/fluid/inference/analysis/pass.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/inference/analysis/subgraph_splitter.h"
 namespace paddle {
@@ -48,7 +48,7 @@ class TensorRTSubgraphNodeMarkPass : public DataFlowGraphPass {
    return "tensorrt sub-graph mark pass";
  }
-  Pass* CreateGraphvizDebugerPass() const override;
+  AnalysisPass* CreateGraphvizDebugerPass() const override;
  bool Finalize() override;
 private:

--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once
 #include <string>
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/inference/analysis/node.h"
-#include "paddle/fluid/inference/analysis/pass.h"
 #include "paddle/fluid/inference/analysis/subgraph_splitter.h"
 namespace paddle {

--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -44,20 +44,7 @@ function(inference_api_test TARGET_NAME)
 endfunction(inference_api_test)
 cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor)
-cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api
+cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis)
-          analysis
-          ir_pass_manager
-          pass
-          fc_fuse_pass
-          fc_lstm_fuse_pass
-          fc_gru_fuse_pass
-          seq_concat_fc_fuse_pass
-          graph_viz_pass
-          infer_clean_graph_pass
-          graph_pattern_detector
-          infer_clean_graph_pass
-          attention_lstm_fuse_pass
-)
 cc_test(test_paddle_inference_api
        SRCS api_tester.cc
@@ -74,7 +61,7 @@ cc_library(paddle_inference_tensorrt_subgraph_engine
 inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec)
 endif()
-if (WITH_ANAKIN AND WITH_GPU) # only needed in CI
+if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
    # compile the libinference_anakin_api.a and anakin.so.
    cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml)
    cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber)
@@ -84,12 +71,24 @@ if (WITH_ANAKIN AND WITH_GPU) # only needed in CI
    anakin_target(inference_anakin_api)
    anakin_target(inference_anakin_api_shared)
    if (WITH_TESTING)
-        cc_test(api_anakin_engine_tester SRCS api_anakin_engine_tester.cc 
+        # TODO(luotao): ANAKIN_MODLE_URL etc will move to demo ci later.
-                ARGS --model=${ANAKIN_SOURCE_DIR}/mobilenet_v2.anakin.bin
+        set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com")
-                DEPS inference_anakin_api_shared dynload_cuda SERIAL)
+        set(ANAKIN_RNN_MODLE_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn.anakin2.model.bin")
+        set(ANAKIN_RNN_DATA_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn_data.txt")
+        execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}")
+        execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_MODLE_URL} -N")
+        execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_DATA_URL} -N")
+        if(WITH_GPU)
+            set(anakin_test_extra_deps dynload_cuda)
+            set(ANAKIN_MODLE_URL "${INFERENCE_URL}/mobilenet_v2.anakin.bin")
+            execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL} -N")
+            cc_test(api_anakin_engine_tester SRCS api_anakin_engine_tester.cc 
+                    ARGS --model=${ANAKIN_SOURCE_DIR}/mobilenet_v2.anakin.bin
+                    DEPS inference_anakin_api_shared ${anakin_test_extra_deps} SERIAL)
+        endif()
        cc_test(api_anakin_engine_rnn_tester SRCS api_anakin_engine_rnn_tester.cc 
                ARGS --model=${ANAKIN_SOURCE_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin 
                     --datapath=${ANAKIN_SOURCE_DIR}/anakin_test%2Fditu_rnn_data.txt
-                DEPS inference_anakin_api_shared dynload_cuda SERIAL)
+                DEPS inference_anakin_api_shared ${anakin_test_extra_deps} SERIAL)
    endif(WITH_TESTING)
 endif()
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -48,7 +48,6 @@ bool AnalysisPredictor::Init(
  } else {
    place_ = paddle::platform::CPUPlace();
  }
-  PADDLE_ENFORCE(!parent_scope);
  if (parent_scope) {
    scope_ = parent_scope;
    sub_scope_ = &(parent_scope->NewScope());

--- a/paddle/fluid/inference/api/api_anakin_engine.cc
+++ b/paddle/fluid/inference/api/api_anakin_engine.cc
@@ -193,7 +193,9 @@ PaddleInferenceAnakinPredictor<Target>::Clone() {
  return std::move(cls);
 }
+#ifdef PADDLE_WITH_CUDA
 template class PaddleInferenceAnakinPredictor<anakin::NV>;
+#endif
 template class PaddleInferenceAnakinPredictor<anakin::X86>;
 // A factory to help create difference predictor.
@@ -202,10 +204,15 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
    AnakinConfig, PaddleEngineKind::kAnakin>(const AnakinConfig &config) {
  VLOG(3) << "Anakin Predictor create.";
  if (config.target_type == AnakinConfig::NVGPU) {
+#ifdef PADDLE_WITH_CUDA
    VLOG(3) << "Anakin Predictor create on [ NVIDIA GPU ].";
    std::unique_ptr<PaddlePredictor> x(
        new PaddleInferenceAnakinPredictor<anakin::NV>(config));
    return x;
+#else
+    LOG(ERROR) << "AnakinConfig::NVGPU could not used in ONLY-CPU environment";
+    return nullptr;
+#endif
  } else if (config.target_type == AnakinConfig::X86) {
    VLOG(3) << "Anakin Predictor create on [ Intel X86 ].";
    std::unique_ptr<PaddlePredictor> x(

--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -14,6 +14,7 @@
 #pragma once
+#include <glog/logging.h>
 #include <sys/time.h>
 #include <algorithm>
 #include <numeric>
@@ -88,5 +89,45 @@ static void TensorAssignData(PaddleTensor *tensor,
  }
 }
+std::string DescribeTensor(const PaddleTensor &tensor) {
+  std::stringstream os;
+  os << "Tensor [" << tensor.name << "]\n";
+  os << " - type: ";
+  switch (tensor.dtype) {
+    case PaddleDType::FLOAT32:
+      os << "float32";
+      break;
+    case PaddleDType::INT64:
+      os << "int64";
+      break;
+    default:
+      os << "unset";
+  }
+  os << '\n';
+  os << " - shape: " << to_string(tensor.shape) << '\n';
+  os << " - lod: ";
+  for (auto &l : tensor.lod) {
+    os << to_string(l) << "; ";
+  }
+  os << "\n";
+  os << " - data: ";
+  int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1,
+                            [](int a, int b) { return a * b; });
+  for (int i = 0; i < dim; i++) {
+    os << static_cast<float *>(tensor.data.data())[i] << " ";
+  }
+  os << '\n';
+  return os.str();
+}
+void PrintTime(int batch_size, int repeat, int num_threads, int tid,
+               double latency) {
+  LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << repeat
+            << ", threads: " << num_threads << ", thread id: " << tid
+            << ", latency: " << latency << "ms";
+}
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/operators/fake_quantize_op.cu
+++ b/paddle/fluid/operators/fake_quantize_op.cu
@@ -119,7 +119,8 @@ struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, T> {
                  const framework::Tensor& last_scale,
                  const framework::Tensor& iter, const int window_size,
                  framework::Tensor* scales_arr, framework::Tensor* out_scale) {
-    auto& gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+    const auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
    T* scale_arr = scales_arr->mutable_data<T>(gpu_place);
    T* out_scale_data = out_scale->mutable_data<T>(gpu_place);

--- a/paddle/fluid/operators/flatten_op.cc
+++ b/paddle/fluid/operators/flatten_op.cc
@@ -157,6 +157,116 @@ class FlattenGradOp : public framework::OperatorBase {
  }
 };
+// FIXME(zcd): flatten2 adds an intermediate output(XShape) based on flatten,
+// the XShape is used to carry the shape and lod of X which will be used in
+// flatten_grad, in this way, the framework can reuse the memory of X
+// immediately the flatten2_op is finished.
+// Considering compatibility issues, we could not fix flatten2_op
+class Flatten2OpInferShape : public FlattenOpInferShape {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    FlattenOpInferShape::operator()(ctx);
+    PADDLE_ENFORCE(ctx->HasOutput("XShape"),
+                   "Output (XShape) of Flatten op should not be null.");
+    const auto &in_dims = ctx->GetInputDim("X");
+    std::vector<int64_t> xshape_dims(in_dims.size() + 1);
+    xshape_dims[0] = 0;
+    for (int i = 0; i < in_dims.size(); ++i) {
+      xshape_dims[i + 1] = in_dims[i];
+    }
+    ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
+    ctx->ShareLoD("X", "XShape");
+  }
+};
+class Flatten2Op : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto &axis = Attr<int>("axis");
+    auto in_dims =
+        scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
+    const auto &out_dims = FlattenOpInferShape::GetOutputShape(axis, in_dims);
+    framework::AttributeMap attrs;
+    attrs["shape"] = out_dims;
+    attrs["inplace"] = false;
+    // Invoke Reshape Op
+    auto reshape_op = framework::OpRegistry::CreateOp(
+        "reshape2", {{"X", {Input("X")}}, {"Shape", {}}},
+        {{"Out", {Output("Out")}}, {"XShape", {Output("XShape")}}}, attrs);
+    reshape_op->Run(scope, place);
+  }
+};
+class Flatten2OpMaker : public FlattenOpMaker {
+ public:
+  void Make() override {
+    FlattenOpMaker::Make();
+    AddOutput("XShape",
+              "XShape is just used to store the shape and lod of X, which will "
+              "be used in FlattenGradOp.")
+        .AsIntermediate();
+  }
+};
+class Flatten2GradOpMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
+    grad_op->SetType("flatten2_grad");
+    grad_op->SetInput("XShape", Output("XShape"));
+    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+class Flatten2GradInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("XShape"),
+                   "Input(XShape) shouldn't be null.");
+    PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) shouldn't be null.");
+    auto xshape_dims = context->GetInputDim("XShape");
+    auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
+    context->SetOutputDim(framework::GradVarName("X"), x_dims);
+    context->ShareLoD("XShape", framework::GradVarName("X"));
+  }
+};
+class Flatten2GradOp : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto dx_name = Output(framework::GradVarName("X"));
+    auto dout_name = Input(framework::GradVarName("Out"));
+    auto xshape_name = Input("XShape");
+    auto xshape_dims =
+        scope.FindVar(xshape_name)->Get<framework::LoDTensor>().dims();
+    auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
+    framework::AttributeMap attrs;
+    attrs["shape"] = framework::vectorize2int(x_dims);
+    attrs["inplace"] = false;
+    auto reshape_op = framework::OpRegistry::CreateOp(
+        "reshape2", {{"X", {dout_name}}, {"Shape", {}}},
+        {{"Out", {dx_name}}, {"XShape", {xshape_name}}}, attrs);
+    reshape_op->Run(scope, place);
+  }
+};
 }  // namespace operators
 }  // namespace paddle
@@ -167,3 +277,8 @@ REGISTER_OPERATOR(flatten, ops::FlattenOp, ops::FlattenOpMaker,
                  ops::FlattenOpInferShape,
                  paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(flatten_grad, ops::FlattenGradOp, ops::FlattenGradInferShape);
+REGISTER_OPERATOR(flatten2, ops::Flatten2Op, ops::Flatten2OpMaker,
+                  ops::Flatten2OpInferShape, ops::Flatten2GradOpMaker);
+REGISTER_OPERATOR(flatten2_grad, ops::Flatten2GradOp,
+                  ops::Flatten2GradInferShape);
--- a/paddle/fluid/operators/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fusion_gru_op.cc
@@ -30,14 +30,7 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
                 "Input(WeightX) of GRU should not be null.");
  PADDLE_ENFORCE(ctx->HasInput("WeightH"),
                 "Input(WeightH) of GRU should not be null.");
  PADDLE_ENFORCE(ctx->HasOutput("XX"), "Output(XX) of GRU should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
-                 "Output(ReorderedH0) of GRU should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
-                 "Output(BatchedInput) of GRU should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("BatchedOut"),
-                 "Output(BatchedOut) of GRU should not be null.");
  PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
                 "Output(Hidden) of GRU should not be null.");
@@ -80,15 +73,20 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
  }
  framework::DDim out_dims({x_dims[0], frame_size});
  ctx->SetOutputDim("Hidden", out_dims);
-  ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
-  ctx->SetOutputDim("BatchedOut", out_dims);
  ctx->ShareLoD("X", "Hidden");
  int xx_width;
  if (ctx->Attrs().Get<bool>("use_seq")) {
    xx_width = wx_dims[1];
  } else {
    xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
+    PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
+                   "Output(ReorderedH0) of GRU should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
+                   "Output(BatchedInput) of GRU should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchedOut"),
+                   "Output(BatchedOut) of GRU should not be null.");
+    ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
+    ctx->SetOutputDim("BatchedOut", out_dims);
  }
  ctx->SetOutputDim("XX", {x_dims[0], xx_width});
  ctx->ShareLoD("X", "XX");

--- a/paddle/fluid/operators/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fusion_lstm_op.cc
@@ -38,16 +38,6 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
                 "Output(Hidden) of LSTM should not be null.");
  PADDLE_ENFORCE(ctx->HasOutput("Cell"),
                 "Output(Cell) of LSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
-                 "Output(BatchedInput) of LSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"),
-                 "Output(BatchedHidden) of LSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("BatchedCell"),
-                 "Output(BatchedCell) of LSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
-                 "Output(ReorderedH0) of LSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("ReorderedC0"),
-                 "Output(ReorderedC0) of LSTM should not be null.");
  auto x_dims = ctx->GetInputDim("X");
  PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
@@ -88,28 +78,36 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
  PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
  PADDLE_ENFORCE_EQ(b_dims[0], 1,
                    "The first dimension of Input(Bias) should be 1.");
+  PADDLE_ENFORCE_EQ(
-  auto use_peepholes = ctx->Attrs().Get<bool>("use_peepholes");
+      b_dims[1], (ctx->Attrs().Get<bool>("use_peepholes") ? 7 : 4) * frame_size,
-  PADDLE_ENFORCE_EQ(b_dims[1], (use_peepholes ? 7 : 4) * frame_size,
+      "The second dimension of Input(Bias) should be "
-                    "The second dimension of Input(Bias) should be "
+      "7 * %d if enable peepholes connection or"
-                    "7 * %d if enable peepholes connection or"
+      "4 * %d if disable peepholes",
-                    "4 * %d if disable peepholes",
+      frame_size, frame_size);
-                    frame_size, frame_size);
  framework::DDim out_dims({x_dims[0], frame_size});
  ctx->SetOutputDim("Hidden", out_dims);
  ctx->SetOutputDim("Cell", out_dims);
-  ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
-  ctx->SetOutputDim("BatchedHidden", out_dims);
-  ctx->SetOutputDim("BatchedCell", out_dims);
  ctx->ShareLoD("X", "Hidden");
  ctx->ShareLoD("X", "Cell");
  int xx_width;
  if (ctx->Attrs().Get<bool>("use_seq")) {
    xx_width = wx_dims[1];
  } else {
    xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
+    PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
+                   "Output(BatchedInput) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"),
+                   "Output(BatchedHidden) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchedCell"),
+                   "Output(BatchedCell) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
+                   "Output(ReorderedH0) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("ReorderedC0"),
+                   "Output(ReorderedC0) of LSTM should not be null.");
+    ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
+    ctx->SetOutputDim("BatchedHidden", out_dims);
+    ctx->SetOutputDim("BatchedCell", out_dims);
  }
  ctx->SetOutputDim("XX", {x_dims[0], xx_width});
  ctx->ShareLoD("X", "XX");
@@ -232,18 +230,18 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
    act_cand = act_functor(act_cand_str);                                      \
  }
-#define INIT_BASE_INPUT_OUTPUT                          \
+#define INIT_BASE_INPUT_OUTPUT                        \
-  auto* x = ctx.Input<LoDTensor>("X");                  \
+  auto* x = ctx.Input<LoDTensor>("X");                \
-  auto* h0 = ctx.Input<Tensor>("H0");                   \
+  auto* h0 = ctx.Input<Tensor>("H0");                 \
-  auto* c0 = ctx.Input<Tensor>("C0");                   \
+  auto* c0 = ctx.Input<Tensor>("C0");                 \
-  auto* wx = ctx.Input<Tensor>("WeightX");              \
+  auto* wx = ctx.Input<Tensor>("WeightX");            \
-  auto* wh = ctx.Input<Tensor>("WeightH");              \
+  auto* wh = ctx.Input<Tensor>("WeightH");            \
-  auto* bias = ctx.Input<Tensor>("Bias");               \
+  auto* bias = ctx.Input<Tensor>("Bias");             \
-  auto* xx = ctx.Output<LoDTensor>("XX");               \
+  auto* xx = ctx.Output<LoDTensor>("XX");             \
-  auto* hidden_out = ctx.Output<LoDTensor>("Hidden");   \
+  auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
-  auto* cell_out = ctx.Output<LoDTensor>("Cell");       \
+  auto* cell_out = ctx.Output<LoDTensor>("Cell");     \
-  bool use_peepholes = ctx.Attr<bool>("use_peepholes"); \
+  bool is_reverse = ctx.Attr<bool>("is_reverse");     \
-  bool is_reverse = ctx.Attr<bool>("is_reverse");
+  bool use_peepholes = ctx.Attr<bool>("use_peepholes");
 #define INIT_BASE_SIZES                  \
  auto x_dims = x->dims();   /* T x M*/  \
@@ -254,172 +252,183 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
  const int D3 = D * 3;                  \
  const int D4 = wh_dims[1];
+#define INIT_BASE_INPUT_DATAS                                        \
+  const T* x_data = x->data<T>();                                    \
+  const T* wx_data = wx->data<T>();                                  \
+  const T* wh_data = wh->data<T>();                                  \
+  /* diagonal weight*/                                               \
+  const T* wc_data = bias->data<T>() + D4;                           \
+  /* for peephole only*/                                             \
+  Tensor checked_cell;                                               \
+  T* checked_cell_data = nullptr;                                    \
+  auto place = ctx.GetPlace();                                       \
+  if (use_peepholes) {                                               \
+    /* w_ic * Ct-1, w_fc * Ct-1  ; w_oc * Ct => ih*/                 \
+    checked_cell_data = checked_cell.mutable_data<T>({2, D}, place); \
+  }
+/// Compute LSTM
+#define GEMM_WH_ADDON(bs, prev, out)                                           \
+  blas.GEMM(CblasNoTrans, CblasNoTrans, bs, D4, D, static_cast<T>(1), prev, D, \
+            wh_data, D4, static_cast<T>(1), out, D4)
+// gates: W_ch, W_ih, W_fh, W_oh
+#define GET_Ct(ct_1, gates, ct)                   \
+  /* C_t = C_t-1 * fgated + cand_gated * igated*/ \
+  act_cand(D, gates, gates);                      \
+  blas.VMUL(D, gates, gates + D, gates + D);      \
+  blas.VMUL(D, ct_1, gates + D2, gates + D2);     \
+  blas.VADD(D, gates + D, gates + D2, ct)
+#define GET_Ht(ct, gates, ht)        \
+  /* H_t = act_cell(C_t) * ogated */ \
+  act_cell(D, ct, gates + D2);       \
+  blas.VMUL(D, gates + D2, gates + D3, ht)
+#define GET_Ct_NOH0C0(gates, ct)     \
+  /* C_t = igated * cgated*/         \
+  act_gate(D, gates + D, gates + D); \
+  act_cand(D, gates, gates);         \
+  blas.VMUL(D, gates, gates + D, ct)
+#define COMPUTE_CtHt_NOH0C0(gates, ct, ht) \
+  GET_Ct_NOH0C0(gates, ct);                \
+  act_gate(D, gates + D3, gates + D3);     \
+  GET_Ht(ct, gates, ht)
+#define COMPUTE_CtHt_PEEPHOLE_NOH0C0(gates, ct, ht) \
+  GET_Ct_NOH0C0(gates, ct);                         \
+  /* get outgated, put W_oc * C_t on igated */      \
+  blas.VMUL(D, wc_data + D2, ct, gates + D);        \
+  blas.VADD(D, gates + D, gates + D3, gates + D3);  \
+  act_gate(D, gates + D3, gates + D3);              \
+  GET_Ht(ct, gates, ht)
+#define COMPUTE_CtHt(gates, ct_1, ct, ht) \
+  act_gate(D3, gates + D, gates + D);     \
+  GET_Ct(ct_1, gates, ct);                \
+  GET_Ht(ct, gates, ht)
+#define COMPUTE_CtHt_PEEPHOLE(gates, ct_1, ct, ht)        \
+  /* get fgated and igated*/                              \
+  blas.VMUL(D, wc_data, ct_1, checked_cell_data);         \
+  blas.VMUL(D, wc_data + D, ct_1, checked_cell_data + D); \
+  blas.VADD(D2, checked_cell_data, gates + D, gates + D); \
+  act_gate(D2, gates + D, gates + D);                     \
+  GET_Ct(ct_1, gates, ct);                                \
+  /* get ogated*/                                         \
+  blas.VMUL(D, wc_data + D2, ct, gates + D);              \
+  blas.VADD(D, gates + D, gates + D3, gates + D3);        \
+  act_gate(D, gates + D3, gates + D3);                    \
+  GET_Ht(ct, gates, ht)
  void SeqCompute(const framework::ExecutionContext& ctx) const {
    using DeviceContext = paddle::platform::CPUDeviceContext;
    INIT_BASE_INPUT_OUTPUT
    INIT_BASE_SIZES
    INIT_VEC_FUNC
+    INIT_BASE_INPUT_DATAS
    auto x_lod = x->lod();
    const int total_T = x_dims[0];
-    const int N = x_lod[0].size() - 1;  // batch size
+    const int N = x_lod[0].size() - 1;
-    const T* x_data = x->data<T>();
    const T* h0_data = h0 ? h0->data<T>() : nullptr;
    const T* c0_data = c0 ? c0->data<T>() : nullptr;
-    const T* bias_data = bias->data<T>();
+    T* xx_data = xx->mutable_data<T>(place);
-    const T* wc_data = bias_data + D4;  // w_ic, w_fc, w_oc
+    T* h_out_data = hidden_out->mutable_data<T>(place);
-    const T* wx_data = wx->data<T>();
+    T* c_out_data = cell_out->mutable_data<T>(place);
-    const T* wh_data = wh->data<T>();
-    T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
-    T* hidden_out_data = hidden_out->mutable_data<T>(ctx.GetPlace());
-    T* cell_out_data = cell_out->mutable_data<T>(ctx.GetPlace());
-    // use local variable
-    framework::DDim check_dims({3, D});
-    Tensor checked_cell;  // w_ic * Ct-1, w_fc * Ct-1, w_oc * Ct
-    auto checked_cell_data =
-        checked_cell.mutable_data<T>(check_dims, ctx.GetPlace());
    auto blas = math::GetBlas<DeviceContext, T>(ctx);
    math::FCCompute<DeviceContext, T>(blas, total_T, D4, M, x_data, wx_data,
                                      xx_data, bias->data<T>());
    int xx_offset = D4;
    int gate_offset = D;
    if (is_reverse) {
      const int offset = (total_T - 1) * D;
      xx_data = xx_data + offset * 4;
-      hidden_out_data = hidden_out_data + offset;
+      h_out_data = h_out_data + offset;
-      cell_out_data = cell_out_data + offset;
+      c_out_data = c_out_data + offset;
      xx_offset = -D4;
      gate_offset = -D;
    }
-    auto move_step = [&]() {
+#define MOVE_ONE_STEP                    \
-      xx_data = xx_data + xx_offset;
+  prev_h_data = h_out_data;              \
-      hidden_out_data = hidden_out_data + gate_offset;
+  prev_c_data = c_out_data;              \
-      cell_out_data = cell_out_data + gate_offset;
+  xx_data = xx_data + xx_offset;         \
-    };
+  h_out_data = h_out_data + gate_offset; \
+  c_out_data = c_out_data + gate_offset
-    for (int i = 0; i < N; ++i) {
-      int bid = is_reverse ? N - 1 - i : i;
+#define PROCESS_H0C0_DEFINES                       \
-      int seq_len = x_lod[0][bid + 1] - x_lod[0][bid];
+  int bid = is_reverse ? N - 1 - i : i;            \
-      const T* prev_c_data = nullptr;
+  int seq_len = x_lod[0][bid + 1] - x_lod[0][bid]; \
-      const T* prev_h_data = nullptr;
+  const T* prev_c_data = nullptr;                  \
+  const T* prev_h_data = nullptr;                  \
-      int tstart = 0;
+  int tstart = 0
-      if (h0_data) {
-        prev_h_data = h0_data + bid * D;
+#define PROCESS_H0C0_PEEPHOLE                                      \
-        prev_c_data = c0_data + bid * D;
+  PROCESS_H0C0_DEFINES;                                            \
-      } else {
+  if (h0_data) {                                                   \
-        // If step == 0 and there is no initialized hidden state, that is to say
+    prev_h_data = h0_data + bid * D;                               \
-        // the H0 is zeros. Then W_h * H_t-1 can be skipped
+    prev_c_data = c0_data + bid * D;                               \
+  } else {                                                         \
-        // ~C_t
+    COMPUTE_CtHt_PEEPHOLE_NOH0C0(xx_data, c_out_data, h_out_data); \
-        act_cand(D, xx_data, xx_data);
+    MOVE_ONE_STEP;                                                 \
-        if (use_peepholes) {
+    tstart = 1;                                                    \
-          // I_t, F_t
+  }
-          act_gate(D2, xx_data + D, xx_data + D);
-        } else {
-          // I_t, F_t, O_t
-          act_gate(D3, xx_data + D, xx_data + D);
-        }
-        // C_t = I_t * ~C_t
-        blas.VMUL(D, xx_data, xx_data + D, cell_out_data);
-        if (use_peepholes) {
-          // + W_oc * C_t for peephole connection
-          blas.VMUL(D, wc_data + D2, cell_out_data, checked_cell_data + D2);
-          blas.VADD(D, xx_data + D3, checked_cell_data + D2, xx_data + D3);
-          // O_t
-          act_gate(D, xx_data + D3, xx_data + D3);
-        }
-        // hidden out= act_state(cellout) * outgate
-        act_cell(D, cell_out_data, xx_data + D2);
-        // H_t = O_t * act_state(C_t)
-        blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data);
-        // prev
-        prev_h_data = hidden_out_data;
-        prev_c_data = cell_out_data;
-        tstart = 1;
-        move_step();
-      }
-      for (int step = tstart; step < seq_len; ++step) {
-        // + W_h * H_t-1
-        blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D4, D, static_cast<T>(1),
-                  prev_h_data, D, wh_data, D4, static_cast<T>(1), xx_data, D4);
-        // ~C_t
+#define PROCESS_H0C0                                      \
-        act_cand(D, xx_data, xx_data);
+  PROCESS_H0C0_DEFINES;                                   \
+  if (h0_data) {                                          \
+    prev_h_data = h0_data + bid * D;                      \
+    prev_c_data = c0_data + bid * D;                      \
+  } else {                                                \
+    COMPUTE_CtHt_NOH0C0(xx_data, c_out_data, h_out_data); \
+    MOVE_ONE_STEP;                                        \
+    tstart = 1;                                           \
+  }
-        if (use_peepholes) {
+    if (use_peepholes) {
-          // + W_ic|W_fc * C_t-1 for peephole connection
+      for (int i = 0; i < N; ++i) {
-          blas.VMUL(D, wc_data, prev_c_data, checked_cell_data);
+        PROCESS_H0C0_PEEPHOLE
-          blas.VMUL(D, wc_data + D, prev_c_data, checked_cell_data + D);
+        for (int step = tstart; step < seq_len; ++step) {
-          blas.VADD(D2, xx_data + D, checked_cell_data, xx_data + D);
+          GEMM_WH_ADDON(1, prev_h_data, xx_data);
-          // I_t, F_t
+          COMPUTE_CtHt_PEEPHOLE(xx_data, prev_c_data, c_out_data, h_out_data);
-          act_gate(D2, xx_data + D, xx_data + D);
+          MOVE_ONE_STEP;
-        } else {
-          // I_t, F_t, O_t
-          act_gate(D3, xx_data + D, xx_data + D);
        }
+      }
-        // F_t * C_t-1
+    } else {
-        blas.VMUL(D, xx_data + D2, prev_c_data, xx_data + D2);
+      for (int i = 0; i < N; ++i) {
-        // I_t * ~C_t
+        PROCESS_H0C0
-        blas.VMUL(D, xx_data, xx_data + D, xx_data + D);
+        for (int step = tstart; step < seq_len; ++step) {
-        // C_t = F_t * C_t-1 + I_t * ~C_t
+          GEMM_WH_ADDON(1, prev_h_data, xx_data);
-        blas.VADD(D, xx_data + D, xx_data + D2, cell_out_data);
+          COMPUTE_CtHt(xx_data, prev_c_data, c_out_data, h_out_data);
+          MOVE_ONE_STEP;
-        if (use_peepholes) {
-          // + W_oc * C_t for peephole connection
-          blas.VMUL(D, wc_data + D2, cell_out_data, checked_cell_data + D2);
-          blas.VADD(D, xx_data + D3, checked_cell_data + D2, xx_data + D3);
-          // O_t
-          act_gate(D, xx_data + D3, xx_data + D3);
        }
+      }
-        // hidden out= act_state(cellout) * outgate
+    }
-        act_cell(D, cell_out_data, xx_data + D2);
+#undef PROCESS_H0C0_DEFINES
-        // H_t = O_t * act_state(C_t)
+#undef PROCESS_H0C0_PEEPHOLE
-        blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data);
+#undef PROCESS_H0C0
+#undef MOVE_ONE_STEP
-        // prev
-        prev_h_data = hidden_out_data;
-        prev_c_data = cell_out_data;
-        move_step();
-      }  // for each step in batch
-    }    // for each batch
  }
  void BatchCompute(const framework::ExecutionContext& ctx) const {
    using DeviceContext = platform::CPUDeviceContext;
    INIT_BASE_INPUT_OUTPUT
-    if (x->lod()[0].size() == 2) {  // batch size == 1
+    if (x->lod()[0].size() == 2) {
      SeqCompute(ctx);
      return;
    }
    INIT_BASE_SIZES
    INIT_VEC_FUNC
+    INIT_BASE_INPUT_DATAS
    auto* reordered_h0 = ctx.Output<Tensor>("ReorderedH0");
    auto* reordered_c0 = ctx.Output<Tensor>("ReorderedC0");
    auto* batched_input = ctx.Output<LoDTensor>("BatchedInput");
    auto* batched_c_out = ctx.Output<LoDTensor>("BatchedCell");
    auto* batched_h_out = ctx.Output<LoDTensor>("BatchedHidden");
-    const T* x_data = x->data<T>();
-    const T* wx_data = wx->data<T>();
-    const T* wh_data = wh->data<T>();
-    const T* bias_data = bias->data<T>();
-    const T* wc_data = bias_data + D4;  // w_ic, w_fc, w_oc
-    auto place = ctx.GetPlace();
    T* xx_data = xx->mutable_data<T>(place);
    T* batched_input_data = batched_input->mutable_data<T>(place);
    T* batched_c_out_data = batched_c_out->mutable_data<T>(place);
@@ -427,12 +436,6 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
    hidden_out->mutable_data<T>(place);
    cell_out->mutable_data<T>(place);
-    // use local variable
-    framework::DDim check_dims({3, D});
-    Tensor checked_cell;  // w_ic * Ct-1, w_fc * Ct-1, w_oc * Ct
-    auto checked_cell_data =
-        checked_cell.mutable_data<T>(check_dims, ctx.GetPlace());
    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
@@ -454,27 +457,17 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
    reordered_h0->Resize({max_bs, D});
    reordered_c0->Resize({max_bs, D});
-    T* prev_batch_h_data = nullptr;
-    T* prev_batch_c_data = nullptr;
-    T* cur_batch_in_data = batched_input_data;
-    T* cur_batch_h_out_data = batched_h_out_data;
-    T* cur_batch_c_out_data = batched_c_out_data;
-    auto move_step = [&](int bs) {
-      cur_batch_in_data += bs * D4;
-      cur_batch_c_out_data += bs * D;
-      cur_batch_h_out_data += bs * D;
-    };
    int tstart = 0;
+    T* prev_h_data = nullptr;
+    T* prev_c_data = nullptr;
    if (h0) {
      // reorder h0, c0
      T* reordered_h0_data = reordered_h0->mutable_data<T>(place);
      T* reordered_c0_data = reordered_c0->mutable_data<T>(place);
      const T* h0_data = h0->data<T>();
      const T* c0_data = c0->data<T>();
-      prev_batch_h_data = reordered_h0_data;
+      prev_h_data = reordered_h0_data;
-      prev_batch_c_data = reordered_c0_data;
+      prev_c_data = reordered_c0_data;
      size_t sz = sizeof(T) * D;
      for (int i = 0; i < max_bs; ++i) {
        std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz);
@@ -483,123 +476,80 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
        reordered_c0_data += D;
      }
    } else {
-      // Compute with no H0/C0
+      // compute without h0, c0
-      T* cur_in_data = cur_batch_in_data;
+      T* cur_in_data = batched_input_data;
-      T* cur_c_out_data = cur_batch_c_out_data;
+      T* cur_h_out_data = batched_h_out_data;
-      T* cur_h_out_data = cur_batch_h_out_data;
+      T* cur_c_out_data = batched_c_out_data;
+      for (int i = 0; i < max_bs; ++i) {
-      // If step == 0 and there is no initialized hidden state, that is to say
+        GET_Ct_NOH0C0(cur_in_data, cur_c_out_data);
-      // the H0 is zeros. Then W_h * H_t-1 can be skiped
-      for (int i = 0; i < max_bs; ++i) {  // iterate each data in 1st batch
-        // ~C_t
-        act_cand(D, cur_in_data, cur_in_data);
-        if (use_peepholes) {
-          // I_t, F_t
-          act_gate(D2, cur_in_data + D, cur_in_data + D);
-        } else {
-          // I_t, F_t, O_t
-          act_gate(D3, cur_in_data + D, cur_in_data + D);
-        }
-        // C_t = I_t * ~C_t
-        blas.VMUL(D, cur_in_data, cur_in_data + D, cur_c_out_data);
        if (use_peepholes) {
-          // + W_oc * C_t for peephole connection
+          blas.VMUL(D, wc_data + D2, cur_c_out_data, cur_in_data + D);
-          blas.VMUL(D, wc_data + D2, cur_c_out_data, checked_cell_data + D2);
+          blas.VADD(D, cur_in_data + D, cur_in_data + D3, cur_in_data + D3);
-          blas.VADD(D, cur_in_data + D3, checked_cell_data + D2,
-                    cur_in_data + D3);
-          // O_t
-          act_gate(D, cur_in_data + D3, cur_in_data + D3);
        }
+        act_gate(D, cur_in_data + D3, cur_in_data + D3);
-        // hidden out= act_state(cellout) * outgate
+        GET_Ht(cur_c_out_data, cur_in_data, cur_h_out_data);
-        act_cell(D, cur_c_out_data, cur_in_data + D2);
-        // H_t = O_t * act_state(C_t)
-        blas.VMUL(D, cur_in_data + D2, cur_in_data + D3, cur_h_out_data);
-        // move to next data in the same batch
        cur_in_data += D4;
        cur_c_out_data += D;
        cur_h_out_data += D;
      }
-      // move to data for next timestep
-      prev_batch_h_data = cur_batch_h_out_data;
-      prev_batch_c_data = cur_batch_c_out_data;
-      move_step(max_bs);
      tstart = 1;
+      prev_h_data = batched_h_out_data;
+      prev_c_data = batched_c_out_data;
    }
    const auto& batch_starts = batched_lod[0];
    const int max_seq_len = batch_starts.size() - 1;
-    for (int step = tstart; step < max_seq_len; ++step) {
+    const int offset = tstart * max_bs * D;
-      const int cur_bs = batch_starts[step + 1] - batch_starts[step];
+    batched_input_data = batched_input_data + offset * 4;
-      // + W_h * H_t-1
+    batched_h_out_data = batched_h_out_data + offset;
-      blas.GEMM(CblasNoTrans, CblasNoTrans, cur_bs, D4, D, static_cast<T>(1),
+    batched_c_out_data = batched_c_out_data + offset;
-                prev_batch_h_data, D, wh_data, D4, static_cast<T>(1),
-                cur_batch_in_data, D4);
+#define DEFINE_CUR                        \
+  T* cur_in_data = batched_input_data;    \
-      T* cur_in_data = cur_batch_in_data;
+  T* cur_prev_c_data = prev_c_data;       \
-      T* cur_c_out_data = cur_batch_c_out_data;
+  T* cur_c_out_data = batched_c_out_data; \
-      T* cur_h_out_data = cur_batch_h_out_data;
+  T* cur_h_out_data = batched_h_out_data
-      T* prev_c_data = prev_batch_c_data;  // NULL if no C0 in step0
-      T* prev_h_data = prev_batch_h_data;  // NULL if no H0 in step0
+#define MOVE_ONE_BATCH  \
-      auto next_data_in_batch = [&]() {
+  cur_in_data += D4;    \
-        cur_in_data += D4;
+  cur_prev_c_data += D; \
-        cur_c_out_data += D;
+  cur_c_out_data += D;  \
-        cur_h_out_data += D;
+  cur_h_out_data += D
-        prev_c_data = prev_c_data ? prev_c_data + D : nullptr;
-        prev_h_data = prev_h_data ? prev_h_data + D : nullptr;
+#define MOVE_ONE_STEP                  \
-      };
+  prev_c_data = batched_c_out_data;    \
+  prev_h_data = batched_h_out_data;    \
-      for (int i = 0; i < cur_bs; ++i) {  // iterate each data in same batch
+  batched_c_out_data = cur_c_out_data; \
-        // ~C_t
+  batched_h_out_data = cur_h_out_data; \
-        act_cand(D, cur_in_data, cur_in_data);
+  batched_input_data = cur_in_data
-        if (use_peepholes) {
+    if (use_peepholes) {
-          // + W_ic|W_fc * C_t-1 for peephole connection
+      for (int step = tstart; step < max_seq_len; ++step) {
-          blas.VMUL(D, wc_data, prev_c_data, checked_cell_data);
+        const int cur_bs = batch_starts[step + 1] - batch_starts[step];
-          blas.VMUL(D, wc_data + D, prev_c_data, checked_cell_data + D);
+        GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data);
-          blas.VADD(D2, cur_in_data + D, checked_cell_data, cur_in_data + D);
+        DEFINE_CUR;
-          // I_t, F_t
+        for (int i = 0; i < cur_bs; ++i) {
-          act_gate(D2, cur_in_data + D, cur_in_data + D);
+          COMPUTE_CtHt_PEEPHOLE(cur_in_data, cur_prev_c_data, cur_c_out_data,
-        } else {
+                                cur_h_out_data);
-          // I_t, F_t, O_t
+          MOVE_ONE_BATCH;
-          act_gate(D3, cur_in_data + D, cur_in_data + D);
        }
+        MOVE_ONE_STEP;
-        // F_t * C_t-1
+      }
-        blas.VMUL(D, cur_in_data + D2, prev_c_data, cur_in_data + D2);
+    } else {
-        // I_t * ~C_t
+      for (int step = tstart; step < max_seq_len; ++step) {
-        blas.VMUL(D, cur_in_data, cur_in_data + D, cur_in_data + D);
+        const int cur_bs = batch_starts[step + 1] - batch_starts[step];
-        // C_t = F_t * C_t-1 + I_t * ~C_t
+        GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data);
-        blas.VADD(D, cur_in_data + D, cur_in_data + D2, cur_c_out_data);
+        DEFINE_CUR;
+        for (int i = 0; i < cur_bs; ++i) {
-        if (use_peepholes) {
+          COMPUTE_CtHt(cur_in_data, cur_prev_c_data, cur_c_out_data,
-          // + W_oc * C_t for peephole connection
+                       cur_h_out_data);
-          blas.VMUL(D, wc_data + D2, cur_c_out_data, checked_cell_data + D2);
+          MOVE_ONE_BATCH;
-          blas.VADD(D, cur_in_data + D3, checked_cell_data + D2,
-                    cur_in_data + D3);
-          // O_t
-          act_gate(D, cur_in_data + D3, cur_in_data + D3);
        }
+        MOVE_ONE_STEP;
-        // hidden out= act_state(cellout) * outgate
-        act_cell(D, cur_c_out_data, cur_in_data + D2);
-        // H_t = O_t * act_state(C_t)
-        blas.VMUL(D, cur_in_data + D2, cur_in_data + D3, cur_h_out_data);
-        // move to next data in same batch
-        next_data_in_batch();
      }
-      // move to data for next timestep
-      prev_batch_h_data = cur_batch_h_out_data;
-      prev_batch_c_data = cur_batch_c_out_data;
-      move_step(cur_bs);
    }
+#undef MOVE_ONE_STEP
+#undef MOVE_ONE_BATCH
+#undef DEFINE_CUR
    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
    batched_h_out->set_lod(batched_lod);
@@ -615,6 +565,16 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
      BatchCompute(ctx);
    }
  }
+#undef COMPUTE_CtHt_PEEPHOLE
+#undef COMPUTE_CtHt
+#undef GET_Ct_NOH0C0
+#undef COMPUTE_CtHt_NOH0C0
+#undef COMPUTE_CtHt_PEEPHOLE_NOH0C0
+#undef GET_Ht
+#undef GET_Ct
+#undef GEMM_WH_ADDON
+#undef INIT_BASE_INPUT_DATAS
 #undef INIT_BASE_SIZES
 #undef INIT_BASE_INPUT_OUTPUT
 #undef INIT_VEC_FUNC

--- a/paddle/fluid/operators/layer_norm_op.cu
+++ b/paddle/fluid/operators/layer_norm_op.cu
@@ -67,27 +67,27 @@ template <typename T, int BlockDim>
 __global__ void LayerNormForward(const T *x, const T *scale, const T *bias,
                                 T *y, T *mean, T *var, float epsilon,
                                 int feature_size) {
-  using BlockReduce = cub::BlockReduce<PairForLayerNorm<T>, BlockDim>;
+  using BlockReduce = cub::BlockReduce<PairForLayerNorm<double>, BlockDim>;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  int beg_idx = blockIdx.x * feature_size + threadIdx.x;
  int end_idx = (blockIdx.x + 1) * feature_size;
  // Step 1: Reduce to calculate mean and var
-  T mean_val = static_cast<T>(0);
+  double mean_val = 0;
-  T var_val = static_cast<T>(0);
+  double var_val = 0;
  for (int i = beg_idx; i < end_idx; i += BlockDim) {
    T tmp = x[i];
    mean_val += tmp;
    var_val += (tmp * tmp);
  }
  auto pair = BlockReduce(temp_storage)
-                  .Reduce(PairForLayerNorm<T>(mean_val, var_val),
+                  .Reduce(PairForLayerNorm<double>(mean_val, var_val),
-                          PairForLayerNormAddFunctor<T>());
+                          PairForLayerNormAddFunctor<double>());
  if (threadIdx.x == 0) {
    auto tmp = pair.first_ / feature_size;
-    mean[blockIdx.x] = tmp;
+    mean[blockIdx.x] = static_cast<T>(tmp);
-    var[blockIdx.x] = pair.second_ / feature_size - tmp * tmp;
+    var[blockIdx.x] = static_cast<T>(pair.second_ / feature_size - tmp * tmp);
  }
  __syncthreads();
  mean_val = mean[blockIdx.x];

--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -246,6 +246,88 @@ class ReshapeGradKernel {
  }
 };
+// FIXME(zcd): reshape2 adds an intermediate output(XShape) based on reshape,
+// the XShape is used to carry the shape and lod of X which will be used in
+// reshape_grad, in this way, the framework can reuse the memory of X
+// immediately the reshape_op is finished.
+// Considering compatibility issues, we could not fix reshape_op
+class Reshape2Op : public ReshapeOp {
+ public:
+  Reshape2Op(const std::string &type, const framework::VariableNameMap &inputs,
+             const framework::VariableNameMap &outputs,
+             const framework::AttributeMap &attrs)
+      : ReshapeOp(type, inputs, outputs, attrs) {}
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    ReshapeOp::InferShape(ctx);
+    PADDLE_ENFORCE(ctx->HasOutput("XShape"),
+                   "Output(XShape) of ReshapeOp should not be null.");
+    const auto &x_dims = ctx->GetInputDim("X");
+    std::vector<int64_t> xshape_dims(x_dims.size() + 1);
+    xshape_dims[0] = 0;
+    for (int i = 0; i < x_dims.size(); ++i) {
+      xshape_dims[i + 1] = x_dims[i];
+    }
+    ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
+    ctx->ShareLoD("X", /*->*/ "XShape");
+  }
+};
+class Reshape2OpMaker : public ReshapeOpMaker {
+ public:
+  void Make() override {
+    ReshapeOpMaker::Make();
+    AddOutput("XShape",
+              "XShape is just used to store the shape and lod of X, which will "
+              "be used in FlattenGradOp.")
+        .AsIntermediate();
+  }
+};
+class Reshape2GradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
+    grad_op->SetType("reshape2_grad");
+    grad_op->SetInput("XShape", Output("XShape"));
+    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+class Reshape2GradOp : public framework::OperatorWithKernel {
+ public:
+  Reshape2GradOp(const std::string &type,
+                 const framework::VariableNameMap &inputs,
+                 const framework::VariableNameMap &outputs,
+                 const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("XShape"), "Input(XShape) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) shouldn't be null.");
+    auto xshape_dims = ctx->GetInputDim("XShape");
+    auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
+    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+    ctx->ShareLoD("XShape", framework::GradVarName("X"));
+  }
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(
+            ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))
+                ->type()),
+        ctx.device_context());
+  }
+};
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
@@ -261,6 +343,17 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
                               ops::ReshapeGradKernel, int64_t,
                               ops::ReshapeGradKernel);
+REGISTER_OPERATOR(reshape2, ops::Reshape2Op, ops::Reshape2OpMaker,
+                  ops::Reshape2GradMaker);
+REGISTER_OPERATOR(reshape2_grad, ops::Reshape2GradOp);
+REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
+                               ops::ReshapeKernel, int, ops::ReshapeKernel,
+                               int64_t, ops::ReshapeKernel);
+REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
+                               double, ops::ReshapeGradKernel, int,
+                               ops::ReshapeGradKernel, int64_t,
+                               ops::ReshapeGradKernel);
 #ifdef PADDLE_WITH_CUDA
 REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double,
                                ops::ReshapeKernel, int, ops::ReshapeKernel,
@@ -269,4 +362,11 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
                                double, ops::ReshapeGradKernel, int,
                                ops::ReshapeGradKernel, int64_t,
                                ops::ReshapeGradKernel);
+REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
+                                ops::ReshapeKernel, int, ops::ReshapeKernel,
+                                int64_t, ops::ReshapeKernel);
+REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
+                                double, ops::ReshapeGradKernel, int,
+                                ops::ReshapeGradKernel, int64_t,
+                                ops::ReshapeGradKernel);
 #endif
--- a/paddle/fluid/operators/squeeze_op.cc
+++ b/paddle/fluid/operators/squeeze_op.cc
@@ -126,15 +126,15 @@ class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
        .SetDefault({});
    AddComment(R"DOC(
        Squeeze Operator.
-        Remove single-dimensional entries from the shape of a tensor. 
+        Remove single-dimensional entries from the shape of a tensor.
-        Takes a parameter axes with a list of axes to squeeze. 
+        Takes a parameter axes with a list of axes to squeeze.
-        If axes is not provided, all the single dimensions will be removed from the shape. 
+        If axes is not provided, all the single dimensions will be removed from the shape.
        If an axis is selected with shape entry not equal to one, an error is raised.
        Examples:
        Case 1:
-          Given 
+          Given
            X.shape = (1, 3, 1, 5)
          and
            axes = [0]
@@ -144,7 +144,7 @@ class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
        Case 2:
          Given
            X.shape = (1, 3, 1, 5)
-          and 
+          and
            axes = []
          we get:
            Out.shape = (3, 5)
@@ -181,6 +181,113 @@ class SqueezeGradOp : public framework::OperatorBase {
  }
 };
+// FIXME(zcd): squeeze2 adds an intermediate output(XShape) based on squeeze,
+// the XShape is used to carry the shape and lod of X which will be used in
+// squeeze_grad, in this way, the framework can reuse the memory of X
+// immediately the squeeze2_op is finished.
+// Considering compatibility issues, we could not fix squeeze2_op
+class Squeeze2OpMaker : public SqueezeOpMaker {
+ public:
+  void Make() override {
+    SqueezeOpMaker::Make();
+    AddOutput("XShape",
+              "XShape is just used to store the shape and lod of X, which will "
+              "be used in SqueezeGradOp.")
+        .AsIntermediate();
+  }
+};
+class Squeeze2OpInferShape : public SqueezeOpInferShape {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    SqueezeOpInferShape::operator()(ctx);
+    PADDLE_ENFORCE(ctx->HasOutput("XShape"),
+                   "Output(XShape) of Squeeze operator should not be null.");
+    const auto &x_dims = ctx->GetInputDim("X");
+    std::vector<int64_t> xshape_dims(x_dims.size() + 1);
+    xshape_dims[0] = 0;
+    for (int i = 0; i < x_dims.size(); ++i) {
+      xshape_dims[i + 1] = x_dims[i];
+    }
+    ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
+    ctx->ShareLoD("X", /*->*/ "XShape");
+  }
+};
+class Squeeze2Op : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto &axes = Attr<std::vector<int>>("axes");
+    auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
+    auto out_dims = Squeeze2OpInferShape::GetOutputShape(axes, x_dims);
+    framework::AttributeMap attrs;
+    attrs["shape"] = framework::vectorize2int(out_dims);
+    // Invoke Reshape Op
+    auto reshape_op = framework::OpRegistry::CreateOp(
+        "reshape2", {{"X", {Input("X")}}, {"Shape", {}}},
+        {{"Out", {Output("Out")}}, {"XShape", {Output("XShape")}}}, attrs);
+    reshape_op->Run(scope, place);
+  }
+};
+class Squeeze2GradOpMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
+    grad_op->SetType("squeeze2_grad");
+    grad_op->SetInput("XShape", Output("XShape"));
+    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+class Squeeze2GradInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("XShape"),
+                   "Input(XShape) shouldn't be null.");
+    PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) shouldn't be null.");
+    auto xshape_dims = context->GetInputDim("XShape");
+    auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
+    context->SetOutputDim(framework::GradVarName("X"), x_dims);
+    context->ShareLoD("XShape", framework::GradVarName("X"));
+  }
+};
+class Squeeze2GradOp : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto dx_name = Output(framework::GradVarName("X"));
+    auto dout_name = Input(framework::GradVarName("Out"));
+    auto xshape_name = Input("XShape");
+    auto xshape_dims =
+        scope.FindVar(xshape_name)->Get<framework::LoDTensor>().dims();
+    auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
+    framework::AttributeMap attrs;
+    attrs["shape"] = framework::vectorize2int(x_dims);
+    auto reshape_op = framework::OpRegistry::CreateOp(
+        "reshape2", {{"X", {dout_name}}, {"Shape", {}}},
+        {{"Out", {dx_name}}, {"XShape", {xshape_name}}}, attrs);
+    reshape_op->Run(scope, place);
+  }
+};
 }  // namespace operators
 }  // namespace paddle
@@ -192,3 +299,8 @@ REGISTER_OPERATOR(squeeze, ops::SqueezeOp, ops::SqueezeOpMaker,
                  ops::SqueezeOpInferShape,
                  paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(squeeze_grad, ops::SqueezeGradOp, ops::SqueezeGradInferShape);
+REGISTER_OPERATOR(squeeze2, ops::Squeeze2Op, ops::Squeeze2OpMaker,
+                  ops::Squeeze2OpInferShape, ops::Squeeze2GradOpMaker);
+REGISTER_OPERATOR(squeeze2_grad, ops::Squeeze2GradOp,
+                  ops::Squeeze2GradInferShape);
--- a/paddle/fluid/operators/transpose_op.cc
+++ b/paddle/fluid/operators/transpose_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/transpose_op.h"
+#include <string>
 #include <vector>
 namespace paddle {
@@ -24,7 +25,7 @@ class TransposeOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
    auto x_dims = ctx->GetInputDim("X");
@@ -90,7 +91,7 @@ The behavior of this operator is similar to how `numpy.transpose` works.
         2 &5
    \end{pmatrix}$$
- Given a input tensor with shape $(N, C, H, W)$ and the `axes` is 
+- Given a input tensor with shape $(N, C, H, W)$ and the `axes` is
 $[0, 2, 3, 1]$, then shape of the output tensor will be: $(N, H, W, C)$.
 )DOC");
@@ -101,7 +102,7 @@ class TransposeOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Input(Out@GRAD) should not be null");
@@ -113,6 +114,93 @@ class TransposeOpGrad : public framework::OperatorWithKernel {
  }
 };
+// FIXME(zcd): transpose2 adds an intermediate output(XShape) based on
+// transpose, the XShape is used to carry the shape and lod of X which
+// will be used in transpose_grad, in this way, the framework can reuse
+// the memory of X immediately the transpose2_op is finished.
+// Considering compatibility issues, we could not fix transpose2_op
+class Transpose2Op : public TransposeOp {
+ public:
+  Transpose2Op(const std::string &type,
+               const framework::VariableNameMap &inputs,
+               const framework::VariableNameMap &outputs,
+               const framework::AttributeMap &attrs)
+      : TransposeOp(type, inputs, outputs, attrs) {}
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    TransposeOp::InferShape(ctx);
+    PADDLE_ENFORCE(ctx->HasOutput("XShape"),
+                   "Output(XShape) should not be null");
+    const auto &in_dims = ctx->GetInputDim("X");
+    std::vector<int64_t> x_shape_dim(in_dims.size() + 1);
+    x_shape_dim[0] = 0;
+    for (int i = 0; i < in_dims.size(); ++i) {
+      x_shape_dim[i + 1] = in_dims[i];
+    }
+    ctx->SetOutputDim("XShape", framework::make_ddim(x_shape_dim));
+    ctx->ShareLoD("X", /*->*/ "XShape");
+  }
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+class Transpose2OpMaker : public TransposeOpMaker {
+ public:
+  void Make() override {
+    TransposeOpMaker::Make();
+    AddOutput("XShape", "(Tensor)The output tensor.").AsIntermediate();
+  }
+};
+class Transpose2GradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
+    grad_op->SetType("transpose2_grad");
+    grad_op->SetInput("XShape", Output("XShape"));
+    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+class Transpose2OpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("XShape"), "Input(XShape) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      auto xshape_dim = ctx->GetInputDim("XShape");
+      auto x_shape_dim =
+          framework::slice_ddim(xshape_dim, 1, xshape_dim.size());
+      ctx->SetOutputDim(framework::GradVarName("X"), x_shape_dim);
+      ctx->ShareLoD("XShape", framework::GradVarName("X"));
+    }
+  }
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(
+            ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))
+                ->type()),
+        ctx.device_context());
+  }
+};
 }  // namespace operators
 }  // namespace paddle
@@ -120,8 +208,20 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(transpose, ops::TransposeOp, ops::TransposeOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad);
 REGISTER_OP_CPU_KERNEL(
    transpose, ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
    transpose_grad,
    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OPERATOR(transpose2, ops::Transpose2Op, ops::Transpose2OpMaker,
+                  ops::Transpose2GradMaker);
+REGISTER_OPERATOR(transpose2_grad, ops::Transpose2OpGrad);
+REGISTER_OP_CPU_KERNEL(
+    transpose2,
+    ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    transpose2_grad,
+    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>);
--- a/paddle/fluid/operators/transpose_op.cu.cc
+++ b/paddle/fluid/operators/transpose_op.cu.cc
@@ -21,3 +21,10 @@ REGISTER_OP_CUDA_KERNEL(
 REGISTER_OP_CUDA_KERNEL(
    transpose_grad,
    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    transpose2,
+    ops::TransposeKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    transpose2_grad,
+    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>);
--- a/paddle/fluid/operators/unsqueeze_op.cc
+++ b/paddle/fluid/operators/unsqueeze_op.cc
@@ -127,13 +127,13 @@ class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
        });
    AddComment(R"DOC(
    Unsqueeze Operator.
-    Insert single-dimensional entries to the shape of a tensor. 
-    Takes one required argument axes, a list of dimensions that will be inserted. 
-    Dimension indices in axes are as seen in the output tensor. 
-    For example: 
+    Insert single-dimensional entries to the shape of a tensor.
-      Given a tensor such that tensor with shape [3, 4, 5], 
+    Takes one required argument axes, a list of dimensions that will be inserted.
+    Dimension indices in axes are as seen in the output tensor.
+    For example:
+      Given a tensor such that tensor with shape [3, 4, 5],
      then Unsqueeze(tensor, axes=[0, 4]) has shape [1, 3, 4, 5, 1]
    )DOC");
  }
@@ -168,6 +168,112 @@ class UnsqueezeGradOp : public framework::OperatorBase {
  }
 };
+// FIXME(zcd): unsqueeze2 adds an intermediate output(XShape) based on
+// unsqueeze, the XShape is used to carry the shape and lod of X which
+// will be used in unsqueeze_grad, in this way, the framework can reuse
+// the memory of X immediately the unsqueeze2_op is finished.
+// Considering compatibility issues, we could not fix unsqueeze2_op
+class Unsqueeze2OpInferShape : public UnsqueezeOpInferShape {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    UnsqueezeOpInferShape::operator()(ctx);
+    PADDLE_ENFORCE(ctx->HasOutput("XShape"),
+                   "Output(XShape) of Unsqueeze operator should not be null.");
+    const auto &x_dims = ctx->GetInputDim("X");
+    std::vector<int64_t> xshape_dims(x_dims.size() + 1);
+    xshape_dims[0] = 0;
+    for (int i = 0; i < x_dims.size(); ++i) {
+      xshape_dims[i + 1] = x_dims[i];
+    }
+    ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
+    ctx->ShareLoD("X", /*->*/ "XShape");
+  }
+};
+class Unsqueeze2OpMaker : public UnsqueezeOpMaker {
+ public:
+  void Make() override {
+    UnsqueezeOpMaker::Make();
+    AddOutput("XShape",
+              "XShape is just used to store the shape and lod of X, which will "
+              "be used in UnsqueezeGradOp.")
+        .AsIntermediate();
+  }
+};
+class Unsqueeze2Op : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto &axes = Attr<std::vector<int>>("axes");
+    auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
+    auto out_dims = Unsqueeze2OpInferShape::GetOutputShape(axes, x_dims);
+    framework::AttributeMap attrs;
+    attrs["shape"] = framework::vectorize2int(out_dims);
+    // Invoke Reshape op.
+    auto reshape_op = framework::OpRegistry::CreateOp(
+        "reshape2", {{"X", {Input("X")}}, {"Shape", {}}},
+        {{"Out", {Output("Out")}}, {"XShape", {Output("XShape")}}}, attrs);
+    reshape_op->Run(scope, place);
+  }
+};
+class Unsqueeze2GradOpMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
+    grad_op->SetType("unsqueeze2_grad");
+    grad_op->SetInput("XShape", Output("XShape"));
+    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+class Unsqueeze2GradInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("XShape"),
+                   "Input(XShape) shouldn't be null.");
+    PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) shouldn't be null.");
+    auto xshape_dims = context->GetInputDim("XShape");
+    auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
+    context->SetOutputDim(framework::GradVarName("X"), x_dims);
+    context->ShareLoD("XShape", framework::GradVarName("X"));
+  }
+};
+class Unsqueeze2GradOp : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto dx_name = Output(framework::GradVarName("X"));
+    auto dout_name = Input(framework::GradVarName("Out"));
+    auto xshape_name = Input("XShape");
+    auto xshape_dims =
+        scope.FindVar(xshape_name)->Get<framework::LoDTensor>().dims();
+    auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
+    framework::AttributeMap attrs;
+    attrs["shape"] = framework::vectorize2int(x_dims);
+    auto reshape_op = framework::OpRegistry::CreateOp(
+        "reshape2", {{"X", {dout_name}}, {"Shape", {}}},
+        {{"Out", {dx_name}}, {"XShape", {xshape_name}}}, attrs);
+    reshape_op->Run(scope, place);
+  }
+};
 }  // namespace operators
 }  // namespace paddle
@@ -180,3 +286,8 @@ REGISTER_OPERATOR(unsqueeze, ops::UnsqueezeOp, ops::UnsqueezeOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp,
                  ops::UnsqueezeGradInferShape);
+REGISTER_OPERATOR(unsqueeze2, ops::Unsqueeze2Op, ops::Unsqueeze2OpMaker,
+                  ops::Unsqueeze2OpInferShape, ops::Unsqueeze2GradOpMaker);
+REGISTER_OPERATOR(unsqueeze2_grad, ops::Unsqueeze2GradOp,
+                  ops::Unsqueeze2GradInferShape);
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -121,6 +121,12 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root,
    if (nullptr == dso_handle) {
      LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " ("
                   << dlerror() << ")";
+      if (dlPath.find("nccl") != std::string::npos) {
+        std::cout
+            << "You may need to install 'nccl2' from NVIDIA official website: "
+            << "https://developer.nvidia.com/nccl/nccl-download"
+            << "before install PaddlePaddle" << std::endl;
+      }
      dlPath = dso_name;
      dso_handle = GetDsoHandleFromDefaultPath(dlPath, dynload_flags);
    }

--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -115,6 +115,7 @@ function cmake_gen() {
        -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
        -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
+        -DWITH_INFERENCE=${WITH_INFERENCE:-ON}
        -DWITH_ANAKIN=${WITH_ANAKIN:-OFF}
        -DPY_VERSION=${PY_VERSION:-2.7}
    ========================================
@@ -144,6 +145,7 @@ EOF
        -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
        -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
+        -DWITH_INFERENCE=${WITH_INFERENCE:-ON} \
        -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \
        -DPY_VERSION=${PY_VERSION:-2.7}
 }
@@ -498,7 +500,7 @@ EOF
 EOF
    if [[ ${WITH_GPU} == "ON"  ]]; then
-        NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.1.2-1+cuda${CUDA_MAJOR} libnccl-dev=2.1.2-1+cuda${CUDA_MAJOR} &&"
+        NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.2.13-1+cuda${CUDA_MAJOR} libnccl-dev=2.2.13-1+cuda${CUDA_MAJOR} &&"
    else
        NCCL_DEPS=""
    fi

--- a/python/paddle/dataset/image.py
+++ b/python/paddle/dataset/image.py
@@ -104,7 +104,7 @@ def batch_images_from_tar(data_file,
                pickle.dump(
                    output,
                    open('%s/batch_%d' % (out_path, file_id), 'wb'),
-                    protocol=pickle.HIGHEST_PROTOCOL)
+                    protocol=2)
                file_id += 1
                data = []
                labels = []
@@ -113,9 +113,7 @@ def batch_images_from_tar(data_file,
        output['label'] = labels
        output['data'] = data
        pickle.dump(
-            output,
+            output, open('%s/batch_%d' % (out_path, file_id), 'wb'), protocol=2)
-            open('%s/batch_%d' % (out_path, file_id), 'wb'),
-            protocol=pickle.HIGHEST_PROTOCOL)
    with open(meta_file, 'a') as meta:
        for file in os.listdir(out_path):

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -4025,10 +4025,12 @@ def transpose(x, perm, name=None):
    helper = LayerHelper('transpose', **locals())
    out = helper.create_tmp_variable(x.dtype)
+    x_shape = helper.create_tmp_variable(x.dtype)
    helper.append_op(
-        type='transpose',
+        type='transpose2',
        inputs={'X': [x]},
-        outputs={'Out': [out]},
+        outputs={'Out': [out],
+                 'XShape': [x_shape]},
        attrs={'axis': perm})
    return out
@@ -4520,13 +4522,15 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
                "Each dimension size given in shape must not be negtive "
                "except one unknown dimension.")
-    helper = LayerHelper("reshape", **locals())
+    helper = LayerHelper("reshape2", **locals())
    out = helper.create_tmp_variable(dtype=x.dtype)
+    x_shape = helper.create_tmp_variable(dtype=x.dtype)
    helper.append_op(
-        type="reshape",
+        type="reshape2",
        inputs=inputs,
        attrs={"shape": shape},
-        outputs={"Out": out})
+        outputs={"Out": out,
+                 "XShape": x_shape})
    return helper.append_activation(out)
@@ -4570,11 +4574,13 @@ def squeeze(input, axes, name=None):
    """
    helper = LayerHelper("squeeze", **locals())
    out = helper.create_tmp_variable(dtype=input.dtype)
+    x_shape = helper.create_tmp_variable(dtype=input.dtype)
    helper.append_op(
-        type="squeeze",
+        type="squeeze2",
        inputs={"X": input},
        attrs={"axes": axes},
-        outputs={"Out": out})
+        outputs={"Out": out,
+                 "XShape": x_shape})
    return out
@@ -4605,11 +4611,13 @@ def unsqueeze(input, axes, name=None):
    """
    helper = LayerHelper("unsqueeze", **locals())
    out = helper.create_tmp_variable(dtype=input.dtype)
+    x_shape = helper.create_tmp_variable(dtype=input.dtype)
    helper.append_op(
-        type="unsqueeze",
+        type="unsqueeze2",
        inputs={"X": input},
        attrs={"axes": axes},
-        outputs={"Out": out})
+        outputs={"Out": out,
+                 "XShape": x_shape})
    return out
@@ -5811,10 +5819,12 @@ def flatten(x, axis=1, name=None):
        raise ValueError("The axis should be a int, and in range [0, rank(x)]")
    out = helper.create_tmp_variable(x.dtype)
+    x_shape = helper.create_tmp_variable(x.dtype)
    helper.append_op(
-        type='flatten',
+        type='flatten2',
        inputs={"X": x},
-        outputs={'Out': out},
+        outputs={'Out': out,
+                 'XShape': x_shape},
        attrs={"axis": axis})
    return out

--- a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
+++ b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
@@ -47,14 +47,14 @@ def train_program():
    loss = fluid.layers.square_error_cost(input=y_predict, label=y)
    avg_loss = fluid.layers.mean(loss)
-    return avg_loss
+    return [avg_loss, y_predict]
 def optimizer_func():
    return fluid.optimizer.SGD(learning_rate=0.001)
-def train(use_cuda, train_program, params_dirname):
+def train(use_cuda, train_program, params_dirname, inference_model_dirname):
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    trainer = fluid.Trainer(
@@ -74,6 +74,8 @@ def train(use_cuda, train_program, params_dirname):
                '''
                if params_dirname is not None:
                    trainer.save_params(params_dirname)
+                    trainer.save_inference_model(inference_model_dirname,
+                                                 ['x'], [1])
                trainer.stop()
    trainer.train(
@@ -99,15 +101,55 @@ def infer(use_cuda, inference_program, params_dirname=None):
    print("infer results: ", results[0])
+def infer_by_saved_model(use_cuda, save_dirname=None):
+    if save_dirname is None:
+        return
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    inference_scope = fluid.core.Scope()
+    with fluid.scope_guard(inference_scope):
+        # Use fluid.io.load_inference_model to obtain the inference program desc,
+        # the feed_target_names (the names of variables that will be feeded
+        # data using feed operators), and the fetch_targets (variables that
+        # we want to obtain data from using fetch operators).
+        [inference_program, feed_target_names,
+         fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+        # The input's dimension should be 2-D and the second dim is 13
+        # The input data should be >= 0
+        batch_size = 10
+        test_reader = paddle.batch(
+            paddle.dataset.uci_housing.test(), batch_size=batch_size)
+        test_data = next(test_reader())
+        test_feat = numpy.array(
+            [data[0] for data in test_data]).astype("float32")
+        test_label = numpy.array(
+            [data[1] for data in test_data]).astype("float32")
+        assert feed_target_names[0] == 'x'
+        results = exe.run(inference_program,
+                          feed={feed_target_names[0]: numpy.array(test_feat)},
+                          fetch_list=fetch_targets)
+        print("infer shape: ", results[0].shape)
+        print("infer results: ", results[0])
+        print("ground truth: ", test_label)
 def main(use_cuda):
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return
    # Directory for saving the trained model
-    params_dirname = "fit_a_line.inference.model"
+    params_dirname = "fit_a_line.model"
+    inference_model_dirname = "fit_a_line.inference_model"
-    train(use_cuda, train_program, params_dirname)
+    train(use_cuda, train_program, params_dirname, inference_model_dirname)
    infer(use_cuda, inference_program, params_dirname)
+    infer_by_saved_model(use_cuda, inference_model_dirname)
 class TestFitALine(unittest.TestCase):

--- a/python/paddle/fluid/tests/unittests/dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/dist_transformer.py
@@ -36,6 +36,7 @@ import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 from paddle.fluid import core
 from test_dist_base import TestDistRunnerBase, runtime_main
+import paddle.compat as cpt
 from paddle.compat import long_type
 import hashlib
@@ -315,8 +316,9 @@ def pad_batch_data(insts,
    """
    return_list = []
    max_len = max(len(inst) for inst in insts)
-    num_token = reduce(lambda x, y: x + y,
+    num_token = six.moves.reduce(
-                       [len(inst) for inst in insts]) if return_num_token else 0
+        lambda x, y: x + y,
+        [len(inst) for inst in insts]) if return_num_token else 0
    # Any token included in dict can be used to pad, since the paddings' loss
    # will be masked out by weights and make no effect on parameter gradients.
    inst_data = np.array(
@@ -328,7 +330,7 @@ def pad_batch_data(insts,
        return_list += [inst_weight.astype("float32").reshape([-1, 1])]
    else:  # position data
        inst_pos = np.array([
-            range(1, len(inst) + 1) + [0] * (max_len - len(inst))
+            list(range(1, len(inst) + 1)) + [0] * (max_len - len(inst))
            for inst in insts
        ])
        return_list += [inst_pos.astype("int64").reshape([-1, 1])]
@@ -385,10 +387,11 @@ def prepare_batch_input(insts, data_input_names, src_pad_idx, trg_pad_idx,
        return_num_token=True)
    data_input_dict = dict(
-        zip(data_input_names, [
+        list(
-            src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos,
+            zip(data_input_names, [
-            trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
+                src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos,
-        ]))
+                trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
+            ])))
    return data_input_dict, np.asarray([num_token], dtype="float32")
@@ -561,7 +564,7 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
                        np.log(TrainTaskConfig.label_smooth_eps / (
                            ModelHyperParams.trg_vocab_size - 1) + 1e-20))
    init = False
-    for pass_id in xrange(TrainTaskConfig.pass_num):
+    for pass_id in six.moves.xrange(TrainTaskConfig.pass_num):
        pass_start_time = time.time()
        for batch_id, data in enumerate(train_data()):
            if batch_id >= 5:
@@ -587,11 +590,11 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
                    ModelHyperParams.eos_idx, ModelHyperParams.n_head,
                    ModelHyperParams.d_model)
                total_num_token += num_token
-                feed_kv_pairs = data_input_dict.items()
+                feed_kv_pairs = list(data_input_dict.items())
                if TrainTaskConfig.local:
-                    feed_kv_pairs += {
+                    feed_kv_pairs += list({
                        lr_scheduler.learning_rate.name: lr_rate
-                    }.items()
+                    }.items())
                feed_list.append(dict(feed_kv_pairs))
                if not init:
@@ -873,6 +876,7 @@ class DataReader(object):
            f = tarfile.open(fpaths[0], "r")
            for line in f.extractfile(tar_fname):
+                line = cpt.to_text(line)
                fields = line.strip("\n").split(self._field_delimiter)
                if (not self._only_src and len(fields) == 2) or (
                        self._only_src and len(fields) == 1):
@@ -882,8 +886,9 @@ class DataReader(object):
                if not os.path.isfile(fpath):
                    raise IOError("Invalid file: %s" % fpath)
-                with open(fpath, "r") as f:
+                with open(fpath, "rb") as f:
                    for line in f:
+                        line = cpt.to_text(line)
                        fields = line.strip("\n").split(self._field_delimiter)
                        if (not self._only_src and len(fields) == 2) or (
                                self._only_src and len(fields) == 1):
@@ -892,8 +897,9 @@ class DataReader(object):
    @staticmethod
    def load_dict(dict_path, reverse=False):
        word_dict = {}
-        with open(dict_path, "r") as fdict:
+        with open(dict_path, "rb") as fdict:
            for idx, line in enumerate(fdict):
+                line = cpt.to_text(line)
                if reverse:
                    word_dict[idx] = line.strip("\n")
                else:
@@ -1034,7 +1040,7 @@ def multi_head_attention(queries,
        # size of the input as the output dimension size.
        return layers.reshape(
            x=trans_x,
-            shape=map(int, [0, 0, trans_x.shape[2] * trans_x.shape[3]]))
+            shape=list(map(int, [0, 0, trans_x.shape[2] * trans_x.shape[3]])))
    def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
        """

--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -249,7 +249,7 @@ class OpTest(unittest.TestCase):
        outs, _ = self._calc_output(place)
        return outs
-    def _calc_output(self, place, parallel=False):
+    def _calc_output(self, place, parallel=False, no_check_set=None):
        program = Program()
        block = program.global_block()
@@ -273,6 +273,8 @@ class OpTest(unittest.TestCase):
        # if not, fill the fetch_list by the user configured outputs in test.
        if len(fetch_list) == 0:
            for var_name, var in six.iteritems(outputs):
+                if no_check_set is not None and var_name in no_check_set:
+                    continue
                if isinstance(var, list):
                    for v in var:
                        fetch_list.append(v)
@@ -291,11 +293,17 @@ class OpTest(unittest.TestCase):
                            return_numpy=False)
        return outs, fetch_list
-    def check_output_with_place(self, place, atol, equal_nan=False):
+    def check_output_with_place(self,
-        outs, fetch_list = self._calc_output(place)
+                                place,
+                                atol,
+                                no_check_set=None,
+                                equal_nan=False):
+        outs, fetch_list = self._calc_output(place, no_check_set=no_check_set)
        for out_name, out_dup in Operator.get_op_outputs(self.op_type):
            if out_name not in self.outputs:
                continue
+            if no_check_set is not None and out_name in no_check_set:
+                continue
            def find_actual(target_name, fetch_list):
                found = [
@@ -360,10 +368,10 @@ class OpTest(unittest.TestCase):
            places.append(core.CUDAPlace(0))
        return places
-    def check_output(self, atol=1e-5, equal_nan=False):
+    def check_output(self, atol=1e-5, no_check_set=None, equal_nan=False):
        places = self._get_places()
        for place in places:
-            self.check_output_with_place(place, atol, equal_nan)
+            self.check_output_with_place(place, atol, no_check_set, equal_nan)
    def check_output_customized(self, checker):
        places = self._get_places()

--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -55,6 +55,7 @@ class TestDistRunnerBase(object):
        pserver_prog = t.get_pserver_program(args.current_endpoint)
        startup_prog = t.get_startup_program(args.current_endpoint,
                                             pserver_prog)
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(startup_prog)
@@ -147,6 +148,8 @@ def runtime_main(test_class):
 import paddle.compat as cpt
+import socket
+from contextlib import closing
 class TestDistBase(unittest.TestCase):
@@ -156,13 +159,19 @@ class TestDistBase(unittest.TestCase):
    def setUp(self):
        self._trainers = 2
        self._pservers = 2
-        self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
+        self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
+            self._find_free_port(), self._find_free_port())
        self._python_interp = "python"
        self._sync_mode = True
        self._mem_opt = False
        self._use_reduce = False
        self._setup_config()
+    def _find_free_port(self):
+        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
+            s.bind(('', 0))
+            return s.getsockname()[1]
    def start_pserver(self, model_file, check_error_log):
        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
        ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist"

--- a/python/paddle/fluid/tests/unittests/test_flatten_op.py
+++ b/python/paddle/fluid/tests/unittests/test_flatten_op.py
@@ -22,14 +22,17 @@ from op_test import OpTest
 class TestFlattenOp(OpTest):
    def setUp(self):
-        self.op_type = "flatten"
+        self.op_type = "flatten2"
        self.init_test_case()
        self.inputs = {"X": np.random.random(self.in_shape).astype("float32")}
        self.init_attrs()
-        self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)}
+        self.outputs = {
+            "Out": self.inputs["X"].reshape(self.new_shape),
+            "XShape": np.random.random(self.in_shape).astype("float32")
+        }
    def test_check_output(self):
-        self.check_output()
+        self.check_output(no_check_set=["XShape"])
    def test_check_grad(self):
        self.check_grad(["X"], "Out")

--- a/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
@@ -53,12 +53,11 @@ class TestFusionLSTMOp(OpTest):
        self.M = 8
        self.D = 16
        self.has_initial_state = False
+        self.use_peepholes = False
        self.is_reverse = False
        self.act_gate = 'sigmoid'
        self.act_cell = 'tanh'
        self.act_cand = 'tanh'
-        self.use_peepholes = False
-        self.use_seq = False
        self.set_conf()
        T = sum(self.lod[0])
@@ -108,7 +107,6 @@ class TestFusionLSTMOp(OpTest):
        }
        self.attrs = {
            'use_peepholes': self.use_peepholes,
-            'use_seq': self.use_seq,
            'is_reverse': self.is_reverse,
            'gate_activation': self.act_gate,
            'cell_activation': self.act_cell,
@@ -178,50 +176,18 @@ class TestFusionLSTMOpPeepholesReverse(TestFusionLSTMOp):
        self.is_reverse = True
-class TestFusionLSTMOpPoopholesBS1(TestFusionLSTMOp):
+class TestFusionLSTMOpPeepholesInitReverse(TestFusionLSTMOp):
    def set_conf(self):
        self.use_peepholes = True
-        self.lod = [[3]]
-        self.D = 16
-class TestFusionLSTMOpSeqInit(TestFusionLSTMOp):
-    def set_conf(self):
-        self.use_seq = True
-        self.has_initial_state = True
-class TestFusionLSTMOpSeqReverse(TestFusionLSTMOp):
-    def set_conf(self):
-        self.use_seq = True
-        self.is_reverse = True
-class TestFusionLSTMOpSeqInitReverse(TestFusionLSTMOp):
-    def set_conf(self):
-        self.use_seq = True
        self.has_initial_state = True
        self.is_reverse = True
-class TestFusionLSTMOpSeqPeepholes(TestFusionLSTMOp):
+class TestFusionLSTMOpPeepholesBS1(TestFusionLSTMOp):
    def set_conf(self):
-        self.use_seq = True
        self.use_peepholes = True
+        self.lod = [[2]]
+        self.D = 8
-class TestFusionLSTMOpSeqPeepholesInit(TestFusionLSTMOp):
-    def set_conf(self):
-        self.use_seq = True
-        self.use_peepholes = True
-        self.has_initial_state = True
-class TestFusionLSTMOpSeqPeepholesReverse(TestFusionLSTMOp):
-    def set_conf(self):
-        self.use_seq = True
-        self.use_peepholes = True
-        self.is_reverse = True
 if __name__ == '__main__':

--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
@@ -85,6 +85,7 @@ class TestFetchOp(unittest.TestCase):
                    assert not math.isnan(np.sum(ret[i])) and \
                           not math.isinf(np.sum(ret[i]))
+    @unittest.skip(reason="CI timeout")
    def test_fetch_op(self):
        tst_reader = paddle.batch(flowers.test(use_xmap=False), batch_size=16)
        tst_reader_iter = tst_reader()
@@ -139,6 +140,7 @@ class TestFeedParallel(unittest.TestCase):
            if batch_id == 2:
                break
+    @unittest.skip(reason="CI timeout")
    def test_feed_op(self):
        os.environ['CPU_NUM'] = str(4)
        if core.is_compiled_with_cuda():

--- a/python/paddle/fluid/tests/unittests/test_prelu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_prelu_op.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 import unittest
 import numpy as np
+import six
 from op_test import OpTest
@@ -62,17 +63,20 @@ class PReluTest(OpTest):
 # TODO(minqiyang): Resume these test cases after fixing Python3 CI job issues
-#  class TestCase1(PReluTest):
+if six.PY2:
-#  def initTestCase(self):
-#  self.attrs = {'mode': "all"}
-#  class TestCase2(PReluTest):
+    class TestCase1(PReluTest):
-#  def initTestCase(self):
+        def initTestCase(self):
-#  self.attrs = {'mode': "channel"}
+            self.attrs = {'mode': "all"}
+    class TestCase2(PReluTest):
+        def initTestCase(self):
+            self.attrs = {'mode': "channel"}
+    class TestCase3(PReluTest):
+        def initTestCase(self):
+            self.attrs = {'mode': "element"}
-#  class TestCase3(PReluTest):
-#  def initTestCase(self):
-#  self.attrs = {'mode': "element"}
 if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_reshape_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py
@@ -22,106 +22,39 @@ from op_test import OpTest
 class TestReshapeOp(OpTest):
    def setUp(self):
-        ori_shape = (2, 25)
+        self.init_data()
-        new_shape = (5, 10)
+        self.op_type = "reshape2"
+        self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
-        self.op_type = "reshape"
+        self.attrs = {"shape": self.new_shape}
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.outputs = {
-        self.attrs = {"shape": new_shape}
+            "Out": self.inputs["X"].reshape(self.infered_shape),
-        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+            'XShape': np.random.random(self.ori_shape).astype("float32")
+        }
-    def test_check_output(self):
-        self.check_output()
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-class TestReshapeOpDimInfer1(OpTest):
-    def setUp(self):
-        ori_shape = (5, 10)
-        new_shape = (5, -1, 5)
-        self.op_type = "reshape"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"shape": new_shape}
-        self.outputs = {"Out": self.inputs["X"].reshape(self.attrs["shape"])}
-    def test_check_output(self):
-        self.check_output()
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-class TestReshapeOpDimInfer2(OpTest):
-    def setUp(self):
-        ori_shape = (2, 2, 6)
-        new_shape = (2, 0, 3, -1)
-        infered_shape = (2, 2, 3, -1)
-        self.op_type = "reshape"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"shape": new_shape}
-        self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)}
-    def test_check_output(self):
-        self.check_output()
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-class TestReshapeOpInplace(OpTest):
-    def setUp(self):
-        ori_shape = (2, 25)
-        new_shape = (5, 10)
-        self.op_type = "reshape"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"shape": new_shape}
-        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
-    def test_check_output(self):
-        self.check_output()
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-class TestReshapeOpDimInferInplace1(OpTest):
-    def setUp(self):
-        ori_shape = (5, 10)
-        new_shape = (5, -1, 5)
-        self.op_type = "reshape"
+    def init_data(self):
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.ori_shape = (2, 25)
-        self.attrs = {"shape": new_shape}
+        self.new_shape = (5, 10)
-        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+        self.infered_shape = (5, 10)
    def test_check_output(self):
-        self.check_output()
+        self.check_output(no_check_set=['XShape'])
    def test_check_grad(self):
        self.check_grad(["X"], "Out")
-class TestReshapeOpDimInferInplace2(OpTest):
+class TestReshapeOpDimInfer1(TestReshapeOp):
-    def setUp(self):
+    def init_data(self):
-        ori_shape = (2, 2, 6)
+        self.ori_shape = (5, 10)
-        new_shape = (2, 0, 3, -1)
+        self.new_shape = (5, -1, 5)
-        infered_shape = (2, 2, 3, -1)
+        self.infered_shape = (5, -1, 5)
-        self.op_type = "reshape"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"shape": new_shape}
-        self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)}
-    def test_check_output(self):
-        self.check_output()
-    def test_check_grad(self):
+class TestReshapeOpDimInfer2(TestReshapeOp):
-        self.check_grad(["X"], "Out")
+    def init_data(self):
+        self.ori_shape = (2, 2, 6)
+        self.new_shape = (2, 0, 3, -1)
+        self.infered_shape = (2, 2, 3, -1)
 class TestReshapeOpWithInputShape(OpTest):
@@ -130,20 +63,23 @@ class TestReshapeOpWithInputShape(OpTest):
        new_shape = (0, -1, 5)
        actual_shape = (2, 3, 5)
-        self.op_type = "reshape"
+        self.op_type = "reshape2"
        self.inputs = {
            "X": np.random.random(ori_shape).astype("float32"),
            "Shape": np.array(
                actual_shape, dtype="int32")
        }
        self.attrs = {"shape": new_shape}
-        self.outputs = {"Out": self.inputs["X"].reshape(actual_shape)}
+        self.outputs = {
+            "Out": self.inputs["X"].reshape(actual_shape),
+            'XShape': np.random.random(ori_shape).astype("float32")
+        }
    def test_check_output(self):
-        self.check_output()
+        self.check_output(no_check_set=['XShape'])
    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
+        self.check_grad(["X"], "Out", sum_outputs=["Out"])
 if __name__ == "__main__":

--- a/python/paddle/fluid/tests/unittests/test_squeeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_squeeze_op.py
@@ -23,14 +23,17 @@ from op_test import OpTest
 # Correct: General.
 class TestSqueezeOp(OpTest):
    def setUp(self):
-        self.op_type = "squeeze"
+        self.op_type = "squeeze2"
        self.init_test_case()
        self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
        self.init_attrs()
-        self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)}
+        self.outputs = {
+            "Out": self.inputs["X"].reshape(self.new_shape),
+            "XShape": np.random.random(self.ori_shape).astype("float32")
+        }
    def test_check_output(self):
-        self.check_output()
+        self.check_output(no_check_set=['XShape'])
    def test_check_grad(self):
        self.check_grad(["X"], "Out")

--- a/python/paddle/fluid/tests/unittests/test_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py
@@ -22,16 +22,19 @@ from op_test import OpTest
 class TestTransposeOp(OpTest):
    def setUp(self):
        self.initTestCase()
-        self.op_type = "transpose"
+        self.op_type = "transpose2"
        self.inputs = {'X': np.random.random(self.shape).astype("float32")}
        self.attrs = {'axis': list(self.axis)}
-        self.outputs = {'Out': self.inputs['X'].transpose(self.axis)}
+        self.outputs = {
+            'XShape': np.random.random(self.shape).astype("float32"),
+            'Out': self.inputs['X'].transpose(self.axis)
+        }
    def test_check_output(self):
-        self.check_output()
+        self.check_output(no_check_set=['XShape'])
    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
+        self.check_grad(['X'], 'Out', sum_outputs=['Out'])
    def initTestCase(self):
        self.shape = (3, 4)

--- a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
@@ -24,13 +24,16 @@ from op_test import OpTest
 class TestUnsqueezeOp(OpTest):
    def setUp(self):
        self.init_test_case()
-        self.op_type = "unsqueeze"
+        self.op_type = "unsqueeze2"
        self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
        self.init_attrs()
-        self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)}
+        self.outputs = {
+            "Out": self.inputs["X"].reshape(self.new_shape),
+            "XShape": np.random.random(self.ori_shape).astype("float32")
+        }
    def test_check_output(self):
-        self.check_output()
+        self.check_output(no_check_set=["XShape"])
    def test_check_grad(self):
        self.check_grad(["X"], "Out")

--- a/python/paddle/fluid/trainer.py
+++ b/python/paddle/fluid/trainer.py
@@ -431,6 +431,28 @@ class Trainer(object):
            exe = executor.Executor(self.place)
            io.save_persistables(exe, dirname=param_path)
+    def save_inference_model(self, param_path, feeded_var_names,
+                             target_var_indexes):
+        """
+        Save model for cpp inference into :code:`param_path`.
+        Args:
+            param_path(str): The path to save parameters.
+            feeded_var_names(list(str)): The name of the vars that you
+                need to feed in before run program.
+            target_var_indexes(list(int)): the index of target var that
+                you need to return in trainer.train_func.
+        Returns:
+            None
+        """
+        with self._prog_and_scope_guard():
+            exe = executor.Executor(self.place)
+            target_vars = [
+                self.train_func_outputs[index] for index in target_var_indexes
+            ]
+            io.save_inference_model(param_path, feeded_var_names, target_vars,
+                                    exe)
    @contextlib.contextmanager
    def _prog_and_scope_guard(self):
        with framework.program_guard(

--- a/python/paddle/fluid/transpiler/details/program_utils.py
+++ b/python/paddle/fluid/transpiler/details/program_utils.py
@@ -153,7 +153,7 @@ def block_to_code(block, block_idx):
    indent += 1
    # sort all vars
-    all_vars = sorted(block.vars.iteritems(), key=lambda x: x[0])
+    all_vars = sorted(six.iteritems(block.vars), key=lambda x: x[0])
    for var in all_vars:
        print("{}{}".format(get_indent_space(indent), variable_to_code(var[1])))

--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -300,7 +300,7 @@ class DistributeTranspiler(object):
            input_deps = grad_name_to_send_dummy_out.values()
            program.global_block().append_op(
                type="send_barrier",
-                inputs={"X": input_deps},
+                inputs={"X": list(input_deps)},
                outputs={"Out": send_barrier_out},
                attrs={
                    "endpoints": pserver_endpoints,
@@ -401,7 +401,7 @@ class DistributeTranspiler(object):
        Args:
            recv_vars (list): Variable list to recv for current trainer_id
-            eplist (list): A list of strings indicating 
+            eplist (list): A list of strings indicating
        Returns:
            Program: trainer side startup program.
@@ -455,7 +455,7 @@ class DistributeTranspiler(object):
            if len(splited_var) <= 1:
                continue
            # NOTE: if enable memory optimization, origin vars maybe removed.
-            if startup_program.global_block().vars.has_key(varname):
+            if varname in startup_program.global_block().vars:
                orig_param = startup_program.global_block().vars[varname]
            else:
                origin_param_var = self.origin_program.global_block().vars[
@@ -690,7 +690,7 @@ class DistributeTranspiler(object):
        Args:
            endpoint (str): current pserver endpoint.
        Returns:
            tuple: (main_program, startup_program), of type "Program"
        """
@@ -713,7 +713,7 @@ class DistributeTranspiler(object):
            endpoint (str): current pserver endpoint.
            pserver_program (Program): deprecated, call get_pserver_program first.
            startup_program (Program): deprecated, should pass startup_program
-                when initalizing 
+                when initalizing
        Returns:
            Program: parameter server side startup program.