diff --git a/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md b/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md
index 84987ea5daee9abd0fe2fe71bdfde62ea3388ab5..99f8bee5ca1519ccf5d7c35ad2a64da4a8841ada 100644
--- a/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md
+++ b/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md
@@ -104,6 +104,7 @@ visualDL --logdir=scratch_log --port=8080
 
 # 访问 http://127.0.0.1:8080
 ```
+如果出现`TypeError: __init__() got an unexpected keyword argument 'file'`, 是因为protobuf不是3.5以上，运行`pip install --upgrade protobuf`就能解决。
 
 如果在虚拟环境下仍然遇到安装问题，请尝试以下方法。
 
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 70e5b97770a6c581c6a9c0145b03c42b83f14471..c2694144d708161a3bed214ceca745505656456f 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -43,6 +43,7 @@ paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list',
 paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
 paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.Trainer.__init__ ArgSpec(args=['self', 'train_func', 'optimizer_func', 'param_path', 'place', 'parallel', 'checkpoint_config'], varargs=None, keywords=None, defaults=(None, None, False, None))
+paddle.fluid.Trainer.save_inference_model ArgSpec(args=['self', 'param_path', 'feeded_var_names', 'target_var_indexes'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Trainer.save_params ArgSpec(args=['self', 'param_path'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Trainer.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Trainer.test ArgSpec(args=['self', 'reader', 'feed_order'], varargs=None, keywords=None, defaults=None)
diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
index 0d69dfa79aa26940f8f56f84b35ffed34f29f703..9512fd056e73836cdc34a9e409ab2d7809a6aff6 100644
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
@@ -11,6 +11,7 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+
 #include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h"
 #include <string>
 #include "paddle/fluid/framework/lod_tensor.h"
diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index f2e18a461fd221252e4a10262a13bc8e942f5988..b625a617a26cf9d34abe9931eaded9bc2797cf08 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -6,6 +6,7 @@ cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits
   analyzer.cc
   helper.cc
   # passes
+  analysis_pass.cc
   fluid_to_data_flow_graph_pass.cc
   data_flow_graph_to_fluid_pass.cc
   dfg_graphviz_draw_pass.cc
@@ -105,6 +106,6 @@ if (NOT EXISTS ${TEXT_CLASSIFICATION_INSTALL_DIR} AND WITH_TESTING AND WITH_INFE
   inference_download_and_uncompress(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} "text-classification-Senta.tar.gz")
 endif()
 
-inference_analysis_test(test_text_classification SRCS test_text_classification.cc
+inference_analysis_test(test_text_classification SRCS analyzer_text_classification_tester.cc
     EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
     ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta)
diff --git a/paddle/fluid/inference/analysis/pass.cc b/paddle/fluid/inference/analysis/analysis_pass.cc
similarity index 91%
rename from paddle/fluid/inference/analysis/pass.cc
rename to paddle/fluid/inference/analysis/analysis_pass.cc
index 121b72c0a0aa9a0c568b04f7ee9a5bc5c1d6f5f8..9be9f755b9ed7273d842f8c0e2046f0ca0ce2247 100644
--- a/paddle/fluid/inference/analysis/pass.cc
+++ b/paddle/fluid/inference/analysis/analysis_pass.cc
@@ -12,4 +12,4 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/inference/analysis/pass.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
diff --git a/paddle/fluid/inference/analysis/pass.h b/paddle/fluid/inference/analysis/analysis_pass.h
similarity index 59%
rename from paddle/fluid/inference/analysis/pass.h
rename to paddle/fluid/inference/analysis/analysis_pass.h
index 7719c6f5ff3c940948c7bdbcb25513cdf430281b..b6edb5529ace2ad5bd1b35bfbee1f7a744457cc3 100644
--- a/paddle/fluid/inference/analysis/pass.h
+++ b/paddle/fluid/inference/analysis/analysis_pass.h
@@ -28,10 +28,10 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
-class Pass {
+class AnalysisPass {
  public:
-  Pass() = default;
-  virtual ~Pass() = default;
+  AnalysisPass() = default;
+  virtual ~AnalysisPass() = default;
   // Mutable Pass.
   virtual bool Initialize(Argument *argument) { return false; }
   // Readonly Pass.
@@ -42,23 +42,16 @@ class Pass {
   virtual bool Finalize() { return false; }
 
   // Get a Pass appropriate to print the Node this pass operates on.
-  virtual Pass *CreatePrinterPass(std::ostream &os,
-                                  const std::string &banner) const {
+  virtual AnalysisPass *CreatePrinterPass(std::ostream &os,
+                                          const std::string &banner) const {
     return nullptr;
   }
 
   // Create a debugger Pass that draw the DFG by graphviz toolkit.
-  virtual Pass *CreateGraphvizDebugerPass() const { return nullptr; }
+  virtual AnalysisPass *CreateGraphvizDebugerPass() const { return nullptr; }
 
-  virtual void Run() { LOG(FATAL) << "not valid"; }
-  // Run on a single Node.
-  virtual void Run(Node *x) { LOG(FATAL) << "not valid"; }
-  // Run on a single Function.
-  virtual void Run(Function *x) { LOG(FATAL) << "not valid"; }
-  // Run on a single FunctionBlock.
-  virtual void Run(FunctionBlock *x) { LOG(FATAL) << "not valid"; }
   // Run on a single DataFlowGraph.
-  virtual void Run(DataFlowGraph *x) { LOG(FATAL) << "not valid"; }
+  virtual void Run(DataFlowGraph *x) = 0;
 
   // Human-readable short representation.
   virtual std::string repr() const = 0;
@@ -66,29 +59,8 @@ class Pass {
   virtual std::string description() const { return "No DOC"; }
 };
 
-// NodePass process on any Node types.
-class NodePass : public Pass {
- public:
-  virtual void Run(Node *node) = 0;
-};
-
-// NodePass process on any Function node types.
-class FunctionPass : public Pass {
- public:
-  virtual void Run(Function *node) = 0;
-};
-
-// NodePass process on any FunctionBlock node types.
-class FunctionBlockPass : public Pass {
- public:
-  virtual void Run(FunctionBlock *node) = 0;
-};
-
 // GraphPass processes on any GraphType.
-class DataFlowGraphPass : public Pass {
- public:
-  virtual void Run(DataFlowGraph *graph) = 0;
-};
+class DataFlowGraphPass : public AnalysisPass {};
 
 }  // namespace analysis
 }  // namespace inference
diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc
index 1fd884435d173800563ea37809003ed3aee16c7c..6dc39cae0522efd48c2e2921611adebd6937ddf7 100644
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
 #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
 #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
@@ -58,7 +59,7 @@ class DfgPassManagerImpl final : public DfgPassManager {
   std::string description() const override { return "DFG pass manager."; }
 
  private:
-  void AddPass(const std::string& name, Pass* pass) {
+  void AddPass(const std::string& name, AnalysisPass* pass) {
     VLOG(3) << "Adding pass " << name;
     Register(name, pass);
     AddGraphvizDebugerPass(pass);
@@ -87,7 +88,7 @@ class DfgPassManagerImpl final : public DfgPassManager {
   }
 
   // Add the graphviz debuger pass if the parent pass has one.
-  void AddGraphvizDebugerPass(Pass* pass) {
+  void AddGraphvizDebugerPass(AnalysisPass* pass) {
     auto* debuger_pass = pass->CreateGraphvizDebugerPass();
     if (debuger_pass) {
       Register(debuger_pass->repr(), debuger_pass);
diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h
index 3fdd2b9ec7537c891a04efb3ca9a1d45075ffa5e..abc3021e7ec3f0f970d786b782ad17510b8bdbd8 100644
--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -36,8 +36,11 @@ limitations under the License. */
  */
 
 #include <gflags/gflags.h>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/inference/analysis/flags.h"
-#include "paddle/fluid/inference/analysis/pass.h"
 #include "paddle/fluid/inference/analysis/pass_manager.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/analysis/test_text_classification.cc b/paddle/fluid/inference/analysis/analyzer_text_classification_tester.cc
similarity index 94%
rename from paddle/fluid/inference/analysis/test_text_classification.cc
rename to paddle/fluid/inference/analysis/analyzer_text_classification_tester.cc
index 2913824f62301795aea967c22021b2af11f343c1..265e814acd594d6185251cbaa4d6880bb9ee7405 100644
--- a/paddle/fluid/inference/analysis/test_text_classification.cc
+++ b/paddle/fluid/inference/analysis/analyzer_text_classification_tester.cc
@@ -12,13 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/inference/analysis/analyzer.h"
 #include <gflags/gflags.h>
 #include <glog/logging.h>  // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
 #include <gtest/gtest.h>
 #include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/inference/api/timer.h"
 
 DEFINE_string(infer_model, "", "Directory of the inference model.");
@@ -100,10 +101,3 @@ void Main(int batch_size) {
 TEST(text_classification, basic) { Main(FLAGS_batch_size); }
 
 }  // namespace paddle
-
-USE_PASS(fc_fuse_pass);
-USE_PASS(seq_concat_fc_fuse_pass);
-USE_PASS(fc_lstm_fuse_pass);
-USE_PASS(graph_viz_pass);
-USE_PASS(infer_clean_graph_pass);
-USE_PASS(attention_lstm_fuse_pass);
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
index 80c85555e722433f3657e880520b3fe459f6ce1a..8579845d51e80d73d220465d25b70944f5ad9bf2 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
@@ -263,7 +263,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
 };
 }  // namespace
 
-Pass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
+AnalysisPass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
   return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
       FLAGS_IA_graphviz_log_root,
       "data_flow_graph_to_fluid_graphviz_debugger"));
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
index 0c9a8a0b7cae17bf2eaa714348ea1c9b5e43611b..891c7226e245fa3b92892785362c186185a61f62 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
@@ -21,8 +21,8 @@
 
 #include <string>
 #include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
-#include "paddle/fluid/inference/analysis/pass.h"
 
 namespace paddle {
 namespace inference {
@@ -42,7 +42,7 @@ class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
     return "Transform a DFG to a Fluid ProgramDesc";
   }
 
-  Pass *CreateGraphvizDebugerPass() const override;
+  AnalysisPass *CreateGraphvizDebugerPass() const override;
 
  protected:
   // Add a Fluid Op into the ProgramDesc.
diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
index 17445ab4407a159ca11345bc9a9226b3ad0044f0..e537bfc0e64d4ff46b3d61499a1a0298ed83533f 100644
--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
@@ -21,8 +21,8 @@ limitations under the License. */
 
 #include <fstream>
 #include <string>
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/inference/analysis/dot.h"
-#include "paddle/fluid/inference/analysis/pass.h"
 
 namespace paddle {
 namespace inference {
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
index 51bd0ac42d455f68ac5d70f0ce9703dfad6070d4..2b7d632c839e735ca03c6e17b94307b40cc13374 100644
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
@@ -66,7 +66,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
 };
 }
 
-Pass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const {
+AnalysisPass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const {
   return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
       FLAGS_IA_graphviz_log_root, "fluid-to-dfg-debuger"));
 }
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
index fb948bf2242abcbc1e841fd3b8457e63358782c5..b9e262020e9522e167b998d57e2be2ac19b48447 100644
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
@@ -22,8 +22,8 @@
 #include <string>
 
 #include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
-#include "paddle/fluid/inference/analysis/pass.h"
 
 namespace paddle {
 namespace inference {
@@ -46,7 +46,7 @@ class FluidToDataFlowGraphPass final : public DataFlowGraphPass {
     return "transform a fluid ProgramDesc to a data flow graph.";
   }
 
-  Pass *CreateGraphvizDebugerPass() const override;
+  AnalysisPass *CreateGraphvizDebugerPass() const override;
 
  private:
   framework::proto::ProgramDesc const *desc_;
diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass.h b/paddle/fluid/inference/analysis/fluid_to_ir_pass.h
index 3086085710d6e850ed27e82d2323690dfdd3ef19..c2599e218a2306f9353b843b7ea3f18aeacb008e 100644
--- a/paddle/fluid/inference/analysis/fluid_to_ir_pass.h
+++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass.h
@@ -14,15 +14,17 @@
 
 #pragma once
 
+#include <string>
+#include <vector>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/inference/analysis/flags.h"
 #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
-#include "paddle/fluid/inference/analysis/pass.h"
 
 namespace paddle {
 namespace inference {
 namespace analysis {
-using namespace framework;
 
 static const char kFluidToIrPassesAttr[] = "__fluid_to_ir_passes__";
 
@@ -48,7 +50,8 @@ class FluidToIrPass final : public DataFlowGraphPass {
     ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path);
     // Load program.
     auto program = LoadProgramDesc(*argument->fluid_model_program_path);
-    argument->origin_program_desc.reset(new proto::ProgramDesc(program));
+    argument->origin_program_desc.reset(
+        new framework::proto::ProgramDesc(program));
     // Create main data flow graph.
     if (!argument->main_dfg) {
       argument->main_dfg.reset(new DataFlowGraph);
@@ -78,12 +81,13 @@ class FluidToIrPass final : public DataFlowGraphPass {
     IRPassManager ir_passes(argument_->Get<ProgramDesc>("ir_program_desc"),
                             nullptr);
     // Pass the scope from analysis to IR if needed.
-    if (argument_->Has(ir::kParamScopeAttr)) {
+    if (argument_->Has(framework::ir::kParamScopeAttr)) {
       // Here the address is passed, attention that IR doesn't own the scope, so
       // the real scope in analysis should live during the IR phase.
       ir_passes.graph().Set(
-          ir::kParamScopeAttr,
-          new Scope *(&argument_->Get<Scope>(ir::kParamScopeAttr)));
+          framework::ir::kParamScopeAttr,
+          new framework::Scope *(&argument_->Get<framework::Scope>(
+              framework::ir::kParamScopeAttr)));
     }
 
     if (FLAGS_IA_enable_ir) {
@@ -95,12 +99,12 @@ class FluidToIrPass final : public DataFlowGraphPass {
     PADDLE_ENFORCE(argument_->main_dfg.get());
     argument_->main_dfg->Build(ir_passes.graph());
     // inherit the arguments from ir.
-    if (ir_passes.graph().Has(ir::kFuseStatisAttr)) {
+    if (ir_passes.graph().Has(framework::ir::kFuseStatisAttr)) {
       argument_->Set(
-          ir::kFuseStatisAttr,
+          framework::ir::kFuseStatisAttr,
           new std::unordered_map<std::string, int>(
               ir_passes.graph().Get<std::unordered_map<std::string, int>>(
-                  ir::kFuseStatisAttr)));
+                  framework::ir::kFuseStatisAttr)));
     }
   }
 
@@ -112,7 +116,7 @@ class FluidToIrPass final : public DataFlowGraphPass {
 
  private:
   // Load parameters from a single file or from a directory.
-  bool LoadParams(Scope *scope, const std::string &dir,
+  bool LoadParams(framework::Scope *scope, const std::string &dir,
                   const std::string &prog_file, const std::string &param_file);
 
  private:
diff --git a/paddle/fluid/inference/analysis/model_store_pass.h b/paddle/fluid/inference/analysis/model_store_pass.h
index 3a2869e30bd80cfd0756f8e21acb414656620eaa..f14b49e09c2f8e79c6fc4accdbf17f4f7a9bb1a3 100644
--- a/paddle/fluid/inference/analysis/model_store_pass.h
+++ b/paddle/fluid/inference/analysis/model_store_pass.h
@@ -19,7 +19,7 @@
 
 #pragma once
 #include <string>
-#include "paddle/fluid/inference/analysis/pass.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
 
 namespace paddle {
 namespace inference {
diff --git a/paddle/fluid/inference/analysis/pass_manager.cc b/paddle/fluid/inference/analysis/pass_manager.cc
index ff5ec94265a4f05c1294ad6c8ac5f86c249b84b6..759b2b96a1944c060ac98b6865b58ba2f6369607 100644
--- a/paddle/fluid/inference/analysis/pass_manager.cc
+++ b/paddle/fluid/inference/analysis/pass_manager.cc
@@ -40,17 +40,6 @@ void DfgPassManager::RunAll() {
   }
 }
 
-void NodePassManager::RunAll() {
-  PADDLE_ENFORCE(argument_);
-  PADDLE_ENFORCE(argument_->main_dfg.get());
-  auto trait = GraphTraits<DataFlowGraph>(*argument_->main_dfg).nodes_in_DFS();
-  for (auto& node : trait) {
-    for (auto& pass : data_) {
-      pass->Run(&node);
-    }
-  }
-}
-
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/pass_manager.h b/paddle/fluid/inference/analysis/pass_manager.h
index 81a17e0287a5aef8a328e43380ee3691f5a32379..412747c4fcce73303703f586f7a04edf4cc5ee76 100644
--- a/paddle/fluid/inference/analysis/pass_manager.h
+++ b/paddle/fluid/inference/analysis/pass_manager.h
@@ -33,7 +33,7 @@ limitations under the License. */
 
 #include <string>
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/inference/analysis/pass.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
 
 namespace paddle {
 namespace inference {
@@ -43,7 +43,7 @@ namespace analysis {
  * PassManager is the base class for all pass managers, a pass manager has
  * several Pass-es registered, and execute them in the linear order.
  */
-class PassManager : public OrderedRegistry<Pass> {
+class PassManager : public OrderedRegistry<AnalysisPass> {
  public:
   PassManager() = default;
   // Call all the passes' Initialize methods. The desc and data_flow_graph are
@@ -89,18 +89,6 @@ class DfgPassManager : public PassManager {
   virtual ~DfgPassManager() = default;
 };
 
-/*
- * A pass manager that process a Node each time.
- */
-class NodePassManager : public PassManager {
- public:
-  NodePassManager() = default;
-
-  void RunAll() override;
-
-  virtual ~NodePassManager() = default;
-};
-
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/pass_manager_tester.cc b/paddle/fluid/inference/analysis/pass_manager_tester.cc
index 13423e4837e12a96e7a5dfc9ca3f59bf8b14746a..72b0fbf7e571ec97a0ea093d01449c1d5ddb9b91 100644
--- a/paddle/fluid/inference/analysis/pass_manager_tester.cc
+++ b/paddle/fluid/inference/analysis/pass_manager_tester.cc
@@ -34,28 +34,6 @@ class TestDfgPassManager final : public DfgPassManager {
   std::string description() const override { return "test doc"; }
 };
 
-class TestNodePassManager final : public NodePassManager {
- public:
-  virtual ~TestNodePassManager() = default;
-
-  std::string repr() const override { return "test-node-pass-manager"; }
-  std::string description() const override { return "test doc"; }
-};
-
-class TestNodePass final : public NodePass {
- public:
-  virtual ~TestNodePass() = default;
-
-  bool Initialize(Argument* argument) override { return true; }
-
-  void Run(Node* node) override {
-    LOG(INFO) << "- Processing node " << node->repr();
-  }
-
-  std::string repr() const override { return "test-node"; }
-  std::string description() const override { return "some doc"; }
-};
-
 TEST(PassManager, DFG_pass_manager) {
   TestDfgPassManager manager;
   DFG_GraphvizDrawPass::Config config("./", "dfg.dot");
@@ -71,19 +49,6 @@ TEST(PassManager, DFG_pass_manager) {
   manager.RunAll();
 }
 
-TEST(PassManager, Node_pass_manager) {
-  Argument argument(FLAGS_inference_model_dir);
-  // Pre-process: initialize the DFG with the ProgramDesc first.
-  FluidToDataFlowGraphPass pass0;
-  pass0.Initialize(&argument);
-  pass0.Run(argument.main_dfg.get());
-
-  TestNodePassManager manager;
-  manager.Register("test-node-pass", new TestNodePass);
-  ASSERT_TRUE(manager.Initialize(&argument));
-  manager.RunAll();
-}
-
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc
index 9f51fafe0b2a66f9d062a6b751fe7a3bc662ce7c..174c8513f92cf869419f04cab5a54af65e9673b8 100644
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc
@@ -68,7 +68,7 @@ class DfgDebuggerPass : public DFG_GraphvizDrawPass {
   }
 };
 
-Pass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const {
+AnalysisPass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const {
   DFG_GraphvizDrawPass::Config config(FLAGS_IA_graphviz_log_root,
                                       "tensorrt_marked_node");
   return new DfgDebuggerPass(config);
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h
index c558a6ebbde371071c7330a14cc986bf764d1773..c881a54c240538b68abdcb9060db69de3bf2b8bb 100644
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h
@@ -20,7 +20,7 @@
 #pragma once
 
 #include <string>
-#include "paddle/fluid/inference/analysis/pass.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/inference/analysis/subgraph_splitter.h"
 
 namespace paddle {
@@ -48,7 +48,7 @@ class TensorRTSubgraphNodeMarkPass : public DataFlowGraphPass {
     return "tensorrt sub-graph mark pass";
   }
 
-  Pass* CreateGraphvizDebugerPass() const override;
+  AnalysisPass* CreateGraphvizDebugerPass() const override;
   bool Finalize() override;
 
  private:
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
index c6741a92095d33d261a4e1667c87a8ca02e51a9f..219e3f5470f627e81005aabf94f9c72c33fd2eed 100644
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once
 
 #include <string>
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/inference/analysis/node.h"
-#include "paddle/fluid/inference/analysis/pass.h"
 #include "paddle/fluid/inference/analysis/subgraph_splitter.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index ea00bf364951b0a4304b380df492d00e84451136..6b8278a0395c9ae71e32337d9735409de7ba0c96 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -44,19 +44,7 @@ function(inference_api_test TARGET_NAME)
 endfunction(inference_api_test)
 
 cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor)
-cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api
-          analysis
-          ir_pass_manager
-          pass
-          fc_fuse_pass
-          fc_lstm_fuse_pass
-          seq_concat_fc_fuse_pass
-          graph_viz_pass
-          infer_clean_graph_pass
-          graph_pattern_detector
-          infer_clean_graph_pass
-          attention_lstm_fuse_pass
-  )
+cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis)
 
 cc_test(test_paddle_inference_api
         SRCS api_tester.cc
diff --git a/paddle/fluid/operators/fusion_gru_op.cc b/paddle/fluid/operators/fusion_gru_op.cc
index 582c75872ab2818cdf834f9a46278db1d6f91d54..916f84cb4a78c3721cb67bd3cf8d3759a8eaf1bf 100644
--- a/paddle/fluid/operators/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fusion_gru_op.cc
@@ -30,14 +30,7 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
                  "Input(WeightX) of GRU should not be null.");
   PADDLE_ENFORCE(ctx->HasInput("WeightH"),
                  "Input(WeightH) of GRU should not be null.");
-
   PADDLE_ENFORCE(ctx->HasOutput("XX"), "Output(XX) of GRU should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
-                 "Output(ReorderedH0) of GRU should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
-                 "Output(BatchedInput) of GRU should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("BatchedOut"),
-                 "Output(BatchedOut) of GRU should not be null.");
   PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
                  "Output(Hidden) of GRU should not be null.");
 
@@ -80,15 +73,20 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
   }
   framework::DDim out_dims({x_dims[0], frame_size});
   ctx->SetOutputDim("Hidden", out_dims);
-  ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
-  ctx->SetOutputDim("BatchedOut", out_dims);
   ctx->ShareLoD("X", "Hidden");
-
   int xx_width;
   if (ctx->Attrs().Get<bool>("use_seq")) {
     xx_width = wx_dims[1];
   } else {
     xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
+    PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
+                   "Output(ReorderedH0) of GRU should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
+                   "Output(BatchedInput) of GRU should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchedOut"),
+                   "Output(BatchedOut) of GRU should not be null.");
+    ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
+    ctx->SetOutputDim("BatchedOut", out_dims);
   }
   ctx->SetOutputDim("XX", {x_dims[0], xx_width});
   ctx->ShareLoD("X", "XX");
diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc
index 104e160e2d7069ec247cc51e927ce8824f1b69e8..ef23ab3f981786d33567619ad0194d21f31bdc8e 100644
--- a/paddle/fluid/operators/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fusion_lstm_op.cc
@@ -38,16 +38,6 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
                  "Output(Hidden) of LSTM should not be null.");
   PADDLE_ENFORCE(ctx->HasOutput("Cell"),
                  "Output(Cell) of LSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
-                 "Output(BatchedInput) of LSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"),
-                 "Output(BatchedHidden) of LSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("BatchedCell"),
-                 "Output(BatchedCell) of LSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
-                 "Output(ReorderedH0) of LSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("ReorderedC0"),
-                 "Output(ReorderedC0) of LSTM should not be null.");
 
   auto x_dims = ctx->GetInputDim("X");
   PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
@@ -88,28 +78,36 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
   PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
   PADDLE_ENFORCE_EQ(b_dims[0], 1,
                     "The first dimension of Input(Bias) should be 1.");
-
-  auto use_peepholes = ctx->Attrs().Get<bool>("use_peepholes");
-  PADDLE_ENFORCE_EQ(b_dims[1], (use_peepholes ? 7 : 4) * frame_size,
-                    "The second dimension of Input(Bias) should be "
-                    "7 * %d if enable peepholes connection or"
-                    "4 * %d if disable peepholes",
-                    frame_size, frame_size);
+  PADDLE_ENFORCE_EQ(
+      b_dims[1], (ctx->Attrs().Get<bool>("use_peepholes") ? 7 : 4) * frame_size,
+      "The second dimension of Input(Bias) should be "
+      "7 * %d if enable peepholes connection or"
+      "4 * %d if disable peepholes",
+      frame_size, frame_size);
 
   framework::DDim out_dims({x_dims[0], frame_size});
   ctx->SetOutputDim("Hidden", out_dims);
   ctx->SetOutputDim("Cell", out_dims);
-  ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
-  ctx->SetOutputDim("BatchedHidden", out_dims);
-  ctx->SetOutputDim("BatchedCell", out_dims);
   ctx->ShareLoD("X", "Hidden");
   ctx->ShareLoD("X", "Cell");
-
   int xx_width;
   if (ctx->Attrs().Get<bool>("use_seq")) {
     xx_width = wx_dims[1];
   } else {
     xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
+    PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
+                   "Output(BatchedInput) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"),
+                   "Output(BatchedHidden) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchedCell"),
+                   "Output(BatchedCell) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
+                   "Output(ReorderedH0) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("ReorderedC0"),
+                   "Output(ReorderedC0) of LSTM should not be null.");
+    ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
+    ctx->SetOutputDim("BatchedHidden", out_dims);
+    ctx->SetOutputDim("BatchedCell", out_dims);
   }
   ctx->SetOutputDim("XX", {x_dims[0], xx_width});
   ctx->ShareLoD("X", "XX");
@@ -232,18 +230,18 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
     act_cand = act_functor(act_cand_str);                                      \
   }
 
-#define INIT_BASE_INPUT_OUTPUT                          \
-  auto* x = ctx.Input<LoDTensor>("X");                  \
-  auto* h0 = ctx.Input<Tensor>("H0");                   \
-  auto* c0 = ctx.Input<Tensor>("C0");                   \
-  auto* wx = ctx.Input<Tensor>("WeightX");              \
-  auto* wh = ctx.Input<Tensor>("WeightH");              \
-  auto* bias = ctx.Input<Tensor>("Bias");               \
-  auto* xx = ctx.Output<LoDTensor>("XX");               \
-  auto* hidden_out = ctx.Output<LoDTensor>("Hidden");   \
-  auto* cell_out = ctx.Output<LoDTensor>("Cell");       \
-  bool use_peepholes = ctx.Attr<bool>("use_peepholes"); \
-  bool is_reverse = ctx.Attr<bool>("is_reverse");
+#define INIT_BASE_INPUT_OUTPUT                        \
+  auto* x = ctx.Input<LoDTensor>("X");                \
+  auto* h0 = ctx.Input<Tensor>("H0");                 \
+  auto* c0 = ctx.Input<Tensor>("C0");                 \
+  auto* wx = ctx.Input<Tensor>("WeightX");            \
+  auto* wh = ctx.Input<Tensor>("WeightH");            \
+  auto* bias = ctx.Input<Tensor>("Bias");             \
+  auto* xx = ctx.Output<LoDTensor>("XX");             \
+  auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
+  auto* cell_out = ctx.Output<LoDTensor>("Cell");     \
+  bool is_reverse = ctx.Attr<bool>("is_reverse");     \
+  bool use_peepholes = ctx.Attr<bool>("use_peepholes");
 
 #define INIT_BASE_SIZES                  \
   auto x_dims = x->dims();   /* T x M*/  \
@@ -254,172 +252,183 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
   const int D3 = D * 3;                  \
   const int D4 = wh_dims[1];
 
+#define INIT_BASE_INPUT_DATAS                                        \
+  const T* x_data = x->data<T>();                                    \
+  const T* wx_data = wx->data<T>();                                  \
+  const T* wh_data = wh->data<T>();                                  \
+  /* diagonal weight*/                                               \
+  const T* wc_data = bias->data<T>() + D4;                           \
+  /* for peephole only*/                                             \
+  Tensor checked_cell;                                               \
+  T* checked_cell_data = nullptr;                                    \
+  auto place = ctx.GetPlace();                                       \
+  if (use_peepholes) {                                               \
+    /* w_ic * Ct-1, w_fc * Ct-1  ; w_oc * Ct => ih*/                 \
+    checked_cell_data = checked_cell.mutable_data<T>({2, D}, place); \
+  }
+
+/// Compute LSTM
+#define GEMM_WH_ADDON(bs, prev, out)                                           \
+  blas.GEMM(CblasNoTrans, CblasNoTrans, bs, D4, D, static_cast<T>(1), prev, D, \
+            wh_data, D4, static_cast<T>(1), out, D4)
+
+// gates: W_ch, W_ih, W_fh, W_oh
+#define GET_Ct(ct_1, gates, ct)                   \
+  /* C_t = C_t-1 * fgated + cand_gated * igated*/ \
+  act_cand(D, gates, gates);                      \
+  blas.VMUL(D, gates, gates + D, gates + D);      \
+  blas.VMUL(D, ct_1, gates + D2, gates + D2);     \
+  blas.VADD(D, gates + D, gates + D2, ct)
+
+#define GET_Ht(ct, gates, ht)        \
+  /* H_t = act_cell(C_t) * ogated */ \
+  act_cell(D, ct, gates + D2);       \
+  blas.VMUL(D, gates + D2, gates + D3, ht)
+
+#define GET_Ct_NOH0C0(gates, ct)     \
+  /* C_t = igated * cgated*/         \
+  act_gate(D, gates + D, gates + D); \
+  act_cand(D, gates, gates);         \
+  blas.VMUL(D, gates, gates + D, ct)
+
+#define COMPUTE_CtHt_NOH0C0(gates, ct, ht) \
+  GET_Ct_NOH0C0(gates, ct);                \
+  act_gate(D, gates + D3, gates + D3);     \
+  GET_Ht(ct, gates, ht)
+
+#define COMPUTE_CtHt_PEEPHOLE_NOH0C0(gates, ct, ht) \
+  GET_Ct_NOH0C0(gates, ct);                         \
+  /* get outgated, put W_oc * C_t on igated */      \
+  blas.VMUL(D, wc_data + D2, ct, gates + D);        \
+  blas.VADD(D, gates + D, gates + D3, gates + D3);  \
+  act_gate(D, gates + D3, gates + D3);              \
+  GET_Ht(ct, gates, ht)
+
+#define COMPUTE_CtHt(gates, ct_1, ct, ht) \
+  act_gate(D3, gates + D, gates + D);     \
+  GET_Ct(ct_1, gates, ct);                \
+  GET_Ht(ct, gates, ht)
+
+#define COMPUTE_CtHt_PEEPHOLE(gates, ct_1, ct, ht)        \
+  /* get fgated and igated*/                              \
+  blas.VMUL(D, wc_data, ct_1, checked_cell_data);         \
+  blas.VMUL(D, wc_data + D, ct_1, checked_cell_data + D); \
+  blas.VADD(D2, checked_cell_data, gates + D, gates + D); \
+  act_gate(D2, gates + D, gates + D);                     \
+  GET_Ct(ct_1, gates, ct);                                \
+  /* get ogated*/                                         \
+  blas.VMUL(D, wc_data + D2, ct, gates + D);              \
+  blas.VADD(D, gates + D, gates + D3, gates + D3);        \
+  act_gate(D, gates + D3, gates + D3);                    \
+  GET_Ht(ct, gates, ht)
+
   void SeqCompute(const framework::ExecutionContext& ctx) const {
     using DeviceContext = paddle::platform::CPUDeviceContext;
     INIT_BASE_INPUT_OUTPUT
     INIT_BASE_SIZES
     INIT_VEC_FUNC
+    INIT_BASE_INPUT_DATAS
 
     auto x_lod = x->lod();
     const int total_T = x_dims[0];
-    const int N = x_lod[0].size() - 1;  // batch size
-
-    const T* x_data = x->data<T>();
+    const int N = x_lod[0].size() - 1;
     const T* h0_data = h0 ? h0->data<T>() : nullptr;
     const T* c0_data = c0 ? c0->data<T>() : nullptr;
-    const T* bias_data = bias->data<T>();
-    const T* wc_data = bias_data + D4;  // w_ic, w_fc, w_oc
-    const T* wx_data = wx->data<T>();
-    const T* wh_data = wh->data<T>();
-
-    T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
-    T* hidden_out_data = hidden_out->mutable_data<T>(ctx.GetPlace());
-    T* cell_out_data = cell_out->mutable_data<T>(ctx.GetPlace());
-
-    // use local variable
-    framework::DDim check_dims({3, D});
-    Tensor checked_cell;  // w_ic * Ct-1, w_fc * Ct-1, w_oc * Ct
-    auto checked_cell_data =
-        checked_cell.mutable_data<T>(check_dims, ctx.GetPlace());
-
+    T* xx_data = xx->mutable_data<T>(place);
+    T* h_out_data = hidden_out->mutable_data<T>(place);
+    T* c_out_data = cell_out->mutable_data<T>(place);
     auto blas = math::GetBlas<DeviceContext, T>(ctx);
     math::FCCompute<DeviceContext, T>(blas, total_T, D4, M, x_data, wx_data,
                                       xx_data, bias->data<T>());
+
     int xx_offset = D4;
     int gate_offset = D;
     if (is_reverse) {
       const int offset = (total_T - 1) * D;
       xx_data = xx_data + offset * 4;
-      hidden_out_data = hidden_out_data + offset;
-      cell_out_data = cell_out_data + offset;
+      h_out_data = h_out_data + offset;
+      c_out_data = c_out_data + offset;
       xx_offset = -D4;
       gate_offset = -D;
     }
 
-    auto move_step = [&]() {
-      xx_data = xx_data + xx_offset;
-      hidden_out_data = hidden_out_data + gate_offset;
-      cell_out_data = cell_out_data + gate_offset;
-    };
-
-    for (int i = 0; i < N; ++i) {
-      int bid = is_reverse ? N - 1 - i : i;
-      int seq_len = x_lod[0][bid + 1] - x_lod[0][bid];
-      const T* prev_c_data = nullptr;
-      const T* prev_h_data = nullptr;
-
-      int tstart = 0;
-      if (h0_data) {
-        prev_h_data = h0_data + bid * D;
-        prev_c_data = c0_data + bid * D;
-      } else {
-        // If step == 0 and there is no initialized hidden state, that is to say
-        // the H0 is zeros. Then W_h * H_t-1 can be skipped
-
-        // ~C_t
-        act_cand(D, xx_data, xx_data);
-        if (use_peepholes) {
-          // I_t, F_t
-          act_gate(D2, xx_data + D, xx_data + D);
-        } else {
-          // I_t, F_t, O_t
-          act_gate(D3, xx_data + D, xx_data + D);
-        }
-        // C_t = I_t * ~C_t
-        blas.VMUL(D, xx_data, xx_data + D, cell_out_data);
-
-        if (use_peepholes) {
-          // + W_oc * C_t for peephole connection
-          blas.VMUL(D, wc_data + D2, cell_out_data, checked_cell_data + D2);
-          blas.VADD(D, xx_data + D3, checked_cell_data + D2, xx_data + D3);
-          // O_t
-          act_gate(D, xx_data + D3, xx_data + D3);
-        }
-
-        // hidden out= act_state(cellout) * outgate
-        act_cell(D, cell_out_data, xx_data + D2);
-        // H_t = O_t * act_state(C_t)
-        blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data);
-
-        // prev
-        prev_h_data = hidden_out_data;
-        prev_c_data = cell_out_data;
-
-        tstart = 1;
-        move_step();
-      }
-
-      for (int step = tstart; step < seq_len; ++step) {
-        // + W_h * H_t-1
-        blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D4, D, static_cast<T>(1),
-                  prev_h_data, D, wh_data, D4, static_cast<T>(1), xx_data, D4);
+#define MOVE_ONE_STEP                    \
+  prev_h_data = h_out_data;              \
+  prev_c_data = c_out_data;              \
+  xx_data = xx_data + xx_offset;         \
+  h_out_data = h_out_data + gate_offset; \
+  c_out_data = c_out_data + gate_offset
+
+#define PROCESS_H0C0_DEFINES                       \
+  int bid = is_reverse ? N - 1 - i : i;            \
+  int seq_len = x_lod[0][bid + 1] - x_lod[0][bid]; \
+  const T* prev_c_data = nullptr;                  \
+  const T* prev_h_data = nullptr;                  \
+  int tstart = 0
+
+#define PROCESS_H0C0_PEEPHOLE                                      \
+  PROCESS_H0C0_DEFINES;                                            \
+  if (h0_data) {                                                   \
+    prev_h_data = h0_data + bid * D;                               \
+    prev_c_data = c0_data + bid * D;                               \
+  } else {                                                         \
+    COMPUTE_CtHt_PEEPHOLE_NOH0C0(xx_data, c_out_data, h_out_data); \
+    MOVE_ONE_STEP;                                                 \
+    tstart = 1;                                                    \
+  }
 
-        // ~C_t
-        act_cand(D, xx_data, xx_data);
+#define PROCESS_H0C0                                      \
+  PROCESS_H0C0_DEFINES;                                   \
+  if (h0_data) {                                          \
+    prev_h_data = h0_data + bid * D;                      \
+    prev_c_data = c0_data + bid * D;                      \
+  } else {                                                \
+    COMPUTE_CtHt_NOH0C0(xx_data, c_out_data, h_out_data); \
+    MOVE_ONE_STEP;                                        \
+    tstart = 1;                                           \
+  }
 
-        if (use_peepholes) {
-          // + W_ic|W_fc * C_t-1 for peephole connection
-          blas.VMUL(D, wc_data, prev_c_data, checked_cell_data);
-          blas.VMUL(D, wc_data + D, prev_c_data, checked_cell_data + D);
-          blas.VADD(D2, xx_data + D, checked_cell_data, xx_data + D);
-          // I_t, F_t
-          act_gate(D2, xx_data + D, xx_data + D);
-        } else {
-          // I_t, F_t, O_t
-          act_gate(D3, xx_data + D, xx_data + D);
+    if (use_peepholes) {
+      for (int i = 0; i < N; ++i) {
+        PROCESS_H0C0_PEEPHOLE
+        for (int step = tstart; step < seq_len; ++step) {
+          GEMM_WH_ADDON(1, prev_h_data, xx_data);
+          COMPUTE_CtHt_PEEPHOLE(xx_data, prev_c_data, c_out_data, h_out_data);
+          MOVE_ONE_STEP;
         }
-
-        // F_t * C_t-1
-        blas.VMUL(D, xx_data + D2, prev_c_data, xx_data + D2);
-        // I_t * ~C_t
-        blas.VMUL(D, xx_data, xx_data + D, xx_data + D);
-        // C_t = F_t * C_t-1 + I_t * ~C_t
-        blas.VADD(D, xx_data + D, xx_data + D2, cell_out_data);
-
-        if (use_peepholes) {
-          // + W_oc * C_t for peephole connection
-          blas.VMUL(D, wc_data + D2, cell_out_data, checked_cell_data + D2);
-          blas.VADD(D, xx_data + D3, checked_cell_data + D2, xx_data + D3);
-          // O_t
-          act_gate(D, xx_data + D3, xx_data + D3);
+      }
+    } else {
+      for (int i = 0; i < N; ++i) {
+        PROCESS_H0C0
+        for (int step = tstart; step < seq_len; ++step) {
+          GEMM_WH_ADDON(1, prev_h_data, xx_data);
+          COMPUTE_CtHt(xx_data, prev_c_data, c_out_data, h_out_data);
+          MOVE_ONE_STEP;
         }
-
-        // hidden out= act_state(cellout) * outgate
-        act_cell(D, cell_out_data, xx_data + D2);
-        // H_t = O_t * act_state(C_t)
-        blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data);
-
-        // prev
-        prev_h_data = hidden_out_data;
-        prev_c_data = cell_out_data;
-
-        move_step();
-      }  // for each step in batch
-    }    // for each batch
+      }
+    }
+#undef PROCESS_H0C0_DEFINES
+#undef PROCESS_H0C0_PEEPHOLE
+#undef PROCESS_H0C0
+#undef MOVE_ONE_STEP
   }
 
   void BatchCompute(const framework::ExecutionContext& ctx) const {
     using DeviceContext = platform::CPUDeviceContext;
     INIT_BASE_INPUT_OUTPUT
-    if (x->lod()[0].size() == 2) {  // batch size == 1
+    if (x->lod()[0].size() == 2) {
       SeqCompute(ctx);
       return;
     }
     INIT_BASE_SIZES
     INIT_VEC_FUNC
+    INIT_BASE_INPUT_DATAS
 
     auto* reordered_h0 = ctx.Output<Tensor>("ReorderedH0");
     auto* reordered_c0 = ctx.Output<Tensor>("ReorderedC0");
     auto* batched_input = ctx.Output<LoDTensor>("BatchedInput");
     auto* batched_c_out = ctx.Output<LoDTensor>("BatchedCell");
     auto* batched_h_out = ctx.Output<LoDTensor>("BatchedHidden");
-
-    const T* x_data = x->data<T>();
-    const T* wx_data = wx->data<T>();
-    const T* wh_data = wh->data<T>();
-    const T* bias_data = bias->data<T>();
-    const T* wc_data = bias_data + D4;  // w_ic, w_fc, w_oc
-    auto place = ctx.GetPlace();
     T* xx_data = xx->mutable_data<T>(place);
     T* batched_input_data = batched_input->mutable_data<T>(place);
     T* batched_c_out_data = batched_c_out->mutable_data<T>(place);
@@ -427,12 +436,6 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
     hidden_out->mutable_data<T>(place);
     cell_out->mutable_data<T>(place);
 
-    // use local variable
-    framework::DDim check_dims({3, D});
-    Tensor checked_cell;  // w_ic * Ct-1, w_fc * Ct-1, w_oc * Ct
-    auto checked_cell_data =
-        checked_cell.mutable_data<T>(check_dims, ctx.GetPlace());
-
     math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
     auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
@@ -454,27 +457,17 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
     reordered_h0->Resize({max_bs, D});
     reordered_c0->Resize({max_bs, D});
 
-    T* prev_batch_h_data = nullptr;
-    T* prev_batch_c_data = nullptr;
-    T* cur_batch_in_data = batched_input_data;
-    T* cur_batch_h_out_data = batched_h_out_data;
-    T* cur_batch_c_out_data = batched_c_out_data;
-
-    auto move_step = [&](int bs) {
-      cur_batch_in_data += bs * D4;
-      cur_batch_c_out_data += bs * D;
-      cur_batch_h_out_data += bs * D;
-    };
-
     int tstart = 0;
+    T* prev_h_data = nullptr;
+    T* prev_c_data = nullptr;
     if (h0) {
       // reorder h0, c0
       T* reordered_h0_data = reordered_h0->mutable_data<T>(place);
       T* reordered_c0_data = reordered_c0->mutable_data<T>(place);
       const T* h0_data = h0->data<T>();
       const T* c0_data = c0->data<T>();
-      prev_batch_h_data = reordered_h0_data;
-      prev_batch_c_data = reordered_c0_data;
+      prev_h_data = reordered_h0_data;
+      prev_c_data = reordered_c0_data;
       size_t sz = sizeof(T) * D;
       for (int i = 0; i < max_bs; ++i) {
         std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz);
@@ -483,123 +476,80 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
         reordered_c0_data += D;
       }
     } else {
-      // Compute with no H0/C0
-      T* cur_in_data = cur_batch_in_data;
-      T* cur_c_out_data = cur_batch_c_out_data;
-      T* cur_h_out_data = cur_batch_h_out_data;
-
-      // If step == 0 and there is no initialized hidden state, that is to say
-      // the H0 is zeros. Then W_h * H_t-1 can be skiped
-
-      for (int i = 0; i < max_bs; ++i) {  // iterate each data in 1st batch
-        // ~C_t
-        act_cand(D, cur_in_data, cur_in_data);
-
-        if (use_peepholes) {
-          // I_t, F_t
-          act_gate(D2, cur_in_data + D, cur_in_data + D);
-        } else {
-          // I_t, F_t, O_t
-          act_gate(D3, cur_in_data + D, cur_in_data + D);
-        }
-
-        // C_t = I_t * ~C_t
-        blas.VMUL(D, cur_in_data, cur_in_data + D, cur_c_out_data);
-
+      // compute without h0, c0
+      T* cur_in_data = batched_input_data;
+      T* cur_h_out_data = batched_h_out_data;
+      T* cur_c_out_data = batched_c_out_data;
+      for (int i = 0; i < max_bs; ++i) {
+        GET_Ct_NOH0C0(cur_in_data, cur_c_out_data);
         if (use_peepholes) {
-          // + W_oc * C_t for peephole connection
-          blas.VMUL(D, wc_data + D2, cur_c_out_data, checked_cell_data + D2);
-          blas.VADD(D, cur_in_data + D3, checked_cell_data + D2,
-                    cur_in_data + D3);
-          // O_t
-          act_gate(D, cur_in_data + D3, cur_in_data + D3);
+          blas.VMUL(D, wc_data + D2, cur_c_out_data, cur_in_data + D);
+          blas.VADD(D, cur_in_data + D, cur_in_data + D3, cur_in_data + D3);
         }
-
-        // hidden out= act_state(cellout) * outgate
-        act_cell(D, cur_c_out_data, cur_in_data + D2);
-        // H_t = O_t * act_state(C_t)
-        blas.VMUL(D, cur_in_data + D2, cur_in_data + D3, cur_h_out_data);
-
-        // move to next data in the same batch
+        act_gate(D, cur_in_data + D3, cur_in_data + D3);
+        GET_Ht(cur_c_out_data, cur_in_data, cur_h_out_data);
         cur_in_data += D4;
         cur_c_out_data += D;
         cur_h_out_data += D;
       }
-
-      // move to data for next timestep
-      prev_batch_h_data = cur_batch_h_out_data;
-      prev_batch_c_data = cur_batch_c_out_data;
-      move_step(max_bs);
       tstart = 1;
+      prev_h_data = batched_h_out_data;
+      prev_c_data = batched_c_out_data;
     }
-
     const auto& batch_starts = batched_lod[0];
     const int max_seq_len = batch_starts.size() - 1;
-    for (int step = tstart; step < max_seq_len; ++step) {
-      const int cur_bs = batch_starts[step + 1] - batch_starts[step];
-      // + W_h * H_t-1
-      blas.GEMM(CblasNoTrans, CblasNoTrans, cur_bs, D4, D, static_cast<T>(1),
-                prev_batch_h_data, D, wh_data, D4, static_cast<T>(1),
-                cur_batch_in_data, D4);
-
-      T* cur_in_data = cur_batch_in_data;
-      T* cur_c_out_data = cur_batch_c_out_data;
-      T* cur_h_out_data = cur_batch_h_out_data;
-      T* prev_c_data = prev_batch_c_data;  // NULL if no C0 in step0
-      T* prev_h_data = prev_batch_h_data;  // NULL if no H0 in step0
-      auto next_data_in_batch = [&]() {
-        cur_in_data += D4;
-        cur_c_out_data += D;
-        cur_h_out_data += D;
-        prev_c_data = prev_c_data ? prev_c_data + D : nullptr;
-        prev_h_data = prev_h_data ? prev_h_data + D : nullptr;
-      };
-
-      for (int i = 0; i < cur_bs; ++i) {  // iterate each data in same batch
-        // ~C_t
-        act_cand(D, cur_in_data, cur_in_data);
-
-        if (use_peepholes) {
-          // + W_ic|W_fc * C_t-1 for peephole connection
-          blas.VMUL(D, wc_data, prev_c_data, checked_cell_data);
-          blas.VMUL(D, wc_data + D, prev_c_data, checked_cell_data + D);
-          blas.VADD(D2, cur_in_data + D, checked_cell_data, cur_in_data + D);
-          // I_t, F_t
-          act_gate(D2, cur_in_data + D, cur_in_data + D);
-        } else {
-          // I_t, F_t, O_t
-          act_gate(D3, cur_in_data + D, cur_in_data + D);
+    const int offset = tstart * max_bs * D;
+    batched_input_data = batched_input_data + offset * 4;
+    batched_h_out_data = batched_h_out_data + offset;
+    batched_c_out_data = batched_c_out_data + offset;
+
+#define DEFINE_CUR                        \
+  T* cur_in_data = batched_input_data;    \
+  T* cur_prev_c_data = prev_c_data;       \
+  T* cur_c_out_data = batched_c_out_data; \
+  T* cur_h_out_data = batched_h_out_data
+
+#define MOVE_ONE_BATCH  \
+  cur_in_data += D4;    \
+  cur_prev_c_data += D; \
+  cur_c_out_data += D;  \
+  cur_h_out_data += D
+
+#define MOVE_ONE_STEP                  \
+  prev_c_data = batched_c_out_data;    \
+  prev_h_data = batched_h_out_data;    \
+  batched_c_out_data = cur_c_out_data; \
+  batched_h_out_data = cur_h_out_data; \
+  batched_input_data = cur_in_data
+
+    if (use_peepholes) {
+      for (int step = tstart; step < max_seq_len; ++step) {
+        const int cur_bs = batch_starts[step + 1] - batch_starts[step];
+        GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data);
+        DEFINE_CUR;
+        for (int i = 0; i < cur_bs; ++i) {
+          COMPUTE_CtHt_PEEPHOLE(cur_in_data, cur_prev_c_data, cur_c_out_data,
+                                cur_h_out_data);
+          MOVE_ONE_BATCH;
         }
-
-        // F_t * C_t-1
-        blas.VMUL(D, cur_in_data + D2, prev_c_data, cur_in_data + D2);
-        // I_t * ~C_t
-        blas.VMUL(D, cur_in_data, cur_in_data + D, cur_in_data + D);
-        // C_t = F_t * C_t-1 + I_t * ~C_t
-        blas.VADD(D, cur_in_data + D, cur_in_data + D2, cur_c_out_data);
-
-        if (use_peepholes) {
-          // + W_oc * C_t for peephole connection
-          blas.VMUL(D, wc_data + D2, cur_c_out_data, checked_cell_data + D2);
-          blas.VADD(D, cur_in_data + D3, checked_cell_data + D2,
-                    cur_in_data + D3);
-          // O_t
-          act_gate(D, cur_in_data + D3, cur_in_data + D3);
+        MOVE_ONE_STEP;
+      }
+    } else {
+      for (int step = tstart; step < max_seq_len; ++step) {
+        const int cur_bs = batch_starts[step + 1] - batch_starts[step];
+        GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data);
+        DEFINE_CUR;
+        for (int i = 0; i < cur_bs; ++i) {
+          COMPUTE_CtHt(cur_in_data, cur_prev_c_data, cur_c_out_data,
+                       cur_h_out_data);
+          MOVE_ONE_BATCH;
         }
-
-        // hidden out= act_state(cellout) * outgate
-        act_cell(D, cur_c_out_data, cur_in_data + D2);
-        // H_t = O_t * act_state(C_t)
-        blas.VMUL(D, cur_in_data + D2, cur_in_data + D3, cur_h_out_data);
-
-        // move to next data in same batch
-        next_data_in_batch();
+        MOVE_ONE_STEP;
       }
-      // move to data for next timestep
-      prev_batch_h_data = cur_batch_h_out_data;
-      prev_batch_c_data = cur_batch_c_out_data;
-      move_step(cur_bs);
     }
+#undef MOVE_ONE_STEP
+#undef MOVE_ONE_BATCH
+#undef DEFINE_CUR
 
     math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
     batched_h_out->set_lod(batched_lod);
@@ -615,6 +565,16 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
       BatchCompute(ctx);
     }
   }
+
+#undef COMPUTE_CtHt_PEEPHOLE
+#undef COMPUTE_CtHt
+#undef GET_Ct_NOH0C0
+#undef COMPUTE_CtHt_NOH0C0
+#undef COMPUTE_CtHt_PEEPHOLE_NOH0C0
+#undef GET_Ht
+#undef GET_Ct
+#undef GEMM_WH_ADDON
+#undef INIT_BASE_INPUT_DATAS
 #undef INIT_BASE_SIZES
 #undef INIT_BASE_INPUT_OUTPUT
 #undef INIT_VEC_FUNC
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index 4fbfa6354ab45fed4839227a2a4be8fe147e5fd9..6a3ad2151081504fda2a3818c5f99ad47039d91d 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -121,6 +121,12 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root,
     if (nullptr == dso_handle) {
       LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " ("
                    << dlerror() << ")";
+      if (dlPath.find("nccl") != std::string::npos) {
+        std::cout
+            << "You may need to install 'nccl2' from NVIDIA official website: "
+            << "https://developer.nvidia.com/nccl/nccl-download"
+            << "before install PaddlePaddle" << std::endl;
+      }
       dlPath = dso_name;
       dso_handle = GetDsoHandleFromDefaultPath(dlPath, dynload_flags);
     }
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 2f12411659fab34dc5c2c8fe4b16eba508a05c5c..7481888174b5134a720ecfcaa4996b50f87ca8a2 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -500,7 +500,7 @@ EOF
 EOF
 
     if [[ ${WITH_GPU} == "ON"  ]]; then
-        NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.1.2-1+cuda${CUDA_MAJOR} libnccl-dev=2.1.2-1+cuda${CUDA_MAJOR} &&"
+        NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.2.13-1+cuda${CUDA_MAJOR} libnccl-dev=2.2.13-1+cuda${CUDA_MAJOR} &&"
     else
         NCCL_DEPS=""
     fi
diff --git a/python/paddle/dataset/image.py b/python/paddle/dataset/image.py
index 920dbf3b4ebb0bc3d98c9ea986d7d039deed4a4c..19fc229e6fa84792f58aeeb00be09eb2401b19c7 100644
--- a/python/paddle/dataset/image.py
+++ b/python/paddle/dataset/image.py
@@ -104,7 +104,7 @@ def batch_images_from_tar(data_file,
                 pickle.dump(
                     output,
                     open('%s/batch_%d' % (out_path, file_id), 'wb'),
-                    protocol=pickle.HIGHEST_PROTOCOL)
+                    protocol=2)
                 file_id += 1
                 data = []
                 labels = []
@@ -113,9 +113,7 @@ def batch_images_from_tar(data_file,
         output['label'] = labels
         output['data'] = data
         pickle.dump(
-            output,
-            open('%s/batch_%d' % (out_path, file_id), 'wb'),
-            protocol=pickle.HIGHEST_PROTOCOL)
+            output, open('%s/batch_%d' % (out_path, file_id), 'wb'), protocol=2)
 
     with open(meta_file, 'a') as meta:
         for file in os.listdir(out_path):
diff --git a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
index f6017a455df7e8bd197ef2563a759f843b5e7c73..e1368a3392a9cab3e82eff0a73eb225a52aa03bf 100644
--- a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
+++ b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
@@ -47,14 +47,14 @@ def train_program():
     loss = fluid.layers.square_error_cost(input=y_predict, label=y)
     avg_loss = fluid.layers.mean(loss)
 
-    return avg_loss
+    return [avg_loss, y_predict]
 
 
 def optimizer_func():
     return fluid.optimizer.SGD(learning_rate=0.001)
 
 
-def train(use_cuda, train_program, params_dirname):
+def train(use_cuda, train_program, params_dirname, inference_model_dirname):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
 
     trainer = fluid.Trainer(
@@ -74,6 +74,8 @@ def train(use_cuda, train_program, params_dirname):
                 '''
                 if params_dirname is not None:
                     trainer.save_params(params_dirname)
+                    trainer.save_inference_model(inference_model_dirname,
+                                                 ['x'], [1])
                 trainer.stop()
 
     trainer.train(
@@ -99,15 +101,55 @@ def infer(use_cuda, inference_program, params_dirname=None):
     print("infer results: ", results[0])
 
 
+def infer_by_saved_model(use_cuda, save_dirname=None):
+    if save_dirname is None:
+        return
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    inference_scope = fluid.core.Scope()
+    with fluid.scope_guard(inference_scope):
+        # Use fluid.io.load_inference_model to obtain the inference program desc,
+        # the feed_target_names (the names of variables that will be feeded
+        # data using feed operators), and the fetch_targets (variables that
+        # we want to obtain data from using fetch operators).
+        [inference_program, feed_target_names,
+         fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+
+        # The input's dimension should be 2-D and the second dim is 13
+        # The input data should be >= 0
+        batch_size = 10
+
+        test_reader = paddle.batch(
+            paddle.dataset.uci_housing.test(), batch_size=batch_size)
+
+        test_data = next(test_reader())
+        test_feat = numpy.array(
+            [data[0] for data in test_data]).astype("float32")
+        test_label = numpy.array(
+            [data[1] for data in test_data]).astype("float32")
+
+        assert feed_target_names[0] == 'x'
+        results = exe.run(inference_program,
+                          feed={feed_target_names[0]: numpy.array(test_feat)},
+                          fetch_list=fetch_targets)
+        print("infer shape: ", results[0].shape)
+        print("infer results: ", results[0])
+        print("ground truth: ", test_label)
+
+
 def main(use_cuda):
     if use_cuda and not fluid.core.is_compiled_with_cuda():
         return
 
     # Directory for saving the trained model
-    params_dirname = "fit_a_line.inference.model"
+    params_dirname = "fit_a_line.model"
+    inference_model_dirname = "fit_a_line.inference_model"
 
-    train(use_cuda, train_program, params_dirname)
+    train(use_cuda, train_program, params_dirname, inference_model_dirname)
     infer(use_cuda, inference_program, params_dirname)
+    infer_by_saved_model(use_cuda, inference_model_dirname)
 
 
 class TestFitALine(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index 58875a1dd19fd91f6f2bed928397ee7f73302dff..c0f5da5a1ae43847dff6348ea5f3e3bfd5e89ab9 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -55,6 +55,7 @@ class TestDistRunnerBase(object):
         pserver_prog = t.get_pserver_program(args.current_endpoint)
         startup_prog = t.get_startup_program(args.current_endpoint,
                                              pserver_prog)
+
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
         exe.run(startup_prog)
@@ -147,6 +148,8 @@ def runtime_main(test_class):
 
 
 import paddle.compat as cpt
+import socket
+from contextlib import closing
 
 
 class TestDistBase(unittest.TestCase):
@@ -156,13 +159,19 @@ class TestDistBase(unittest.TestCase):
     def setUp(self):
         self._trainers = 2
         self._pservers = 2
-        self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
+        self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
+            self._find_free_port(), self._find_free_port())
         self._python_interp = "python"
         self._sync_mode = True
         self._mem_opt = False
         self._use_reduce = False
         self._setup_config()
 
+    def _find_free_port(self):
+        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
+            s.bind(('', 0))
+            return s.getsockname()[1]
+
     def start_pserver(self, model_file, check_error_log):
         ps0_ep, ps1_ep = self._ps_endpoints.split(",")
         ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist"
diff --git a/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py b/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
index 4767e9433ea74d5da83867d646f2a63c9a092668..de0c86f96db958eebd7e74346bec244f0c804ed9 100644
--- a/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
@@ -53,12 +53,11 @@ class TestFusionLSTMOp(OpTest):
         self.M = 8
         self.D = 16
         self.has_initial_state = False
+        self.use_peepholes = False
         self.is_reverse = False
         self.act_gate = 'sigmoid'
         self.act_cell = 'tanh'
         self.act_cand = 'tanh'
-        self.use_peepholes = False
-        self.use_seq = False
         self.set_conf()
 
         T = sum(self.lod[0])
@@ -108,7 +107,6 @@ class TestFusionLSTMOp(OpTest):
         }
         self.attrs = {
             'use_peepholes': self.use_peepholes,
-            'use_seq': self.use_seq,
             'is_reverse': self.is_reverse,
             'gate_activation': self.act_gate,
             'cell_activation': self.act_cell,
@@ -178,50 +176,18 @@ class TestFusionLSTMOpPeepholesReverse(TestFusionLSTMOp):
         self.is_reverse = True
 
 
-class TestFusionLSTMOpPoopholesBS1(TestFusionLSTMOp):
+class TestFusionLSTMOpPeepholesInitReverse(TestFusionLSTMOp):
     def set_conf(self):
         self.use_peepholes = True
-        self.lod = [[3]]
-        self.D = 16
-
-
-class TestFusionLSTMOpSeqInit(TestFusionLSTMOp):
-    def set_conf(self):
-        self.use_seq = True
-        self.has_initial_state = True
-
-
-class TestFusionLSTMOpSeqReverse(TestFusionLSTMOp):
-    def set_conf(self):
-        self.use_seq = True
-        self.is_reverse = True
-
-
-class TestFusionLSTMOpSeqInitReverse(TestFusionLSTMOp):
-    def set_conf(self):
-        self.use_seq = True
         self.has_initial_state = True
         self.is_reverse = True
 
 
-class TestFusionLSTMOpSeqPeepholes(TestFusionLSTMOp):
+class TestFusionLSTMOpPeepholesBS1(TestFusionLSTMOp):
     def set_conf(self):
-        self.use_seq = True
         self.use_peepholes = True
-
-
-class TestFusionLSTMOpSeqPeepholesInit(TestFusionLSTMOp):
-    def set_conf(self):
-        self.use_seq = True
-        self.use_peepholes = True
-        self.has_initial_state = True
-
-
-class TestFusionLSTMOpSeqPeepholesReverse(TestFusionLSTMOp):
-    def set_conf(self):
-        self.use_seq = True
-        self.use_peepholes = True
-        self.is_reverse = True
+        self.lod = [[2]]
+        self.D = 8
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
index 372ef748b2e704fd3858c382e048e51448ed3bd5..a49c5d9b43ae1bffa7cb57764db497f68030b151 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
@@ -85,6 +85,7 @@ class TestFetchOp(unittest.TestCase):
                     assert not math.isnan(np.sum(ret[i])) and \
                            not math.isinf(np.sum(ret[i]))
 
+    @unittest.skip(reason="CI timeout")
     def test_fetch_op(self):
         tst_reader = paddle.batch(flowers.test(use_xmap=False), batch_size=16)
         tst_reader_iter = tst_reader()
@@ -139,6 +140,7 @@ class TestFeedParallel(unittest.TestCase):
             if batch_id == 2:
                 break
 
+    @unittest.skip(reason="CI timeout")
     def test_feed_op(self):
         os.environ['CPU_NUM'] = str(4)
         if core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py
index d094647afe1900809fc32cae93f777765f72c675..30cdfe4ad2c9892184862b70ff49417ce5a08516 100644
--- a/python/paddle/fluid/trainer.py
+++ b/python/paddle/fluid/trainer.py
@@ -431,6 +431,28 @@ class Trainer(object):
             exe = executor.Executor(self.place)
             io.save_persistables(exe, dirname=param_path)
 
+    def save_inference_model(self, param_path, feeded_var_names,
+                             target_var_indexes):
+        """
+        Save model for cpp inference into :code:`param_path`.
+
+        Args:
+            param_path(str): The path to save parameters.
+            feeded_var_names(list(str)): The name of the vars that you
+                need to feed in before run program.
+            target_var_indexes(list(int)): the index of target var that
+                you need to return in trainer.train_func.
+        Returns:
+            None
+        """
+        with self._prog_and_scope_guard():
+            exe = executor.Executor(self.place)
+            target_vars = [
+                self.train_func_outputs[index] for index in target_var_indexes
+            ]
+            io.save_inference_model(param_path, feeded_var_names, target_vars,
+                                    exe)
+
     @contextlib.contextmanager
     def _prog_and_scope_guard(self):
         with framework.program_guard(