merge stack op

03acac2b · zlsh80826 · 13288621 · dc84c951 · 03acac2b · 03acac2b
70 changed files
--- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc
@@ -45,14 +45,35 @@ inline void InitVarsInScope(const std::vector<VarInfo> &var_infos, Scope *scope,
 // get CommContext and remote send and recv op
 void ProcessGraph(std::vector<ir::Graph *> graphs, Scope *scope) {
 #ifdef PADDLE_WITH_DISTRIBUTE
-  // init communicator here
-  auto *instance = operators::distributed::Communicator::GetInstance();
-  auto initialized = instance ? true : false;
-  PADDLE_ENFORCE_EQ(initialized, true,
-                    platform::errors::InvalidArgument(
-                        "Communicator is not Initialized, you may use "
-                        "FleetAPI(https://github.com/PaddlePaddle/Fleet/tree/"
-                        "develop/markdown_doc/transpiler)"));
+
+  bool need_communicator = false;
+
+  for (auto &node : graphs[0]->Nodes()) {
+    // Skip null entries first: the log statement below dereferences the
+    // node, so the null test has to come before it, not after.
+    if (node == nullptr) continue;
+    VLOG(3) << "node name " << node->Name();
+    if (node->IsOp() && node->Name() == "send") {
+      auto send_varnames =
+          BOOST_GET_CONST(std::vector<std::string>,
+                          node->Op()->GetNullableAttr("send_varnames"));
+      if (send_varnames.size() > 0) {
+        need_communicator = true;
+        break;
+      }
+    }
+  }
+
+  if (need_communicator) {
+    // a "send" op with send_varnames requires an existing communicator
+    auto *instance = operators::distributed::Communicator::GetInstance();
+    auto initialized = instance ? true : false;
+    PADDLE_ENFORCE_EQ(initialized, true,
+                      platform::errors::InvalidArgument(
+                          "Communicator is not Initialized, you may use "
+                          "FleetAPI(https://github.com/PaddlePaddle/Fleet/tree/"
+                          "develop/markdown_doc/transpiler)"));
+  }

 #endif
 }

--- a/paddle/fluid/framework/fleet/CMakeLists.txt
+++ b/paddle/fluid/framework/fleet/CMakeLists.txt
@@ -19,6 +19,6 @@ else()
    cc_library(gloo_wrapper SRCS gloo_wrapper.cc DEPS framework_proto variable_helper scope)
 endif(WITH_GLOO)

-cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto)
+cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_context)

 cc_test(test_fleet SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell)
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
@@ -19,6 +19,7 @@
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/platform/errors.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
 #include "paddle/fluid/string/pretty_log.h"

 namespace paddle {
@@ -54,7 +55,7 @@ void LogQuantizationDisabled(Node* op) {
  std::stringstream msg_ss;
  VLOG(4) << "Qantization skipped for operator " << op->Name()
          << " (type: " << op->Op()->Type() << ", id: " << op->id()
-          << "). Attribute use_quantizer = false.";
+          << "). Attribute mkldnn_data_type != \"int8\".";
 }

 }  // namespace
@@ -228,12 +229,12 @@ double CPUQuantizePass::GetScaleValueForNode(const Node* node,

 bool CPUQuantizePass::IsOpDequantized(const Node* node) const {
  return node->Op()->Type() == "dequantize" ||
-         node->Op()->GetAttrIfExists<bool>("use_quantizer");
+         platform::HasOpINT8DataType(node->Op());
 }

 bool CPUQuantizePass::IsOpQuantized(const Node* node) const {
  return node->Op()->Type() == "quantize" ||
-         node->Op()->GetAttrIfExists<bool>("use_quantizer");
+         platform::HasOpINT8DataType(node->Op());
 }

 void CPUQuantizePass::QuantizeConv(Graph* graph,
@@ -248,10 +249,9 @@ void CPUQuantizePass::QuantizeConv(Graph* graph,
                     Graph* g) {
    VLOG(4) << "Quantize conv2d op";
    GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern);
-    auto* conv_op_desc = conv_op->Op();

    // skip if should not be quantized
-    if (!conv_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+    if (!platform::HasOpINT8DataType(conv_op->Op())) {
      LogQuantizationDisabled(conv_op);
      return;
    }
@@ -353,14 +353,13 @@ void CPUQuantizePass::QuantizeFc(Graph* graph) const {
                     Graph* g) {
    VLOG(4) << "Quantize fc op";
    GET_IR_NODE_FROM_SUBGRAPH(fc, fc, fc_pattern);
-    auto* fc_op_desc = fc->Op();

    // skip if should not be quantized
-    if (!fc_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+    if (!platform::HasOpINT8DataType(fc->Op())) {
      LogQuantizationDisabled(fc);
      return;
    }
-    if (!fc_op_desc->GetAttrIfExists<bool>("use_mkldnn")) {
+    if (!fc->Op()->GetAttrIfExists<bool>("use_mkldnn")) {
      return;
    }

@@ -420,10 +419,9 @@ void CPUQuantizePass::QuantizePool(Graph* graph) const {
                     Graph* g) {
    VLOG(4) << "Quantize pool2d op";
    GET_IR_NODE_FROM_SUBGRAPH(pool_op, pool_op, pool_pattern);
-    auto* pool_op_desc = pool_op->Op();

    // skip if should not be quantized
-    if (!pool_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+    if (!platform::HasOpINT8DataType(pool_op->Op())) {
      LogQuantizationDisabled(pool_op);
      return;
    }
@@ -465,10 +463,9 @@ void CPUQuantizePass::QuantizeConcat(Graph* graph) const {
                     Graph* g) {
    VLOG(4) << "Quantize concat op";
    GET_IR_NODE_FROM_SUBGRAPH(concat_op, concat_op, concat_pattern);
-    auto* concat_op_desc = concat_op->Op();

    // skip if should not be quantized
-    if (!concat_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+    if (!platform::HasOpINT8DataType(concat_op->Op())) {
      LogQuantizationDisabled(concat_op);
      return;
    }
@@ -511,10 +508,9 @@ void CPUQuantizePass::QuantizePriorBox(Graph* graph) const {
                     Graph* g) {
    VLOG(4) << "Quantize prior_box op";
    GET_IR_NODE_FROM_SUBGRAPH(prior_box_op, prior_box_op, prior_box_pattern);
-    auto* prior_box_op_desc = prior_box_op->Op();

    // skip if should not be quantized
-    if (!prior_box_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+    if (!platform::HasOpINT8DataType(prior_box_op->Op())) {
      LogQuantizationDisabled(prior_box_op);
      return;
    }
@@ -554,10 +550,9 @@ void CPUQuantizePass::QuantizeTranspose(Graph* graph) const {
                     Graph* g) {
    VLOG(4) << "Quantize transpose op";
    GET_IR_NODE_FROM_SUBGRAPH(transpose_op, transpose_op, transpose_pattern);
-    auto* transpose_op_desc = transpose_op->Op();

    // skip if should not be quantized
-    if (!transpose_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+    if (!platform::HasOpINT8DataType(transpose_op->Op())) {
      LogQuantizationDisabled(transpose_op);
      return;
    }
@@ -609,10 +604,9 @@ void CPUQuantizePass::QuantizeReshape(Graph* graph) const {
                     Graph* g) {
    VLOG(4) << "Quantize reshape op";
    GET_IR_NODE_FROM_SUBGRAPH(reshape_op, reshape_op, reshape_pattern);
-    auto* reshape_op_desc = reshape_op->Op();

    // skip if should not be quantized
-    if (!reshape_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+    if (!platform::HasOpINT8DataType(reshape_op->Op())) {
      LogQuantizationDisabled(reshape_op);
      return;
    }
@@ -662,10 +656,9 @@ void CPUQuantizePass::QuantizeMatmul(Graph* graph) const {
                     Graph* g) {
    VLOG(4) << "Quantize matmul op";
    GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul_op, matmul_pattern);
-    auto* matmul_op_desc = matmul_op->Op();

    // skip if should not be quantized
-    if (!matmul_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+    if (!platform::HasOpINT8DataType(matmul_op->Op())) {
      LogQuantizationDisabled(matmul_op);
      return;
    }
@@ -732,10 +725,9 @@ void CPUQuantizePass::QuantizeElementwiseAdd(Graph* graph) const {
    VLOG(4) << "Quantize elementwise_add op";
    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
                              elementwise_add_pattern);
-    auto* elementwise_add_op_desc = elementwise_add_op->Op();

    // skip if should not be quantized
-    if (!elementwise_add_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+    if (!platform::HasOpINT8DataType(elementwise_add_op->Op())) {
      LogQuantizationDisabled(elementwise_add_op);
      return;
    }

--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
@@ -26,7 +26,7 @@ namespace ir {
 void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
           const std::vector<std::string>& inputs,
           const std::vector<std::string>& outputs, bool use_mkldnn,
-           bool use_quantizer = false) {
+           const std::string& mkldnn_data_type = "float32") {
  auto* op = prog->MutableBlock(0)->AppendOp();
  op->SetType(type);
  op->SetAttr("use_mkldnn", use_mkldnn);
@@ -47,14 +47,14 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
      op->SetAttr("fuse_residual_connection", false);
    }
    op->SetOutput("Output", {outputs[0]});
-    op->SetAttr("use_quantizer", use_quantizer);
+    op->SetAttr("mkldnn_data_type", mkldnn_data_type);
    op->SetAttr("Scale_in", 1.0f);
    op->SetAttr("Scale_out", 1.0f);
    op->SetAttr("Scale_weights", std::vector<float>{1.0f});
  } else if (type == "pool2d" || type == "transpose2" || type == "reshape2") {
    op->SetInput("X", {inputs[0]});
    op->SetOutput("Out", {outputs[0]});
-    op->SetAttr("use_quantizer", use_quantizer);
+    op->SetAttr("mkldnn_data_type", mkldnn_data_type);
  } else if (type == "dropout") {
    op->SetInput("X", {inputs[0]});
    op->SetOutput("Out", {outputs[0]});
@@ -63,14 +63,14 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
    if (inputs.size() > 1) op->SetInput("W", {inputs[1]});
    if (inputs.size() > 2) op->SetInput("Bias", {inputs[2]});
    op->SetOutput("Out", {outputs[0]});
-    op->SetAttr("use_quantizer", use_quantizer);
+    op->SetAttr("mkldnn_data_type", mkldnn_data_type);
    op->SetAttr("Scale_in", 1.0f);
    op->SetAttr("Scale_out", 1.0f);
    op->SetAttr("Scale_weights", std::vector<float>{1.0f});
  } else if (type == "concat") {
    op->SetInput("X", inputs);
    op->SetOutput("Out", outputs);
-    op->SetAttr("use_quantizer", use_quantizer);
+    op->SetAttr("mkldnn_data_type", mkldnn_data_type);
  } else if (type == "dequantize") {
    op->SetInput("Input", {inputs[0]});
    op->SetOutput("Output", {outputs[0]});
@@ -79,7 +79,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
    op->SetInput("X", {inputs[0]});
    if (inputs.size() > 1) op->SetInput("Y", {inputs[1]});
    op->SetOutput("Out", {outputs[0]});
-    op->SetAttr("use_quantizer", use_quantizer);
+    op->SetAttr("mkldnn_data_type", mkldnn_data_type);
    op->SetAttr("Scale_x", 1.0f);
    op->SetAttr("Scale_y", 1.0f);
    op->SetAttr("Scale_out", 1.0f);
@@ -87,7 +87,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
    op->SetInput("X", {inputs[0]});
    if (inputs.size() > 1) op->SetInput("Y", {inputs[1]});
    op->SetOutput("Out", {outputs[0]});
-    op->SetAttr("use_quantizer", use_quantizer);
+    op->SetAttr("mkldnn_data_type", mkldnn_data_type);
    op->SetAttr("Scale_x", 1.0f);
    op->SetAttr("Scale_y", 1.0f);
    op->SetAttr("Scale_out", 1.0f);
@@ -142,7 +142,8 @@ static const std::initializer_list<std::string> variable_names{
 // d->Dropout1->g and (g, w5, b3)->Fc1->h and (h,w3,b1,i)->Conv3->j
 //
 // (d,w4, b2)->Conv4->i
-ProgramDesc BuildProgramDesc(bool use_mkldnn, bool use_quantizer) {
+ProgramDesc BuildProgramDesc(bool use_mkldnn,
+                             const std::string& mkldnn_data_type) {
  ProgramDesc prog;
  for (auto& v : variable_names) {
    auto* var = prog.MutableBlock(0)->Var(v);
@@ -152,21 +153,21 @@ ProgramDesc BuildProgramDesc(bool use_mkldnn, bool use_quantizer) {
  }

  SetOp(&prog, "conv2d", "Conv1", {"a", "w1"}, {"c"}, use_mkldnn,
-        use_quantizer);
-  SetOp(&prog, "pool2d", "Pool1", {"c"}, {"d"}, use_mkldnn, use_quantizer);
+        mkldnn_data_type);
+  SetOp(&prog, "pool2d", "Pool1", {"c"}, {"d"}, use_mkldnn, mkldnn_data_type);

  SetOp(&prog, "conv2d", "Conv2", {"d", "w2"}, {"e"}, use_mkldnn,
-        use_quantizer);
-  SetOp(&prog, "pool2d", "Pool2", {"e"}, {"f"}, use_mkldnn, use_quantizer);
+        mkldnn_data_type);
+  SetOp(&prog, "pool2d", "Pool2", {"e"}, {"f"}, use_mkldnn, mkldnn_data_type);

  SetOp(&prog, "dropout", "Dropout1", {"d"}, {"g"}, use_mkldnn);
  SetOp(&prog, "fc", "Fc1", {"g", "w5", "b3"}, {"h"}, use_mkldnn,
-        use_quantizer);
+        mkldnn_data_type);
  SetOp(&prog, "conv2d", "Conv3", {"h", "w3", "b1", "i"}, {"j"}, use_mkldnn,
-        use_quantizer);
+        mkldnn_data_type);

  SetOp(&prog, "conv2d", "Conv4", {"c", "w4", "b2"}, {"i"}, use_mkldnn,
-        use_quantizer);
+        mkldnn_data_type);

  return prog;
 }
@@ -215,7 +216,7 @@ void MainTest(const ProgramDesc& prog, int conv_count, int pool_count,

 TEST(CpuQuantizePass, quantize) {
  bool use_mkldnn = true;
-  bool use_quantizer = true;
+  std::string mkldnn_data_type = "int8";
  // (a->QUANT1->IN1,w1)->Conv1->OUT1->DEQUANT1->c and
  // c->QUANT2->IN2->Pool1->OUT2->DEQUANT2->d
  //
@@ -228,16 +229,16 @@ TEST(CpuQuantizePass, quantize) {
  // (d->QUANT7->IN7,w4, b2)->Conv4->DEQUANT6->OUT6->i
  // Insert nodes: 8 Quant + 8 IN + 7 OUT + 7 DEQUANT
  int added_nodes = 8 + 8 + 7 + 7;
-  MainTest(BuildProgramDesc(use_mkldnn, use_quantizer), 4, 2, 8, 7, added_nodes,
-           2.0f * 127);
+  MainTest(BuildProgramDesc(use_mkldnn, mkldnn_data_type), 4, 2, 8, 7,
+           added_nodes, 2.0f * 127);
 }

 TEST(CpuQuantizePass, do_not_quantize) {
  bool use_mkldnn = true;
-  bool use_quantizer = false;
+  std::string mkldnn_data_type = "float32";
  int added_nodes = 0;
-  MainTest(BuildProgramDesc(use_mkldnn, use_quantizer), 4, 2, 0, 0, added_nodes,
-           1.0f);
+  MainTest(BuildProgramDesc(use_mkldnn, mkldnn_data_type), 4, 2, 0, 0,
+           added_nodes, 1.0f);
 }

 static const std::initializer_list<std::string> variable_names_concat = {
@@ -250,10 +251,10 @@ static const std::initializer_list<std::string> variable_names_concat = {
 ProgramDesc BuildProgramDescConcat() {
  ProgramDesc prog;

-  SetOp(&prog, "pool2d", "Pool1", {"a1"}, {"b1"}, true, false);
-  SetOp(&prog, "pool2d", "Pool2", {"a2"}, {"b2"}, true, false);
-  SetOp(&prog, "concat", "Concat", {"b1", "b2"}, {"c"}, true, true);
-  SetOp(&prog, "pool2d", "Pool3", {"c"}, {"d"}, true, false);
+  SetOp(&prog, "pool2d", "Pool1", {"a1"}, {"b1"}, true, "float32");
+  SetOp(&prog, "pool2d", "Pool2", {"a2"}, {"b2"}, true, "float32");
+  SetOp(&prog, "concat", "Concat", {"b1", "b2"}, {"c"}, true, "int8");
+  SetOp(&prog, "pool2d", "Pool3", {"c"}, {"d"}, true, "float32");

  return prog;
 }
@@ -321,11 +322,11 @@ ProgramDesc BuildProgramDescTranspose() {
    }
  }

-  SetOp(&prog, "conv2d", "Conv1", {"a", "w1"}, {"b"}, true, true);
-  SetOp(&prog, "transpose2", "Transpose1", {"b"}, {"c"}, true, true);
-  SetOp(&prog, "conv2d", "Conv1", {"c", "w2"}, {"d"}, true, true);
-  SetOp(&prog, "transpose2", "Transpose2", {"d"}, {"e"}, true, true);
-  SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, false);
+  SetOp(&prog, "conv2d", "Conv1", {"a", "w1"}, {"b"}, true, "int8");
+  SetOp(&prog, "transpose2", "Transpose1", {"b"}, {"c"}, true, "int8");
+  SetOp(&prog, "conv2d", "Conv1", {"c", "w2"}, {"d"}, true, "int8");
+  SetOp(&prog, "transpose2", "Transpose2", {"d"}, {"e"}, true, "int8");
+  SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, "float32");

  return prog;
 }
@@ -400,8 +401,8 @@ ProgramDesc BuildProgramDescReshape() {
    prog.MutableBlock(0)->Var(v);
  }
  SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true);
-  SetOp(&prog, "reshape2", "Reshape2", {"b"}, {"c"}, true, true);
-  SetOp(&prog, "dropout", "Dropout", {"c"}, {"d"}, true, false);
+  SetOp(&prog, "reshape2", "Reshape2", {"b"}, {"c"}, true, "int8");
+  SetOp(&prog, "dropout", "Dropout", {"c"}, {"d"}, true, "float32");

  return prog;
 }
@@ -415,9 +416,9 @@ ProgramDesc BuildProgramDescReshapeBetweenNonQuantizedOp() {
    prog.MutableBlock(0)->Var(v);
  }

-  SetOp(&prog, "transpose2", "Transpose2", {"a"}, {"b"}, true, false);
-  SetOp(&prog, "reshape2", "Reshape2", {"b"}, {"c"}, true, true);
-  SetOp(&prog, "dropout", "Dropout", {"c"}, {"d"}, true, false);
+  SetOp(&prog, "transpose2", "Transpose2", {"a"}, {"b"}, true, "float32");
+  SetOp(&prog, "reshape2", "Reshape2", {"b"}, {"c"}, true, "int8");
+  SetOp(&prog, "dropout", "Dropout", {"c"}, {"d"}, true, "float32");

  return prog;
 }
@@ -505,8 +506,8 @@ ProgramDesc BuildProgramDescMatmul() {
  }
  SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true);
  SetOp(&prog, "dequantize", "Dequantize2", {"c"}, {"d"}, true);
-  SetOp(&prog, "matmul", "Matmul", {"b", "d"}, {"e"}, true, true);
-  SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, false);
+  SetOp(&prog, "matmul", "Matmul", {"b", "d"}, {"e"}, true, "int8");
+  SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, "float32");

  return prog;
 }
@@ -518,8 +519,8 @@ ProgramDesc BuildProgramDescMatmulNotQuantized() {
  }
  SetOp(&prog, "dropout", "Dropout", {"a"}, {"b"}, false);
  SetOp(&prog, "dequantize", "Dequantize", {"c"}, {"d"}, true);
-  SetOp(&prog, "matmul", "Matmul", {"b", "d"}, {"e"}, true, true);
-  SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, false);
+  SetOp(&prog, "matmul", "Matmul", {"b", "d"}, {"e"}, true, "int8");
+  SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, "float32");

  return prog;
 }
@@ -590,8 +591,8 @@ ProgramDesc BuildProgramDescElementwiseAdd() {
  SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true);
  SetOp(&prog, "dequantize", "Dequantize2", {"c"}, {"d"}, true);
  SetOp(&prog, "elementwise_add", "ElementwiseAdd", {"b", "d"}, {"e"}, true,
-        true);
-  SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, false);
+        "int8");
+  SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, "float32");

  return prog;
 }

--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc
@@ -32,11 +32,19 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const {
                    n->id()) != excluded_ids_list.end())
        continue;
      auto* op = n->Op();
-      if (op->HasAttr("use_quantizer") || op->HasProtoAttr("use_quantizer")) {
+      if (op->HasAttr("mkldnn_data_type") ||
+          op->HasProtoAttr("mkldnn_data_type")) {
+        // use_quantizer is no longer used
+        // assign value for compatibility
+        if (op->GetAttrIfExists<bool>("use_quantizer")) {
+          op->SetAttr("mkldnn_data_type", std::string("int8"));
+        }
        if (op_types_list.empty()) {
+          op->SetAttr("mkldnn_data_type", std::string("int8"));
          op->SetAttr("use_quantizer", true);
        } else if (std::find(op_types_list.begin(), op_types_list.end(),
                             op->Type()) != op_types_list.end()) {
+          op->SetAttr("mkldnn_data_type", std::string("int8"));
          op->SetAttr("use_quantizer", true);
        }
      }

--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc
@@ -15,7 +15,7 @@
 #include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h"

 #include <gtest/gtest.h>
-#include <boost/logic/tribool.hpp>
+#include "paddle/fluid/platform/mkldnn_helper.h"

 namespace paddle {
 namespace framework {
@@ -24,13 +24,11 @@ namespace ir {
 void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
           const std::vector<std::string>& inputs,
           const std::vector<std::string>& outputs,
-           boost::tribool use_quantizer) {
+           const std::string& mkldnn_data_type = "float32") {
  auto* op = prog->MutableBlock(0)->AppendOp();

  op->SetType(type);
-
-  if (!boost::indeterminate(use_quantizer))
-    op->SetAttr("use_quantizer", use_quantizer);
+  op->SetAttr("mkldnn_data_type", mkldnn_data_type);

  if (type == "conv2d") {
    op->SetAttr("name", name);
@@ -50,7 +48,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
  op->SetOutput("Out", {outputs[0]});
 }

-// operator                      use_quantizer
+// operator                      mkldnn_data_type
 // ---------------------------------------
 // (a,b)->concat->c              none
 // (c,weights,bias)->conv->f     false
@@ -71,19 +69,19 @@ ProgramDesc BuildProgramDesc() {
    }
  }

-  SetOp(&prog, "concat", "concat1", {"a", "b"}, {"c"}, boost::indeterminate);
-  SetOp(&prog, "conv2d", "conv1", {"c", "weights", "bias"}, {"f"}, false);
-  SetOp(&prog, "relu", "relu1", {"f"}, {"g"}, boost::indeterminate);
-  SetOp(&prog, "pool2d", "pool1", {"g"}, {"h"}, false);
-  SetOp(&prog, "conv2d", "conv2", {"h", "weights2", "bias2"}, {"k"}, false);
-  SetOp(&prog, "pool2d", "pool2", {"k"}, {"l"}, false);
+  SetOp(&prog, "concat", "concat1", {"a", "b"}, {"c"}, "float32");
+  SetOp(&prog, "conv2d", "conv1", {"c", "weights", "bias"}, {"f"}, "float32");
+  SetOp(&prog, "relu", "relu1", {"f"}, {"g"}, "float32");
+  SetOp(&prog, "pool2d", "pool1", {"g"}, {"h"}, "float32");
+  SetOp(&prog, "conv2d", "conv2", {"h", "weights2", "bias2"}, {"k"}, "float32");
+  SetOp(&prog, "pool2d", "pool2", {"k"}, {"l"}, "float32");

  return prog;
 }

 void MainTest(std::initializer_list<std::string> quantize_enabled_op_types,
              std::initializer_list<int> quantize_excluded_op_ids,
-              unsigned expected_use_quantizer_true_count) {
+              unsigned expected_int8_data_type_count) {
  auto prog = BuildProgramDesc();

  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
@@ -96,38 +94,34 @@ void MainTest(std::initializer_list<std::string> quantize_enabled_op_types,

  graph.reset(pass->Apply(graph.release()));

-  unsigned use_quantizer_true_count = 0;
+  unsigned int8_data_type_count = 0;

  for (auto* node : graph->Nodes()) {
    if (node->IsOp()) {
-      auto* op = node->Op();
-      if (op->HasAttr("use_quantizer") &&
-          BOOST_GET_CONST(bool, op->GetAttr("use_quantizer"))) {
-        ++use_quantizer_true_count;
+      if (platform::HasOpINT8DataType(node->Op())) {
+        ++int8_data_type_count;
      }
    }
  }

-  EXPECT_EQ(use_quantizer_true_count, expected_use_quantizer_true_count);
+  EXPECT_EQ(int8_data_type_count, expected_int8_data_type_count);
 }

-void DefaultAttrTest(unsigned expected_use_quantizer_true_count) {
+void DefaultAttrTest(unsigned expected_int8_data_type_count) {
  auto prog = BuildProgramDesc();
  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
  auto pass = PassRegistry::Instance().Get("cpu_quantize_placement_pass");
  graph.reset(pass->Apply(graph.release()));

-  unsigned use_quantizer_true_count = 0;
+  unsigned int8_data_type_count = 0;
  for (auto* node : graph->Nodes()) {
    if (node->IsOp()) {
-      auto* op = node->Op();
-      if (op->HasAttr("use_quantizer") &&
-          BOOST_GET_CONST(bool, op->GetAttr("use_quantizer"))) {
-        ++use_quantizer_true_count;
+      if (platform::HasOpINT8DataType(node->Op())) {
+        ++int8_data_type_count;
      }
    }
  }
-  EXPECT_EQ(use_quantizer_true_count, expected_use_quantizer_true_count);
+  EXPECT_EQ(int8_data_type_count, expected_int8_data_type_count);
 }

 TEST(QuantizerPlacementPass, enabled_pool) { MainTest({"pool2d"}, {}, 2); }
@@ -137,13 +131,13 @@ TEST(QuantizerPlacementPass, enabled_conv_excluded_one) {
 }

 TEST(QuantizerPlacementPass, excluded_none) {
-  // 2 conv + 2 pool
-  MainTest({}, {}, 4);
+  // all operators quantized
+  MainTest({}, {}, 6);
 }

 TEST(QuantizerPlacementPass, default_attr_value) {
-  // 2 conv + 2 pool
-  DefaultAttrTest(4);
+  //  all operators quantized
+  DefaultAttrTest(6);
 }

 }  // namespace ir

--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1057,4 +1057,5 @@ USE_TRT_CONVERTER(fused_embedding_eltwise_layernorm);
 USE_TRT_CONVERTER(skip_layernorm);
 USE_TRT_CONVERTER(slice);
 USE_TRT_CONVERTER(scale);
+USE_TRT_CONVERTER(stack);
 #endif
--- a/paddle/fluid/inference/api/mkldnn_quantizer.cc
+++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc
@@ -27,6 +27,7 @@
 #include "paddle/fluid/framework/type_defs.h"
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/api/analysis_predictor.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/pretty_log.h"

@@ -50,8 +51,7 @@ bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() {
  using VariableNameMap = std::map<std::string, std::vector<std::string>>;
  std::map<std::string, std::map<std::string, LoDTensor>> gathered_data;
  for (const auto* op : predictor_.inference_program_->Block(0).AllOps()) {
-    if (op->HasAttr("use_quantizer") &&
-        BOOST_GET_CONST(bool, op->GetAttr("use_quantizer"))) {
+    if (platform::HasOpINT8DataType(op)) {
      const VariableNameMap& connections_in = op->Inputs();
      const VariableNameMap& connections_out = op->Outputs();


--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -3,8 +3,8 @@ nv_library(tensorrt_converter
           SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
                batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
                pad_op.cc split_op.cc prelu_op.cc leaky_relu_op.cc gelu_op.cc layer_norm_op.cc multihead_matmul_op.cc
-                shuffle_channel_op.cc swish_op.cc instance_norm_op.cc
-emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc
+                shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc
+                emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc
           DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)

 nv_test(test_op_converter SRCS test_op_converter.cc DEPS

--- a/paddle/fluid/inference/tensorrt/convert/scale_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/scale_op.cc
@@ -58,6 +58,24 @@ class ScaleOpConverter : public OpConverter {
    TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
                                         0};
    nvinfer1::ILayer* layer = nullptr;
+
+    auto input_dim = input->getDimensions();
+    PADDLE_ENFORCE_GE(input_dim.nbDims, 3,
+                      platform::errors::Fatal(
+                          "Paddle-TRT scale mode only support dimension >= 3"));
+
+    nvinfer1::IShuffleLayer* expand_layer = nullptr;
+    nvinfer1::IShuffleLayer* squeeze_layer = nullptr;
+
+    if (input_dim.nbDims == 3) {
+      // TensorRT scale layer is not supporting input dims < 4 when using
+      // explicit batch
+      expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
+      nvinfer1::Dims4 target_shape(0, 0, 0, 1);  // expand 1 dims
+      expand_layer->setReshapeDimensions(target_shape);
+      input = expand_layer->getOutput(0);
+    }
+
    if (bias_after_scale) {
      layer = TRT_ENGINE_ADD_LAYER(
          engine_, Scale, *input, nvinfer1::ScaleMode::kUNIFORM,
@@ -73,6 +91,18 @@ class ScaleOpConverter : public OpConverter {
          power_weights.get(), scale_weights.get(), power_weights.get());
    }

+    PADDLE_ENFORCE_EQ(layer != nullptr, true,
+                      platform::errors::Fatal("Create scale layer failed."));
+
+    if (input_dim.nbDims == 3) {
+      // Undo the 3 -> 4 dims expansion applied before the scale layer, which
+      // does not support input dims < 4 when using explicit batch
+      squeeze_layer =
+          TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0)));
+      nvinfer1::Dims3 target_shape(0, 0, 0);  // squeeze the trailing dim
+      squeeze_layer->setReshapeDimensions(target_shape);
+      layer = static_cast<nvinfer1::ILayer*>(squeeze_layer);
+    }
    RreplenishLayerAndOutput(layer, "scale", {out_name}, test_mode);
  }
 };

--- a/paddle/fluid/inference/tensorrt/convert/stack_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/stack_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <vector>
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+/*
+ * Stack converter from fluid to tensorRT.
+ *
+ * Collects the ITensors named by the op's "X" input, normalizes a negative
+ * "axis" attribute against the rank of the first input, and adds a
+ * StackPluginDynamic layer whose output is bound to the op's "Y" output.
+ * Only the dynamic-shape mode of the engine is handled; static-shape mode
+ * raises an error.
+ */
+class StackOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(4) << "convert fluid stack op to tensorrt stack layer";
+
+    framework::OpDesc op_desc(op, nullptr);
+    auto input = op_desc.Input("X");
+    int input_num = input.size();
+    // inputs[0] is read below, so an empty "X" must be rejected up front.
+    PADDLE_ENFORCE_GE(input_num, 1,
+                      platform::errors::InvalidArgument(
+                          "The stack op expects at least one input in X."));
+
+    // Held in a vector so the buffer is released even when one of the error
+    // paths below throws before the end of the function.
+    std::vector<nvinfer1::ITensor*> inputs;
+    inputs.reserve(input_num);
+    for (int i = 0; i < input_num; ++i) {
+      inputs.push_back(engine_->GetITensor(input[i]));
+    }
+
+    int axis = BOOST_GET_CONST(int, op_desc.GetAttr("axis"));
+    if (axis < 0) {
+      // A negative axis counts from the end of the output, which has one
+      // more dimension than each input.
+      axis = axis + inputs[0]->getDimensions().nbDims + 1;
+    }
+
+    nvinfer1::ILayer* layer = nullptr;
+    if (engine_->with_dynamic_shape()) {
+#if IS_TRT_VERSION_GE(6000)
+      plugin::StackPluginDynamic* plugin =
+          new plugin::StackPluginDynamic(axis, input_num);
+      layer = engine_->AddPluginV2(inputs.data(), input_num, plugin);
+      // Checked with an enforce rather than assert so the test is not
+      // compiled out of release builds.
+      PADDLE_ENFORCE_EQ(layer != nullptr, true,
+                        platform::errors::Fatal("Create stack layer failed."));
+#else
+      PADDLE_THROW(platform::errors::Fatal(
+          "You are running the TRT Dynamic Shape mode, need to confirm that "
+          "your TRT version is no less than 6.0"));
+#endif
+    } else {
+      PADDLE_THROW(platform::errors::Fatal(
+          "You are running the Ernie(Bert) model in static "
+          "shape mode, which is not supported for the time being.\n"
+          "You can use the config.SetTRTDynamicShapeInfo(...) interface"
+          " to set the shape information to run the dynamic shape mode."));
+    }
+    auto output_name = op_desc.Output("Y").front();
+    RreplenishLayerAndOutput(layer, "stack", {output_name}, test_mode);
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(stack, StackOpConverter);
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -86,6 +86,7 @@ struct SimpleOpTypeSetTeller : public Teller {
      "layer_norm",
      "scale",
      "slice",
+      "stack",
  };
 };


--- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
 nv_library(tensorrt_plugin
           SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu
-           prelu_op_plugin.cu  trt_plugin_factory.cc gelu_op_plugin.cu 
+           prelu_op_plugin.cu trt_plugin_factory.cc gelu_op_plugin.cu
           pool_op_plugin.cu swish_op_plugin.cu layer_norm_op_plugin.cu
-           cast_int_plugin.cu
-instance_norm_op_plugin.cu emb_eltwise_layernorm_plugin.cu
-qkv_to_context_plugin.cu skip_layernorm_op_plugin.cu slice_op_plugin.cu hard_swish_op_plugin.cu
+           cast_int_plugin.cu stack_op_plugin.cu
+           instance_norm_op_plugin.cu emb_eltwise_layernorm_plugin.cu
+           qkv_to_context_plugin.cu skip_layernorm_op_plugin.cu slice_op_plugin.cu hard_swish_op_plugin.cu
           DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) 
--- a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cassert>
+#include <cstring>
+#include <vector>
+#include "paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+#if IS_TRT_VERSION_GE(6000)
+// Describes the output shape symbolically: the shape of the first input with
+// one extra dimension, of extent nb_inputs, inserted at position axis_.
+// output_index is not consulted.
+nvinfer1::DimsExprs StackPluginDynamic::getOutputDimensions(
+    int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
+    nvinfer1::IExprBuilder& expr_builder) {
+  const nvinfer1::DimsExprs& first = inputs[0];
+
+  // Start from a copy so that the dimensions in front of axis_ are already in
+  // place.
+  nvinfer1::DimsExprs stacked(first);
+  stacked.nbDims = first.nbDims + 1;
+
+  // Every dimension behind the insertion point moves back by one slot.
+  for (int dst = axis_ + 1; dst <= first.nbDims; ++dst) {
+    stacked.d[dst] = first.d[dst - 1];
+  }
+
+  // The new dimension has one entry per stacked input.
+  stacked.d[axis_] = expr_builder.constant(nb_inputs);
+  return stacked;
+}
+
+// Tells whether the tensor at index pos of in_out (inputs first, then
+// outputs) has a type/format combination this plugin accepts.
+//
+// Slot 0 must be linear float (or half when SUPPORTS_CUDA_FP16 is defined);
+// every later slot must have the same type and format as the slot before it.
+bool StackPluginDynamic::supportsFormatCombination(
+    int pos, const nvinfer1::PluginTensorDesc* in_out, int nb_inputs,
+    int nb_outputs) {
+  PADDLE_ENFORCE_NOT_NULL(
+      in_out, platform::errors::InvalidArgument(
+                  "The input of stack plugin should not be nullptr."));
+
+  PADDLE_ENFORCE_LT(
+      pos, nb_inputs + nb_outputs,
+      platform::errors::InvalidArgument("The pos(%d) should be less than the "
+                                        "num(%d) of the input and the output.",
+                                        pos, nb_inputs + nb_outputs));
+
+  const nvinfer1::PluginTensorDesc& current = in_out[pos];
+
+  if (pos != 0) {
+    // Remaining inputs and the output: follow whatever the previous slot has.
+    const nvinfer1::PluginTensorDesc& previous = in_out[pos - 1];
+    const bool same_type = current.type == previous.type;
+    const bool same_format = current.format == previous.format;
+    return same_type && same_format;
+  }
+
+  // First input: this slot decides the type and format of all the others.
+#ifdef SUPPORTS_CUDA_FP16
+  const bool type_ok = current.type == nvinfer1::DataType::kFLOAT ||
+                       current.type == nvinfer1::DataType::kHALF;
+#else
+  const bool type_ok = current.type == nvinfer1::DataType::kFLOAT;
+#endif
+  const bool is_linear = current.format == nvinfer1::TensorFormat::kLINEAR;
+  return type_ok && is_linear;
+}
+
+// Reports the data type of the single output, which is the type of the first
+// input. index must be 0; nb_inputs is not consulted.
+nvinfer1::DataType StackPluginDynamic::getOutputDataType(
+    int index, const nvinfer1::DataType* input_types, int nb_inputs) const {
+  PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
+                                  "The index should be equal to 0"));
+  const nvinfer1::DataType first_input_type = input_types[0];
+  return first_input_type;
+}
+
+// Copies one base_unit-sized chunk per thread block.
+//
+// blockIdx.x selects which input tensor is read and blockIdx.y selects which
+// chunk of that tensor; the threads of the block walk the chunk in steps of
+// blockDim.x.
+//
+// input:     array of num_stack pointers, one per input tensor.
+// output:    destination buffer; chunk `lead` of input `slot` is written at
+//            element offset (lead * num_stack + slot) * base_unit.
+// num_stack: number of tensors being stacked.
+// base_unit: number of elements in one chunk.
+template <typename T>
+__global__ void StackKernel(const T* const* input, T* output, int num_stack,
+                            int base_unit) {
+  const int slot = blockIdx.x;
+  const int lead = blockIdx.y;
+
+  const int src_base = lead * base_unit;
+  const int dst_base = lead * num_stack * base_unit + slot * base_unit;
+
+  int elem = threadIdx.x;
+  while (elem < base_unit) {
+    output[dst_base + elem] = input[slot][src_base + elem];
+    elem += blockDim.x;
+  }
+}
+
+// Queues the stack operation on `stream`.
+//
+// input_desc / output_desc: descriptors of the inputs and of the output; only
+//     input_desc[0].type and output_desc[0].dims are read.
+// inputs:    host array holding one pointer per stacked tensor; it is copied
+//            to the device buffer in_ptr_gpu_ so the kernel can index it.
+// outputs:   outputs[0] receives the stacked result.
+// workspace: not used.
+//
+// Returns 0 on success and 1 when a CUDA call reported an error. Raises a
+// paddle error for non-positive dimensions, for a stacked dimension that does
+// not match the plugin's num_stack_, or for an unsupported data type.
+int StackPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
+                                const nvinfer1::PluginTensorDesc* output_desc,
+                                const void* const* inputs, void* const* outputs,
+                                void* workspace, cudaStream_t stream) {
+  auto out_dims = output_desc[0].dims;  // e.g. (batch, num_head, seq, seq)
+  auto out_num_dims = out_dims.nbDims;
+
+  // Number of contiguous elements behind the stacked axis.
+  int base_unit = 1;
+  for (int i = axis_ + 1; i < out_num_dims; ++i) {
+    PADDLE_ENFORCE_GT(out_dims.d[i], 0,
+                      platform::errors::InvalidArgument(
+                          "Input dimensions should be greater than 0"));
+    base_unit *= out_dims.d[i];
+  }
+
+  // Number of chunks in front of the stacked axis.
+  int lead_unit = 1;
+  for (int i = 0; i < axis_; ++i) {
+    PADDLE_ENFORCE_GT(out_dims.d[i], 0,
+                      platform::errors::InvalidArgument(
+                          "Input dimensions should be greater than 0"));
+    lead_unit *= out_dims.d[i];
+  }
+
+  // in_ptr_gpu_ was sized for num_stack_ pointers; refuse to copy a different
+  // number of entries into it.
+  const int num_stacks = out_dims.d[axis_];
+  PADDLE_ENFORCE_EQ(
+      num_stacks, num_stack_,
+      platform::errors::InvalidArgument(
+          "The stacked dimension(%d) of the output should be equal to the "
+          "number of inputs(%d) the stack plugin was created with.",
+          num_stacks, num_stack_));
+
+  const cudaError_t copy_status =
+      cudaMemcpyAsync(in_ptr_gpu_, inputs, sizeof(void*) * num_stacks,
+                      cudaMemcpyHostToDevice, stream);
+  if (copy_status != cudaSuccess) {
+    // Do not launch the kernel on a pointer table that was never written.
+    // The error is fetched once so it is reset, as the final return does.
+    (void)cudaGetLastError();
+    return 1;
+  }
+
+  dim3 num_blocks(num_stacks, lead_unit);
+  const int num_threads = 256;
+  auto infer_type = input_desc[0].type;
+
+  if (infer_type == nvinfer1::DataType::kFLOAT) {
+    float* output = static_cast<float*>(outputs[0]);
+    StackKernel<float><<<num_blocks, num_threads, 0, stream>>>(
+        reinterpret_cast<const float* const*>(in_ptr_gpu_), output, num_stacks,
+        base_unit);
+  } else if (infer_type == nvinfer1::DataType::kHALF) {
+#ifdef SUPPORTS_CUDA_FP16
+    __half* output = static_cast<__half*>(outputs[0]);
+    StackKernel<__half><<<num_blocks, num_threads, 0, stream>>>(
+        reinterpret_cast<const __half* const*>(in_ptr_gpu_), output, num_stacks,
+        base_unit);
+#else
+    PADDLE_THROW(platform::errors::Fatal(
+        "The cuda archs you specific should greater than 600."));
+#endif
+  } else {
+    PADDLE_THROW(
+        platform::errors::Fatal("The Stack TRT Plugin's input type only "
+                                "support float or half currently."));
+  }
+  return cudaGetLastError() != cudaSuccess;
+}
+#endif
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <stdio.h>
+#include <cassert>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+#if IS_TRT_VERSION_GE(6000)
+// TensorRT dynamic-shape plugin that stacks num_stack_ input tensors along a
+// new dimension at position axis_.
+class StackPluginDynamic : public DynamicPluginTensorRT {
+ public:
+  // axis:      position of the new dimension in the output.
+  // num_stack: number of input tensors that are stacked.
+  StackPluginDynamic(int axis, int num_stack)
+      : axis_(axis), num_stack_(num_stack) {
+    init();
+  }
+
+  // Rebuilds a plugin from the bytes written by serialize(): axis_ first,
+  // then num_stack_.
+  StackPluginDynamic(void const* serialData, size_t serialLength) {
+    DeserializeValue(&serialData, &serialLength, &axis_);
+    DeserializeValue(&serialData, &serialLength, &num_stack_);
+    init();
+  }
+
+  ~StackPluginDynamic() {}
+
+  // The clone is constructed from axis_ and num_stack_, so it runs init() and
+  // gets a device buffer of its own.
+  nvinfer1::IPluginV2DynamicExt* clone() const override {
+    return new StackPluginDynamic(axis_, num_stack_);
+  }
+
+  // Allocates, on the current CUDA device, the buffer of num_stack_ 64-bit
+  // slots that enqueue() fills with the input pointers.
+  void init() {
+    int device_id = 0;
+    // Without this check a failed query would leave device_id meaningless and
+    // the buffer would be requested on an arbitrary device.
+    const bool got_device = cudaGetDevice(&device_id) == cudaSuccess;
+    PADDLE_ENFORCE_EQ(got_device, true,
+                      platform::errors::Fatal(
+                          "The stack plugin failed to query the current CUDA "
+                          "device."));
+    in_ptr_tensor_.Resize({num_stack_});
+    in_ptr_gpu_ =
+        in_ptr_tensor_.mutable_data<int64_t>(platform::CUDAPlace(device_id));
+  }
+
+  const char* getPluginType() const override { return "stack_plugin"; }
+  int getNbOutputs() const override { return 1; }
+  int initialize() override { return 0; }
+
+  // Size in bytes of what serialize() writes.
+  size_t getSerializationSize() const override {
+    size_t serialize_size = 0;
+
+    serialize_size += SerializedSize(axis_);
+    serialize_size += SerializedSize(num_stack_);
+
+    return serialize_size;
+  }
+
+  // Writes axis_ and then num_stack_; the deserializing constructor reads
+  // them back in the same order.
+  void serialize(void* buffer) const override {
+    SerializeValue(&buffer, axis_);
+    SerializeValue(&buffer, num_stack_);
+  }
+
+  nvinfer1::DimsExprs getOutputDimensions(
+      int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
+      nvinfer1::IExprBuilder& exprBuilder) override;
+
+  bool supportsFormatCombination(int pos,
+                                 const nvinfer1::PluginTensorDesc* inOut,
+                                 int nbInputs, int nbOutputs) override;
+
+  // Nothing to configure.
+  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
+                       int nbInputs,
+                       const nvinfer1::DynamicPluginTensorDesc* out,
+                       int nbOutputs) override {}
+
+  // No TensorRT-managed workspace is requested; the pointer table lives in
+  // in_ptr_tensor_.
+  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
+                          int nbInputs,
+                          const nvinfer1::PluginTensorDesc* outputs,
+                          int nbOutputs) const override {
+    return 0;
+  }
+
+  int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
+              const nvinfer1::PluginTensorDesc* outputDesc,
+              const void* const* inputs, void* const* outputs, void* workspace,
+              cudaStream_t stream) override;
+
+  nvinfer1::DataType getOutputDataType(int index,
+                                       const nvinfer1::DataType* inputTypes,
+                                       int nbInputs) const override;
+
+  void destroy() override { delete this; }
+
+ private:
+  int axis_{0};       // position of the stacked dimension in the output
+  int num_stack_{0};  // number of stacked inputs
+  // Owns the device memory that in_ptr_gpu_ points into.
+  framework::Tensor in_ptr_tensor_;
+  // Device-side table of input pointers, stored as 64-bit integers.
+  int64_t* in_ptr_gpu_{nullptr};
+};
+
+// Plugin creator for StackPluginDynamic: builds it either from a field
+// collection or from serialized bytes.
+class StackPluginV2Creator : public nvinfer1::IPluginCreator {
+ public:
+  StackPluginV2Creator() {}
+  const char* getPluginName() const override { return "stack_plugin"; }
+
+  const char* getPluginVersion() const override { return "1"; }
+
+  const nvinfer1::PluginFieldCollection* getFieldNames() override {
+    return &field_collection_;
+  }
+
+  // Builds a plugin from the fields "axis" and "num_stack"; each one that is
+  // missing from fc keeps the value -1. Any other field name raises a paddle
+  // error. The `name` argument is not used.
+  nvinfer1::IPluginV2* createPlugin(
+      const char* name, const nvinfer1::PluginFieldCollection* fc) override {
+    int axis = -1;
+    int num_stack = -1;
+
+    for (int i = 0; i < fc->nbFields; ++i) {
+      // Named field_name so that it does not shadow the `name` parameter.
+      const std::string field_name(fc->fields[i].name);
+      if (field_name == "axis") {
+        axis = static_cast<const int*>(fc->fields[i].data)[0];
+      } else if (field_name == "num_stack") {
+        num_stack = static_cast<const int*>(fc->fields[i].data)[0];
+      } else {
+        PADDLE_THROW(platform::errors::Fatal(
+            "Meet an unknown plugin field '" + field_name +
+            "' when creating stack op plugin."));
+      }
+    }
+    return new StackPluginDynamic(axis, num_stack);
+  }
+
+  // Builds a plugin from bytes produced by StackPluginDynamic::serialize().
+  // The `name` argument is not used.
+  nvinfer1::IPluginV2* deserializePlugin(const char* name,
+                                         const void* serial_data,
+                                         size_t serial_length) override {
+    auto plugin = new StackPluginDynamic(serial_data, serial_length);
+    return plugin;
+  }
+
+  void setPluginNamespace(const char* lib_namespace) override {
+    plugin_namespace_ = lib_namespace;
+  }
+
+  const char* getPluginNamespace() const override {
+    return plugin_namespace_.c_str();
+  }
+
+ private:
+  std::string plugin_namespace_;
+  std::string plugin_name_;
+  // Empty collection: no field names are advertised through getFieldNames().
+  nvinfer1::PluginFieldCollection field_collection_{0, nullptr};
+  std::vector<nvinfer1::PluginField> plugin_attributes_;
+};
+// Registers StackPluginV2Creator through the plugin registration macro.
+REGISTER_TRT_PLUGIN_V2(StackPluginV2Creator);
+#endif
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc
@@ -90,7 +90,6 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {

  config.SwitchUseFeedFetchOps(false);

-  int head_number = 12;
  int batch = 1;
  int min_seq_len = 1;
  int max_seq_len = 128;
@@ -104,17 +103,17 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {
      {"read_file_0.tmp_0", min_shape},
      {"read_file_0.tmp_1", min_shape},
      {"read_file_0.tmp_2", min_shape},
-      {"stack_0.tmp_0", {batch, head_number, min_seq_len, min_seq_len}}};
+      {"matmul_0.tmp_0", {batch, min_seq_len, min_seq_len}}};
  std::map<std::string, std::vector<int>> max_input_shape = {
      {"read_file_0.tmp_0", max_shape},
      {"read_file_0.tmp_1", max_shape},
      {"read_file_0.tmp_2", max_shape},
-      {"stack_0.tmp_0", {batch, head_number, max_seq_len, max_seq_len}}};
+      {"matmul_0.tmp_0", {batch, max_seq_len, max_seq_len}}};
  std::map<std::string, std::vector<int>> opt_input_shape = {
      {"read_file_0.tmp_0", opt_shape},
      {"read_file_0.tmp_1", opt_shape},
      {"read_file_0.tmp_2", opt_shape},
-      {"stack_0.tmp_0", {batch, head_number, opt_seq_len, opt_seq_len}}};
+      {"matmul_0.tmp_0", {batch, opt_seq_len, opt_seq_len}}};

  auto precision = AnalysisConfig::Precision::kFloat32;
  if (with_fp16) {

--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc
@@ -90,7 +90,6 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {

  config.SwitchUseFeedFetchOps(false);

-  int head_number = 12;
  int batch = 1;
  int min_seq_len = 1;
  int max_seq_len = 128;
@@ -104,17 +103,17 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {
      {"read_file_0.tmp_0", min_shape},
      {"read_file_0.tmp_1", min_shape},
      {"read_file_0.tmp_2", min_shape},
-      {"stack_0.tmp_0", {batch, head_number, min_seq_len, min_seq_len}}};
+      {"matmul_0.tmp_0", {batch, min_seq_len, min_seq_len}}};
  std::map<std::string, std::vector<int>> max_input_shape = {
      {"read_file_0.tmp_0", max_shape},
      {"read_file_0.tmp_1", max_shape},
      {"read_file_0.tmp_2", max_shape},
-      {"stack_0.tmp_0", {batch, head_number, max_seq_len, max_seq_len}}};
+      {"matmul_0.tmp_0", {batch, max_seq_len, max_seq_len}}};
  std::map<std::string, std::vector<int>> opt_input_shape = {
      {"read_file_0.tmp_0", opt_shape},
      {"read_file_0.tmp_1", opt_shape},
      {"read_file_0.tmp_2", opt_shape},
-      {"stack_0.tmp_0", {batch, head_number, opt_seq_len, opt_seq_len}}};
+      {"matmul_0.tmp_0", {batch, opt_seq_len, opt_seq_len}}};

  auto precision = AnalysisConfig::Precision::kFloat32;
  if (with_fp16) {

--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -199,7 +199,7 @@ $$out = x - \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
 UNUSED constexpr char SqrtDoc[] = R"DOC(
 Sqrt Activation Operator.

-.. math:: out=\sqrt x=x^{1/2}
+.. math:: out=\\sqrt{x}=x^{1/2}

 **Note**:
  input value must be greater than or equal to zero.
@@ -211,7 +211,7 @@ Rsqrt Activation Operator.

 Please make sure input is legal in case of numeric errors.

-$$out = \frac{1}{\sqrt{x}}$$
+$$out = \\frac{1}{\\sqrt{x}}$$

 )DOC";


--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -122,12 +122,16 @@ class ConcatOpMaker : public framework::OpProtoAndCheckerMaker {
             "It has higher priority than Attr(axis). "
             "The shape of AxisTensor must be [1].")
        .AsDispensable();
-    AddAttr<bool>("use_quantizer",
-                  "(bool, default false) "
-                  "Set to true for operators that should be quantized and use "
-                  "int8 kernel. "
-                  "Only used on CPU.")
+    AddAttr<bool>(
+        "use_quantizer",
+        "(bool, default false) "
+        "This parameter is no longer used. Use 'mkldnn_data_type' instead.")
        .SetDefault(false);
+    AddAttr<std::string>(
+        "mkldnn_data_type",
+        "(string, default \"float32\"). Data type of mkldnn kernel")
+        .SetDefault("float32")
+        .InEnum({"float32", "int8", "bfloat16"});
    AddComment(R"DOC(
 Concat Operator.


--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -279,12 +279,16 @@ void Conv2DOpMaker::Make() {
  AddAttr<bool>("use_mkldnn",
                "(bool, default false) Only used in mkldnn kernel")
      .SetDefault(false);
-  AddAttr<bool>("use_quantizer",
-                "(bool, default false) "
-                "Set to true for operators that should be quantized and use "
-                "int8 kernel. "
-                "Only used on CPU.")
+  AddAttr<bool>(
+      "use_quantizer",
+      "(bool, default false) "
+      "This parameter is no longer used. Use 'mkldnn_data_type' instead.")
      .SetDefault(false);
+  AddAttr<std::string>(
+      "mkldnn_data_type",
+      "(string, default \"float32\"). Data type of mkldnn kernel")
+      .SetDefault("float32")
+      .InEnum({"float32", "int8", "bfloat16"});
  AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel")
      .SetDefault(false);
  AddAttr<bool>("fuse_brelu",

--- a/paddle/fluid/operators/detection/prior_box_op.cc
+++ b/paddle/fluid/operators/detection/prior_box_op.cc
@@ -14,6 +14,8 @@ limitations under the License. */

 #include "paddle/fluid/operators/detection/prior_box_op.h"

+#include <string>
+
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
@@ -218,12 +220,16 @@ class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<bool>("use_mkldnn",
                  "(bool, default false) Only used in mkldnn kernel")
        .SetDefault(false);
-    AddAttr<bool>("use_quantizer",
-                  "(bool, default false) "
-                  "Set to true for operators that should be quantized and use "
-                  "int8 kernel. "
-                  "Only used on CPU.")
+    AddAttr<bool>(
+        "use_quantizer",
+        "(bool, default false) "
+        "This parameter is no longer used. Use 'mkldnn_data_type' instead.")
        .SetDefault(false);
+    AddAttr<std::string>(
+        "mkldnn_data_type",
+        "(string, default \"float32\"). Data type of mkldnn kernel")
+        .SetDefault("float32")
+        .InEnum({"float32", "int8", "bfloat16"});
    AddComment(R"DOC(
 Prior box operator
 Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm.

--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -140,12 +140,17 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
        .SetDefault("");
    AddAttr<std::string>("y_data_format", "This parameter is no longer used.")
        .SetDefault("");
-    /* int8 parameters */
-    AddAttr<bool>("use_quantizer",
-                  "(bool, default false) "
-                  "Set to true for operators that should be quantized and use "
-                  "int8 kernel. Only used on CPU.")
+    AddAttr<bool>(
+        "use_quantizer",
+        "(bool, default false) "
+        "This parameter is no longer used. Use 'mkldnn_data_type' instead.")
        .SetDefault(false);
+    AddAttr<std::string>(
+        "mkldnn_data_type",
+        "(string, default \"float32\"). Data type of mkldnn kernel")
+        .SetDefault("float32")
+        .InEnum({"float32", "int8", "bfloat16"});
+    /* int8 parameters */
    AddAttr<float>("Scale_x",
                   "(float, default 1.0f), The quantize scale of X tensor")
        .SetDefault(1.0f);

--- a/paddle/fluid/operators/fc_op.cc
+++ b/paddle/fluid/operators/fc_op.cc
@@ -142,13 +142,17 @@ class FCOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape,
                  "Skip calling InferShape() function in the runtime.")
        .SetDefault(true);
-    /* int8 parameters */
-    AddAttr<bool>("use_quantizer",
-                  "(bool, default false) "
-                  "Set to true for operators that should be quantized and use "
-                  "int8 kernel. "
-                  "Only used on CPU.")
+    AddAttr<bool>(
+        "use_quantizer",
+        "(bool, default false) "
+        "This parameter is no longer used. Use 'mkldnn_data_type' instead.")
        .SetDefault(false);
+    AddAttr<std::string>(
+        "mkldnn_data_type",
+        "(string, default \"float32\"). Data type of mkldnn kernel")
+        .SetDefault("float32")
+        .InEnum({"float32", "int8", "bfloat16"});
+    /* int8 parameters */
    AddAttr<float>("Scale_in",
                   "(float, default 1.0f), The quantize scale of input data")
        .SetDefault(1.0f);

--- a/paddle/fluid/operators/matmul_op.cc
+++ b/paddle/fluid/operators/matmul_op.cc
@@ -535,13 +535,17 @@ class MatMulOpMaker : public framework::OpProtoAndCheckerMaker {
        R"DOC(When MKLDNN MatMul_transpose_reshape fuse activated, "
              "it's a axis atribute of fused transpose for `Out` output.)DOC")
        .SetDefault({});
-    /* int8 parameters */
-    AddAttr<bool>("use_quantizer",
-                  "(bool, default false) "
-                  "Set to true for operators that should be quantized and use "
-                  "int8 kernel. "
-                  "Only used on CPU.")
+    AddAttr<bool>(
+        "use_quantizer",
+        "(bool, default false) "
+        "This parameter is no longer used. Use 'mkldnn_data_type' instead.")
        .SetDefault(false);
+    AddAttr<std::string>(
+        "mkldnn_data_type",
+        "(string, default \"float32\"). Data type of mkldnn kernel")
+        .SetDefault("float32")
+        .InEnum({"float32", "int8", "bfloat16"});
+    /* int8 parameters */
    AddAttr<float>("Scale_x",
                   "(float, default 1.0f), The quantize scale of X tensor")
        .SetDefault(1.0f);

--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -306,12 +306,16 @@ void Pool2dOpMaker::Make() {
  AddAttr<bool>("use_mkldnn",
                "(bool) Only used in mkldnn kernel. Default False")
      .SetDefault(false);
-  AddAttr<bool>("use_quantizer",
-                "(bool) "
-                "Set to true for operators that should be quantized and use "
-                "int8 kernel. "
-                "Only used on CPU. Default False")
+  AddAttr<bool>(
+      "use_quantizer",
+      "(bool, default false) "
+      "This parameter is no longer used. Use 'mkldnn_data_type' instead.")
      .SetDefault(false);
+  AddAttr<std::string>(
+      "mkldnn_data_type",
+      "(string, default \"float32\"). Data type of mkldnn kernel")
+      .SetDefault("float32")
+      .InEnum({"float32", "int8", "bfloat16"});
  AddAttr<std::string>(
      "data_format",
      "(string, default NCHW) Only used in "

--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -431,13 +431,16 @@ class Reshape2OpMaker : public ReshapeOpMaker {
              "XShape is just used to store the shape and lod of X, which will "
              "be used in FlattenGradOp.")
        .AsIntermediate();
-    /* int8 parameters */
-    AddAttr<bool>("use_quantizer",
-                  "(bool, default false) "
-                  "Set to true for operators that should be quantized and use "
-                  "int8 kernel. "
-                  "Used only on CPU.")
+    AddAttr<bool>(
+        "use_quantizer",
+        "(bool, default false) "
+        "This parameter is no longer used. Use 'mkldnn_data_type' instead.")
        .SetDefault(false);
+    AddAttr<std::string>(
+        "mkldnn_data_type",
+        "(string, default \"float32\"). Data type of mkldnn kernel")
+        .SetDefault("float32")
+        .InEnum({"float32", "int8", "bfloat16"});
  }
 };


--- a/paddle/fluid/operators/squeeze_op.cc
+++ b/paddle/fluid/operators/squeeze_op.cc
@@ -304,6 +304,7 @@ REGISTER_OPERATOR(squeeze2_grad, ops::Squeeze2GradOp,
 REGISTER_OP_CPU_KERNEL(
    squeeze, ops::SqueezeKernel<paddle::platform::CPUDeviceContext, float>,
    ops::SqueezeKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SqueezeKernel<paddle::platform::CPUDeviceContext, bool>,
    ops::SqueezeKernel<paddle::platform::CPUDeviceContext, int>,
    ops::SqueezeKernel<paddle::platform::CPUDeviceContext, int8_t>,
    ops::SqueezeKernel<paddle::platform::CPUDeviceContext, int64_t>);
@@ -311,12 +312,14 @@ REGISTER_OP_CPU_KERNEL(
    squeeze_grad,
    ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, float>,
    ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, bool>,
    ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, int>,
    ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, int8_t>,
    ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
 REGISTER_OP_CPU_KERNEL(
    squeeze2, ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, float>,
    ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, double>,
+    ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, bool>,
    ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, int>,
    ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, int8_t>,
    ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, int64_t>);
@@ -324,6 +327,7 @@ REGISTER_OP_CPU_KERNEL(
    squeeze2_grad,
    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, float>,
    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, bool>,
    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, int>,
    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, int8_t>,
    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, int64_t>);
--- a/paddle/fluid/operators/squeeze_op.cu.cc
+++ b/paddle/fluid/operators/squeeze_op.cu.cc
@@ -21,6 +21,7 @@ REGISTER_OP_CUDA_KERNEL(
    squeeze, ops::SqueezeKernel<paddle::platform::CUDADeviceContext, float>,
    ops::SqueezeKernel<paddle::platform::CUDADeviceContext, double>,
    ops::SqueezeKernel<paddle::platform::CUDADeviceContext, plat::float16>,
+    ops::SqueezeKernel<paddle::platform::CUDADeviceContext, bool>,
    ops::SqueezeKernel<paddle::platform::CUDADeviceContext, int>,
    ops::SqueezeKernel<paddle::platform::CUDADeviceContext, int8_t>,
    ops::SqueezeKernel<paddle::platform::CUDADeviceContext, int64_t>);
@@ -29,6 +30,7 @@ REGISTER_OP_CUDA_KERNEL(
    ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, float>,
    ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, double>,
    ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, plat::float16>,
+    ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, bool>,
    ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, int>,
    ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, int8_t>,
    ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
@@ -36,6 +38,7 @@ REGISTER_OP_CUDA_KERNEL(
    squeeze2, ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, float>,
    ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, double>,
    ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, plat::float16>,
+    ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, bool>,
    ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, int>,
    ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, int8_t>,
    ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, int64_t>);
@@ -44,6 +47,7 @@ REGISTER_OP_CUDA_KERNEL(
    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, float>,
    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, double>,
    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, plat::float16>,
+    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, bool>,
    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, int>,
    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, int8_t>,
    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, int64_t>);
--- a/paddle/fluid/operators/transpose_op.cc
+++ b/paddle/fluid/operators/transpose_op.cc
@@ -108,13 +108,17 @@ class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
        "Defaults to \"NHWC\". Specify the data format of the output data, "
        "the input will be transformed automatically. ")
        .SetDefault("AnyLayout");
-    /* int8 parameters */
-    AddAttr<bool>("use_quantizer",
-                  "(bool, default false) "
-                  "Set to true for operators that should be quantized and use "
-                  "int8 kernel. "
-                  "Only used on CPU.")
+    AddAttr<bool>(
+        "use_quantizer",
+        "(bool, default false) "
+        "This parameter is no longer used. Use 'mkldnn_data_type' instead.")
        .SetDefault(false);
+    AddAttr<std::string>(
+        "mkldnn_data_type",
+        "(string, default \"float32\"). Data type of mkldnn kernel")
+        .SetDefault("float32")
+        .InEnum({"float32", "int8", "bfloat16"});
+    /* int8 parameters */
    AddComment(R"DOC(
 Transpose Operator.


--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -33,6 +33,7 @@ limitations under the License. */
 #include <curand.h>
 #include <thrust/system/cuda/error.h>
 #include <thrust/system_error.h>
+
 #include "paddle/fluid/platform/cuda_error.pb.h"
 #endif  // PADDLE_WITH_CUDA

@@ -69,6 +70,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/type_defs.h"
 #include "paddle/fluid/imperative/type_defs.h"

+DECLARE_int32(call_stack_level);
+
 namespace paddle {
 namespace platform {

@@ -226,9 +229,7 @@ inline std::string SimplifyDemangleStr(std::string str) {
  return str;
 }

-template <typename StrType>
-inline std::string GetTraceBackString(StrType&& what, const char* file,
-                                      int line) {
+inline std::string GetCurrentTraceBackString() {
  static constexpr int TRACE_STACK_LIMIT = 100;
  std::ostringstream sout;

@@ -256,6 +257,13 @@ inline std::string GetTraceBackString(StrType&& what, const char* file,
 #else
  sout << "Windows not support stack backtrace yet.\n";
 #endif
+  return sout.str();
+}
+
+template <typename StrType>
+inline std::string GetErrorSumaryString(StrType&& what, const char* file,
+                                        int line) {
+  std::ostringstream sout;
  sout << "\n----------------------\nError Message "
          "Summary:\n----------------------\n";
  sout << string::Sprintf("%s at (%s:%d)", std::forward<StrType>(what), file,
@@ -264,6 +272,17 @@ inline std::string GetTraceBackString(StrType&& what, const char* file,
  return sout.str();
 }

+template <typename StrType>
+inline std::string GetTraceBackString(StrType&& what, const char* file,
+                                      int line) {
+  // Prepend the C++ call stack only when FLAGS_call_stack_level > 1.
+  std::string cpp_stack;
+  if (FLAGS_call_stack_level > 1) {
+    cpp_stack = GetCurrentTraceBackString();
+  }
+  return cpp_stack + GetErrorSumaryString(what, file, line);
+}
+
 inline bool is_error(bool stat) { return !stat; }

 inline void throw_on_error(bool stat, const std::string& msg) {
@@ -427,7 +446,7 @@ struct EnforceNotMet : public std::exception {
 *
 * Examples:
 *    GET_DATA_SAFELY(ctx.Input<LoDTensor>("X"), "Input", "X", "Mul");
-*/
+ */
 #define GET_DATA_SAFELY(__PTR, __ROLE, __NAME, __OP_TYPE)                   \
  (([&]() -> std::add_lvalue_reference<decltype(*(__PTR))>::type {          \
    auto* __ptr = (__PTR);                                                  \
@@ -463,7 +482,7 @@ struct EnforceNotMet : public std::exception {
 *
 * Examples:
 *    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Mul");
-*/
+ */
 #define OP_INOUT_CHECK(__EXPR, __ROLE, __NAME, __OP_TYPE)                   \
  do {                                                                      \
    PADDLE_ENFORCE_EQ(__EXPR, true, paddle::platform::errors::NotFound(     \
@@ -491,7 +510,7 @@ struct EnforceNotMet : public std::exception {
 * Note: GCC 4.8 cannot select right overloaded function here, so need
 *    to define different functions and macros here, after we upgreade
 *    CI gcc version, we can only define one BOOST_GET macro.
-*/
+ */
 namespace details {

 #define DEFINE_SAFE_BOOST_GET(__InputType, __OutputType, __OutputTypePtr,      \

--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -483,3 +483,28 @@ DEFINE_double(local_exe_sub_scope_limit, 256.0,  // MBytes
 * Note:
 */
 DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
+
+/**
+ * Debug related FLAG
+ * Name: FLAGS_call_stack_level
+ * Since Version: 2.0.0
+ * Value Range: int, default=2
+ * Example:
+ * Note: Used to debug. Determine the call stack to print when error or
+ * exception happens.
+ * If FLAGS_call_stack_level == 0, only the error message summary will be shown
+ * (not implemented yet, see the TODO in the help text below).
+ * If FLAGS_call_stack_level == 1, the python stack and error message summary
+ * will be shown.
+ * If FLAGS_call_stack_level == 2, the python stack, c++ stack, and error
+ * message summary will be shown.
+ */
+DEFINE_int32(
+    call_stack_level, 2,
+    // Adjacent literals are concatenated by the compiler, so every sentence
+    // except the last must end with a trailing space.
+    "Determine the call stack to print when error or exception happens. "
+    // TODO(zhiqiu): implement logic of FLAGS_call_stack_level==0
+    // "If FLAGS_call_stack_level == 0, only the error message summary will be "
+    // "shown. "
+    "If FLAGS_call_stack_level == 1, the python stack and error message "
+    "summary will be shown. "
+    "If FLAGS_call_stack_level == 2, the python stack, c++ stack, and "
+    "error message summary will be shown.");
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -422,6 +422,11 @@ inline std::vector<std::vector<int64_t>> ToMkldnnPadding(
  }
 }

+// Returns true when `op` has its "mkldnn_data_type" attribute equal to "int8"
+// or has a true "use_quantizer" attribute. The second attribute is only
+// queried when the first check fails.
+inline bool HasOpINT8DataType(const paddle::framework::OpDesc* op) {
+  if (op->GetAttrIfExists<std::string>("mkldnn_data_type") == "int8") {
+    return true;
+  }
+  return op->GetAttrIfExists<bool>("use_quantizer");
+}
+
 enum class RNNReorderType { PP_NTC, PP_TNC, NTC_PP, TNC_PP };

 }  // namespace platform

--- a/paddle/fluid/pybind/global_value_getter_setter.cc
+++ b/paddle/fluid/pybind/global_value_getter_setter.cc
@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "paddle/fluid/pybind/global_value_getter_setter.h"
+
 #include <cctype>
 #include <functional>
 #include <string>
@@ -20,6 +21,7 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "gflags/gflags.h"
 #include "paddle/fluid/framework/python_headers.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -35,6 +37,7 @@ DECLARE_bool(cpu_deterministic);
 DECLARE_bool(enable_rpc_profiler);
 DECLARE_int32(multiple_of_cupti_buffer_size);
 DECLARE_bool(reader_queue_speed_test_mode);
+DECLARE_int32(call_stack_level);
 // device management
 DECLARE_int32(paddle_num_threads);
 // executor
@@ -337,14 +340,15 @@ static void RegisterGlobalVarGetterSetter() {
  REGISTER_PUBLIC_GLOBAL_VAR(
      FLAGS_eager_delete_tensor_gb, FLAGS_enable_parallel_graph,
      FLAGS_allocator_strategy, FLAGS_use_system_allocator, FLAGS_check_nan_inf,
-      FLAGS_cpu_deterministic, FLAGS_enable_rpc_profiler,
-      FLAGS_multiple_of_cupti_buffer_size, FLAGS_reader_queue_speed_test_mode,
-      FLAGS_pe_profile_fname, FLAGS_print_sub_graph_dir,
-      FLAGS_fraction_of_cpu_memory_to_use, FLAGS_fuse_parameter_groups_size,
-      FLAGS_fuse_parameter_memory_size, FLAGS_init_allocated_mem,
-      FLAGS_initial_cpu_memory_in_mb, FLAGS_memory_fraction_of_eager_deletion,
-      FLAGS_use_pinned_memory, FLAGS_benchmark, FLAGS_inner_op_parallelism,
-      FLAGS_tracer_profile_fname, FLAGS_paddle_num_threads);
+      FLAGS_call_stack_level, FLAGS_cpu_deterministic,
+      FLAGS_enable_rpc_profiler, FLAGS_multiple_of_cupti_buffer_size,
+      FLAGS_reader_queue_speed_test_mode, FLAGS_pe_profile_fname,
+      FLAGS_print_sub_graph_dir, FLAGS_fraction_of_cpu_memory_to_use,
+      FLAGS_fuse_parameter_groups_size, FLAGS_fuse_parameter_memory_size,
+      FLAGS_init_allocated_mem, FLAGS_initial_cpu_memory_in_mb,
+      FLAGS_memory_fraction_of_eager_deletion, FLAGS_use_pinned_memory,
+      FLAGS_benchmark, FLAGS_inner_op_parallelism, FLAGS_tracer_profile_fname,
+      FLAGS_paddle_num_threads);

 #ifdef PADDLE_WITH_CUDA
  REGISTER_PUBLIC_GLOBAL_VAR(

--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -166,17 +166,34 @@ def __bootstrap__():
    os.environ['OMP_NUM_THREADS'] = str(num_threads)
    sysstr = platform.system()
    read_env_flags = [
-        'check_nan_inf', 'fast_check_nan_inf', 'benchmark',
-        'eager_delete_scope', 'fraction_of_cpu_memory_to_use',
-        'initial_cpu_memory_in_mb', 'init_allocated_mem', 'paddle_num_threads',
-        'dist_threadpool_size', 'eager_delete_tensor_gb',
-        'fast_eager_deletion_mode', 'memory_fraction_of_eager_deletion',
-        'allocator_strategy', 'reader_queue_speed_test_mode',
-        'print_sub_graph_dir', 'pe_profile_fname', 'inner_op_parallelism',
-        'enable_parallel_graph', 'fuse_parameter_groups_size',
-        'multiple_of_cupti_buffer_size', 'fuse_parameter_memory_size',
-        'tracer_profile_fname', 'dygraph_debug', 'use_system_allocator',
-        'enable_unused_var_check', 'free_idle_chunk', 'free_when_no_cache_hit'
+        'check_nan_inf',
+        'fast_check_nan_inf',
+        'benchmark',
+        'eager_delete_scope',
+        'fraction_of_cpu_memory_to_use',
+        'initial_cpu_memory_in_mb',
+        'init_allocated_mem',
+        'paddle_num_threads',
+        'dist_threadpool_size',
+        'eager_delete_tensor_gb',
+        'fast_eager_deletion_mode',
+        'memory_fraction_of_eager_deletion',
+        'allocator_strategy',
+        'reader_queue_speed_test_mode',
+        'print_sub_graph_dir',
+        'pe_profile_fname',
+        'inner_op_parallelism',
+        'enable_parallel_graph',
+        'fuse_parameter_groups_size',
+        'multiple_of_cupti_buffer_size',
+        'fuse_parameter_memory_size',
+        'tracer_profile_fname',
+        'dygraph_debug',
+        'use_system_allocator',
+        'enable_unused_var_check',
+        'free_idle_chunk',
+        'free_when_no_cache_hit',
+        'call_stack_level',
    ]
    if 'Darwin' not in sysstr:
        read_env_flags.append('use_pinned_memory')
@@ -208,12 +225,19 @@ def __bootstrap__():

    if core.is_compiled_with_cuda():
        read_env_flags += [
-            'fraction_of_gpu_memory_to_use', 'initial_gpu_memory_in_mb',
-            'reallocate_gpu_memory_in_mb', 'cudnn_deterministic',
-            'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
-            'cudnn_exhaustive_search', 'selected_gpus', 'sync_nccl_allreduce',
-            'cudnn_batchnorm_spatial_persistent', 'gpu_allocator_retry_time',
-            'local_exe_sub_scope_limit', 'gpu_memory_limit_mb'
+            'fraction_of_gpu_memory_to_use',
+            'initial_gpu_memory_in_mb',
+            'reallocate_gpu_memory_in_mb',
+            'cudnn_deterministic',
+            'enable_cublas_tensor_op_math',
+            'conv_workspace_size_limit',
+            'cudnn_exhaustive_search',
+            'selected_gpus',
+            'sync_nccl_allreduce',
+            'cudnn_batchnorm_spatial_persistent',
+            'gpu_allocator_retry_time',
+            'local_exe_sub_scope_limit',
+            'gpu_memory_limit_mb',
        ]
    core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])
    core.init_glog(sys.argv[0])

--- a/python/paddle/fluid/dygraph/dygraph_to_static/error.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/error.py
@@ -20,13 +20,13 @@ from paddle.fluid.dygraph.dygraph_to_static.origin_info import Location, OriginI
 ERROR_DATA = "Error data about original source code information and traceback."


-def attach_error_data(error):
+def attach_error_data(error, in_runtime=False):
    """
    Attachs error data about original source code information and traceback to an error.

    Args:
        error(Exception): An native error.
-
+        in_runtime(bool): `error` is raised in runtime if in_runtime is True, otherwise in compile time
    Returns:
        An error attached data about original source code information and traceback.
    """
@@ -34,6 +34,8 @@ def attach_error_data(error):
    tb = traceback.extract_tb(e_traceback)[1:]

    error_data = ErrorData(e_type, e_value, tb, global_origin_info_map)
+    error_data.in_runtime = in_runtime
+
    setattr(error, ERROR_DATA, error_data)

    return error
@@ -53,8 +55,6 @@ class TraceBackFrame(OriginInfo):
 class ErrorData(object):
    """
    Error data attached to an exception which is raised in un-transformed code.
-
-    TODO(liym27): Consider the case that op_callstack when error raised from c++ code
    """

    def __init__(self, error_type, error_value, origin_traceback,
@@ -63,6 +63,7 @@ class ErrorData(object):
        self.error_value = error_value
        self.origin_traceback = origin_traceback
        self.origin_info_map = origin_info_map
+        self.in_runtime = False

    def create_exception(self):
        message = self.create_message()
@@ -81,6 +82,12 @@ class ErrorData(object):
        message_lines.append(header_message)
        message_lines.append("")

+        # Simplify error value to improve readability if error is raised in runtime
+        if self.in_runtime:
+            self._simplify_error_value()
+            message_lines.append(str(self.error_value))
+            return '\n'.join(message_lines)
+
        # Step2: Optimizes stack information with source code information of dygraph from user.
        for filepath, lineno, funcname, code in self.origin_traceback:
            loc = Location(filepath, lineno)
@@ -102,3 +109,25 @@ class ErrorData(object):
        message_lines.append(error_message)

        return '\n'.join(message_lines)
+
+    def _simplify_error_value(self):
+        """
+        Simplifies error value to improve readability if error is raised in runtime.
+
+        Every line of the error text up to and including the marker line
+        `outputs = static_func(*inputs)` is dropped, and `self.error_value`
+        is rebuilt as `self.error_type(<remaining text>)`. If the marker line
+        is not present, `self.error_value` is left untouched.
+
+        NOTE(liym27): The op callstack information about transformed static code has been replaced with original dygraph code.
+
+        TODO(liym27):
+            1. Need a more robust way because the code of start_trace may change.
+            2. Set the switch to determine whether to simplify error_value
+        """
+        assert self.in_runtime is True
+
+        error_value_lines = str(self.error_value).split("\n")
+        error_value_lines_strip = [mes.lstrip(" ") for mes in error_value_lines]
+
+        start_trace = "outputs = static_func(*inputs)"
+        if start_trace not in error_value_lines_strip:
+            # list.index() would raise ValueError here and hide the real
+            # runtime error; keep the original error value instead.
+            return
+
+        start_idx = error_value_lines_strip.index(start_trace)
+        error_value_lines = error_value_lines[start_idx + 1:]
+
+        error_value_str = '\n'.join(error_value_lines)
+        self.error_value = self.error_type(error_value_str)
--- a/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py
@@ -19,6 +19,9 @@ import inspect

 import gast

+from paddle.fluid import core
+from paddle.fluid.framework import Program
+
 # NOTE(liym27): Please use `getattr(ast_node, ORIGI_INFO)` instead of . operation to get the original information of ast node.
 ORIGI_INFO = "Original information of source code for ast node."
 ORIGI_INFO_MAP = "Original information map of source code."
@@ -70,6 +73,10 @@ class OriginInfo(object):
            self.location.filepath, self.location.lineno, self.function_name,
            self.source_code.lstrip())

+    def as_frame(self):
+        """
+        Returns this origin info as a tuple
+        (filepath, lineno, function_name, source_code), where the source code
+        has its leading whitespace stripped.
+        """
+        loc = self.location
+        frame = (loc.filepath, loc.lineno, self.function_name,
+                 self.source_code.lstrip())
+        return frame
+

 class OriginInfoAttacher(gast.NodeTransformer):
    """
@@ -249,3 +256,63 @@ def ast_walk(transformed_node, static_node):
                    if isinstance(d_item, gast.AST):
                        transformed_node_list.append(d_item)
                        static_node_list.append(s_item)
+
+
+def update_op_callstack_with_origin_info(program):
+    """
+    Replaces op callstack information about transformed static code with original dygraph code.
+
+    Args:
+        program(Program): Program whose ops carry a creation callstack
+            attribute. The attribute of each such op is rewritten.
+
+    Returns:
+        The same `program` object that was passed in.
+    """
+    # Function-scope import so this function does not depend on a new
+    # module-level import.
+    import re
+
+    assert isinstance(program, Program)
+
+    # Header line of one frame, e.g. `  File "path/to/file.py", line 10, in func_1`.
+    # A regular expression is used instead of splitting on "," so that file
+    # paths containing commas are parsed correctly.
+    frame_header = re.compile(r' *File "(.*)", line (\d+), in (.*)', re.DOTALL)
+
+    def get_new_op_callstack(callstack):
+        """
+        An example of callstack:
+
+            File "path1/to/file.py", line 10, in func_1
+                y = fluid.layers.fill_constant(x, shape=[1], dtype="int32")
+            File "path2/to/file.py", line 740, in fill_constant
+                stop_gradient=True)
+            File "path3/to/file.py", line 43, in append_op
+              return self.main_program.current_block().append_op(*args, **kwargs)
+            File "path4/to/file.py", line 2811, in append_op
+              attrs=kwargs.get("attrs", None))
+            File "path5/to/file.py", line 1919, in __init__
+              for frame in traceback.extract_stack():
+
+        Frames are (header line, code line) pairs. A header that does not have
+        the form shown above is left unchanged together with its code line,
+        because rewriting the callstack only improves error readability and
+        must not abort program construction.
+        """
+
+        # Only complete (header, code) pairs are visited; a trailing unpaired
+        # line, if any, is kept as is.
+        for i in range(0, len(callstack) - 1, 2):
+            matched = frame_header.match(callstack[i])
+            if matched is None:
+                continue
+
+            filepath = matched.group(1)
+            lineno = int(matched.group(2))
+            funcname = matched.group(3)
+            code = callstack[i + 1].lstrip(" ")
+
+            loc = Location(filepath, lineno)
+            dygraph_func_info = global_origin_info_map.get(loc.line_location)
+            if dygraph_func_info:
+                filepath, lineno, funcname, code = \
+                    dygraph_func_info.as_frame()
+
+            callstack[i] = '  File "{}", line {}, in {}'.format(
+                filepath, lineno, funcname)
+            callstack[i + 1] = '    {}'.format(code)
+
+        return callstack
+
+    op_maker = core.op_proto_and_checker_maker
+    callstack_var_name = op_maker.kOpCreationCallstackAttrName()
+
+    for block in program.blocks:
+        for op in block.ops:
+            if op.has_attr(callstack_var_name):
+                callstack = op.attr(callstack_var_name)
+
+                callstack = get_new_op_callstack(callstack)
+
+                op._set_attr(callstack_var_name, callstack)
+
+    return program
--- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
@@ -130,8 +130,6 @@ class PartialProgramLayer(layers.Layer):
        self._check_params_all_inited(main_program)
        # 2. Prune the parameters not used anywhere in the program.
        self._prune_unused_params(main_program)
-        # 3. Remove op's python call stack with redundant low-level error messages.
-        main_program = self._remove_op_call_stack(main_program)

        return main_program


--- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
@@ -37,6 +37,7 @@ from paddle.fluid.dygraph.base import param_guard
 from paddle.fluid.data_feeder import check_type
 from paddle.fluid.dygraph.dygraph_to_static.partial_program import partial_program_from
 from paddle.fluid.dygraph.dygraph_to_static.origin_info import attach_origin_info, create_and_update_origin_info_map
+from paddle.fluid.dygraph.dygraph_to_static.origin_info import update_op_callstack_with_origin_info
 from paddle.fluid.dygraph.dygraph_to_static.error import attach_error_data, ERROR_DATA

 __all__ = ['ProgramTranslator', 'convert_to_static']
@@ -304,6 +305,8 @@ class ConcreteProgram(object):
                                  (tuple, list)) and outputs is not None:
                    outputs = [outputs]

+        main_program = update_op_callstack_with_origin_info(main_program)
+
        return ConcreteProgram(
            inputs=inputs,
            outputs=outputs,
@@ -516,7 +519,7 @@ class ProgramTranslator(object):
            # 2. If e raised in runtime, e should be attached to ERROR_DATA here.
            if not hasattr(e, ERROR_DATA):
                # runtime error
-                attach_error_data(e)
+                attach_error_data(e, in_runtime=True)
            raise

    def get_func(self, dygraph_func):

--- a/python/paddle/fluid/dygraph/jit.py
+++ b/python/paddle/fluid/dygraph/jit.py
@@ -176,7 +176,17 @@ def _declarative_(dygraph_func):
            error_data = getattr(e, ERROR_DATA, None)
            if error_data:
                new_exception = error_data.create_exception()
-                raise new_exception
+                if six.PY3:
+                    # NOTE(liym27):
+                    # 1. Why `raise new_exception from None`?
+                    #   In Python 3, by default, an new exception is raised with trace information of the caught exception.
+                    #   This only raises new_exception and hides unwanted implementation details from tracebacks of the
+                    #   caught exception.
+                    # 2. Use exec to bypass syntax error checking in Python 2.
+
+                    six.exec_("raise new_exception from None")
+                else:
+                    raise new_exception
            else:
                raise


--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -25,11 +25,13 @@ import six
 from .data_feeder import convert_dtype
 from .framework import Program, default_main_program, Variable, Operator, convert_np_dtype_to_dtype_
 from . import core
+from . import unique_name
 from . import compiler
 from .. import compat as cpt
 from .trainer_factory import TrainerFactory
 from .trainer_factory import FetchHandlerMonitor
 import copy
+from .incubate.checkpoint import auto_checkpoint as acp

 __all__ = ['Executor', 'global_scope', 'scope_guard']

@@ -559,6 +561,9 @@ class Executor(object):
        self._closed = False
        self.pruned_program_scope_caches = dict()

+        self._auto_checkpoint_name = unique_name.generate(
+            "__auto_checkpoint_executor__")
+
    def _get_scope_cache(self, program_cache_key):
        return self.scope_caches.get(program_cache_key, None)

@@ -1152,6 +1157,8 @@ class Executor(object):

        compiled = isinstance(program, compiler.CompiledProgram)

+        acp._auto_checkpoint(self, program)
+
        # For backward compatibility, run directly.
        if not compiled:
            # In distributed training, the compiled program is saved in Program._graph

--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -2385,12 +2385,29 @@ class Operator(object):
    def _is_optimize_op(self):
        op_maker = core.op_proto_and_checker_maker
        OPTIMIZE = core.op_proto_and_checker_maker.OpRole.Optimize
+
+        if not self.desc.has_attr(op_maker.kOpRoleAttrName()):
+            return False
+
        op_role = self.desc.attr(op_maker.kOpRoleAttrName())
        if op_role & int(OPTIMIZE):
            return True
-        else:
+
+        return False
+
+    def _is_backward_op(self):
+        op_maker = core.op_proto_and_checker_maker
+        BACKWARD = core.op_proto_and_checker_maker.OpRole.Backward
+
+        if not self.desc.has_attr(op_maker.kOpRoleAttrName()):
            return False

+        op_role = self.desc.attr(op_maker.kOpRoleAttrName())
+        if op_role & int(BACKWARD):
+            return True
+
+        return False
+

 class Block(object):
    """
@@ -3942,6 +3959,10 @@ class Program(object):
        # appending gradients times
        self._appending_grad_times = 0

+        # identifier for auto checkpoint
+        self._auto_checkpoint_name = unique_name.generate(
+            "__auto_checkpoint_program__")
+
        # compiled program, i.e. Graph
        self._graph = None


--- a/python/paddle/fluid/incubate/checkpoint/__init__.py
+++ b/python/paddle/fluid/incubate/checkpoint/__init__.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py
+++ b/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py
--- a/python/paddle/fluid/incubate/checkpoint/checkpoint_saver.py
+++ b/python/paddle/fluid/incubate/checkpoint/checkpoint_saver.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..fleet.utils.fs import FS, LocalFS
+from ..fleet.utils.hdfs import HDFSClient
+from ...compiler import CompiledProgram
+
+
+class SerializableBase(object):
+    """
+    Interface for objects that can be written to and restored from a path.
+    Both methods raise NotImplementedError here.
+    """
+
+    def serialize(self, path):
+        """Writes this object under `path`."""
+        raise NotImplementedError()
+
+    def deserialize(self, path):
+        """Restores this object from `path`."""
+        raise NotImplementedError()
+
+
+class PaddleModel(SerializableBase):
+    """
+    Serializable wrapper that saves and loads the persistables of a program
+    through the given executor.
+    """
+
+    def __init__(self, exe, program):
+        self._exe = exe
+        self._origin_program = program
+        # For a CompiledProgram, the program stored in its `_program`
+        # attribute is the one that gets saved and loaded.
+        if isinstance(program, CompiledProgram):
+            self._program = program._program
+        else:
+            self._program = program
+
+        self._file_name = "_paddle_fleet_param__"
+
+    def _io_kwargs(self, path):
+        """Keyword arguments shared by save_persistables and load_persistables."""
+        return dict(
+            executor=self._exe,
+            dirname=path,
+            main_program=self._program,
+            filename=self._file_name)
+
+    def serialize(self, path):
+        """Saves the persistables of the wrapped program into `path`."""
+        from ...io import save_persistables
+        save_persistables(**self._io_kwargs(path))
+
+    def deserialize(self, path):
+        """Loads the persistables of the wrapped program from `path`."""
+        from ...io import load_persistables
+        load_persistables(**self._io_kwargs(path))
+
+
+class CheckpointSaver(object):
+    """
+    Saves and loads numbered checkpoint directories below a root path.
+
+    A checkpoint lives at `<path>/__paddle_checkpoint__.<no>`. Access to the
+    target location goes through the `fs` object given to the constructor; a
+    `LocalFS` is used for the local cache directory when `fs` reports that
+    upload/download is needed.
+    """
+
+    def __init__(self, fs):
+        self._fs = fs
+        self._checkpoint_prefix = "__paddle_checkpoint__"
+
+    def save_checkpoint(self,
+                        path,
+                        slists,
+                        trainer_id=None,
+                        local_cache_path=".cache"):
+        """
+        Serialize objects in slists to path.
+
+        Args:
+            path: Root directory of the checkpoints; created if missing.
+            slists: Objects providing `serialize(path)`.
+            trainer_id: If not None, appended to the local cache directory name.
+            local_cache_path: Local directory used as a staging area when the
+                filesystem needs upload/download.
+
+        Returns:
+            A tuple (real_path, checkpoint_no) of the saved checkpoint.
+        """
+        if not self._fs.is_exist(path):
+            self._fs.mkdirs(path)
+        else:
+            assert self._fs.is_dir(path), "path:{} must be a directory".format(
+                path)
+
+        # The new checkpoint number is one past the largest existing one;
+        # numbering starts at 0.
+        max_no = self._get_last_checkpoint_no(path)
+        if max_no < 0:
+            max_no = -1
+        max_no += 1
+
+        real_path = "{}/{}.{}".format(path, self._checkpoint_prefix, max_no)
+        tmp_path = "{}.tmp".format(real_path)
+        saved_path = tmp_path
+
+        local_fs = LocalFS()
+
+        cache_path = None
+        if self._fs.need_upload_download():
+            cache_path = "{}/{}.{}.saved_cache".format(
+                local_cache_path, self._checkpoint_prefix, max_no)
+
+            if trainer_id is not None:
+                cache_path = "{}.{}".format(cache_path, trainer_id)
+
+            if not local_fs.is_exist(cache_path):
+                local_fs.mkdirs(cache_path)
+            else:
+                assert local_fs.is_dir(cache_path), \
+                    "cache path:{} must be a directory".format(cache_path)
+
+            saved_path = cache_path
+
+        for s in slists:
+            s.serialize(saved_path)
+
+        if self._fs.need_upload_download():
+            self._fs.delete(tmp_path)
+            self._fs.upload(cache_path, tmp_path)
+            local_fs.delete(cache_path)
+        # Data is written to (or uploaded into) the ".tmp" path first and only
+        # then moved to its final name.
+        self._fs.mv(tmp_path, real_path)
+
+        return real_path, max_no
+
+    def load_checkpoint(self,
+                        path,
+                        slists,
+                        trainer_id,
+                        local_cache_path=".cache",
+                        checkpoint_no=None,
+                        ignore_empty=True):
+        """
+        Deserialize objects in slists from path.
+
+        Args:
+            path: Root directory of the checkpoints.
+            slists: Objects providing `deserialize(path)`.
+            trainer_id: If not None, appended to the local cache directory name.
+            local_cache_path: Local directory used as a staging area when the
+                filesystem needs upload/download.
+            checkpoint_no: Non-negative int selecting the checkpoint; None
+                selects the latest one.
+            ignore_empty: When False, assert that a checkpoint exists.
+
+        Returns:
+            The path the checkpoint was loaded from, or None when
+            `checkpoint_no` is None and no checkpoint exists.
+        """
+
+        if checkpoint_no is None:
+            max_no = self._get_last_checkpoint_no(path)
+
+            if not ignore_empty:
+                assert max_no >= 0, "Can't find checkpoint"
+
+            if max_no < 0:
+                return None
+
+            checkpoint_no = max_no
+        else:
+            assert isinstance(checkpoint_no, int)
+            assert checkpoint_no >= 0
+
+        local_fs = LocalFS()
+        # Bound up front so the name exists even when no download is needed.
+        cache_path = None
+        if self._fs.need_upload_download():
+            cache_path = "{}/{}.{}.load_cache".format(
+                local_cache_path, self._checkpoint_prefix, checkpoint_no)
+
+            if trainer_id is not None:
+                cache_path = "{}.{}".format(cache_path, trainer_id)
+
+            if not local_fs.is_exist(local_cache_path):
+                local_fs.mkdirs(local_cache_path)
+            if local_fs.is_exist(cache_path):
+                local_fs.delete(cache_path)
+
+        real_path = "{}/{}.{}".format(path, self._checkpoint_prefix,
+                                      checkpoint_no)
+        load_path = real_path
+        if self._fs.need_upload_download():
+            self._fs.download(real_path, cache_path)
+            load_path = cache_path
+
+        for s in slists:
+            s.deserialize(load_path)
+
+        if self._fs.need_upload_download() and cache_path:
+            local_fs.delete(cache_path)
+
+        return real_path
+
+    def get_checkpoint_no(self, root_path):
+        """
+        Returns the sorted list of checkpoint numbers found directly under
+        root_path. Entries not named `<prefix>.<int>` are ignored.
+        """
+        a = []
+        dirs = self._fs.list_dirs(root_path)
+        for d in dirs:
+            g = d.split(".")
+            if len(g) != 2:
+                continue
+
+            if g[0] != self._checkpoint_prefix:
+                continue
+
+            try:
+                n = int(g[1])
+                a.append(n)
+            except ValueError:
+                # Suffix is not an integer: not a checkpoint directory.
+                continue
+
+        a.sort()
+        return a
+
+    def _get_last_checkpoint_no(self, root_path):
+        """
+        only get the first depth
+
+        Returns the largest checkpoint number under root_path, or -1 when
+        there is none.
+        """
+        a = self.get_checkpoint_no(root_path)
+        if len(a) > 0:
+            return a[-1]
+
+        return -1
+
+    def clean_redundant_checkpoints(self, root_path, reserved=None):
+        """
+        Deletes checkpoint directories under root_path whose number is not in
+        `reserved`. When `reserved` is None or empty, only the latest
+        checkpoint is kept. Failures are printed and skipped.
+        """
+        max_no = self._get_last_checkpoint_no(root_path)
+        if max_no < 0:
+            return
+
+        # None replaces a mutable default argument; it behaves like the empty
+        # list.
+        s = set(reserved) if reserved is not None else set()
+        if len(s) == 0:
+            s.add(max_no)
+
+        dirs = self._fs.list_dirs(root_path)
+        for d in dirs:
+            g = d.split(".")
+            if len(g) != 2:
+                continue
+
+            if g[0] != self._checkpoint_prefix:
+                continue
+
+            try:
+                n = int(g[1])
+                if n not in s:
+                    path = "{}/{}.{}".format(root_path, self._checkpoint_prefix,
+                                             n)
+                    self._fs.delete(path)
+            except Exception as e:
+                # Best effort: report the problem and go on with the next entry.
+                print(e)
+                continue
--- a/python/paddle/fluid/incubate/fleet/collective/__init__.py
+++ b/python/paddle/fluid/incubate/fleet/collective/__init__.py
@@ -27,6 +27,7 @@ from paddle.fluid.incubate.fleet.base.fleet_base import DistributedOptimizer

 from paddle.fluid import compiler
 from paddle.fluid.incubate.fleet.utils.fs import LocalFS
+from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel, CheckpointSaver

 import os
 import sys
@@ -46,21 +47,6 @@ class DistFCConfig(object):
        pass


-class TrainStatus(object):
-    def __init__(self, epoch_no=-1):
-        # completed epoch
-        self._epoch_no = epoch_no
-
-    def next(self):
-        return self._epoch_no + 1
-
-    def __eq__(self, t):
-        return self._epoch_no == t._epoch_no
-
-    def __ne__(self, t):
-        return not self == t
-
-
 class Collective(Fleet):
    def __init__(self):
        super(Collective, self).__init__(Mode.COLLECTIVE)
@@ -152,90 +138,10 @@ class Collective(Fleet):

        io.save_persistables(executor, dirname, main_program, filename=filename)

-    def _save_train_status(self, path, train_status):
-        d = {}
-        d["epoch_no"] = train_status._epoch_no
-
-        file_name = "{}/fleet_train_status".format(path)
-        with open(file_name, 'w') as f:
-            json.dump(d, f)
-
-    def _load_train_status(self, path):
-        file_name = "{}/fleet_train_status".format(path)
-
-        r = TrainStatus()
-        if not os.path.isfile(file_name):
-            return r
-
-        d = {}
-        with open(file_name, 'r') as f:
-            d = json.load(f)
-
-        assert "epoch_no" in d, "Can't find epoch_no in dict from train_status file:{}".format(
-            d)
-        r._epoch_no = d["epoch_no"]
-        assert r._epoch_no >= 0, "Data in checkpoint file is not valid:{}".format(
-            d)
-
-        return r
-
-    def _get_last_checkpoint_no(self, root_path, fs):
-        """
-        only get the first depth
-        """
-        max_no = -1
-        d = {}
-        dirs = fs.list_dirs(root_path)
-        for d in dirs:
-            g = d.split(".")
-            if len(g) != 2:
-                continue
-
-            if g[0] != "__paddle_fleet_checkpoint__":
-                continue
-
-            try:
-                n = int(g[1])
-                if n > max_no:
-                    max_no = n
-            except:
-                continue
-
-        return max_no
-
-    def clean_redundant_checkpoints(self,
-                                    root_path,
-                                    fs=LocalFS(),
-                                    checkpoint_num=1):
-        max_no = self._get_last_checkpoint_no(root_path, fs)
-        if max_no < 0:
-            return
-
-        if checkpoint_num < 1:
-            checkpoint_num = 1
-
-        dirs = fs.list_dirs(root_path)
-        for d in dirs:
-            g = d.split(".")
-            if len(g) != 2:
-                continue
-
-            if g[0] != self._checkpoint_prefix:
-                continue
-
-            try:
-                n = int(g[1])
-                if n <= max_no - checkpoint_num:
-                    path = "{}/{}.{}".format(root_path, self._checkpoint_prefix,
-                                             n)
-                    fs.delete(path)
-            except Exception as e:
-                print(e)
-                continue
-
    def save_checkpoint(self,
                        executor,
                        path,
+                        trainer_id,
                        train_status,
                        main_program=None,
                        fs=LocalFS(),
@@ -248,53 +154,25 @@ class Collective(Fleet):
        if main_program == None:
            main_program = self._transpiled_program

-        if not fs.is_exist(path):
-            fs.mkdirs(path)
-        else:
-            assert fs.is_dir(path), "path:%s must be a directory".format(path)
-
-        max_no = self._get_last_checkpoint_no(path, fs=fs)
-        if max_no < 0:
-            max_no = -1
-
-        real_path = "{}/{}.{}".format(path, self._checkpoint_prefix, max_no + 1)
-        tmp_path = "{}.tmp".format(real_path)
-        saved_path = tmp_path
-
-        local_fs = LocalFS()
-
-        cache_path = None
-        if fs.need_upload_download():
-            cache_path = "{}/{}.{}.saved_cache".format(
-                local_cache_path, self._checkpoint_prefix, max_no + 1)
-            if not local_fs.is_exist(cache_path):
-                local_fs.mkdirs(cache_path)
-            else:
-                assert fs.is_dir(
-                    path), "cache path:{} must be a directory".format(
-                        cache_path)
-
-            saved_path = cache_path
-
-        self.save_persistables(
-            executor=executor,
-            dirname=saved_path,
-            main_program=main_program,
-            filename=self._param_file_name)
-        self._save_train_status(path=saved_path, train_status=train_status)
-
-        if fs.need_upload_download():
-            fs.delete(tmp_path)
-            fs.upload(cache_path, tmp_path)
-        fs.mv(tmp_path, real_path)
+        m = PaddleModel(executor, main_program)
+        t = train_status
+        c = CheckpointSaver(fs)
+        real_path, checkpoint_no = c.save_checkpoint(
+            path=path,
+            slists=[m, t],
+            trainer_id=trainer_id,
+            local_cache_path=local_cache_path)

        if not remain_all_checkpoint:
-            self.clean_redundant_checkpoints(path)
+            c.clean_redundant_checkpoints(path)
+
+        return real_path, checkpoint_no

    def load_checkpoint(self,
                        executor,
                        path,
                        trainer_id,
+                        train_status,
                        main_program=None,
                        fs=LocalFS(),
                        local_cache_path=".cache",
@@ -302,39 +180,17 @@ class Collective(Fleet):
        """
        This function load persistables and current epoch num from path.
        """
-        max_no = self._get_last_checkpoint_no(path, fs)
-
-        if not ignore_empty:
-            assert max_no >= 0, "Can't find checkpoint"
-
-        if max_no < 0:
-            return None
-
-        local_fs = LocalFS()
-        if fs.need_upload_download():
-            cache_path = "{}/{}.{}.load_cache.{}".format(
-                local_cache_path, self._checkpoint_prefix, max_no, trainer_id)
-            if not local_fs.is_exist(local_cache_path):
-                local_fs.mkdirs(local_cache_path)
-            if local_fs.is_exist(cache_path):
-                local_fs.delete(cache_path)
-
-        real_path = "{}/{}.{}".format(path, self._checkpoint_prefix, max_no)
-        load_path = real_path
-        if fs.need_upload_download():
-            fs.download(real_path, cache_path)
-            load_path = cache_path

        if main_program == None:
            main_program = self._transpiled_program

-        io.load_persistables(
-            executor=executor,
-            dirname=load_path,
-            main_program=main_program,
-            filename=self._param_file_name)
-
-        return self._load_train_status(load_path)
+        m = PaddleModel(executor, main_program)
+        c = CheckpointSaver(fs)
+        return c.load_checkpoint(
+            path, [m, train_status],
+            trainer_id=trainer_id,
+            ignore_empty=ignore_empty,
+            local_cache_path=local_cache_path)


 fleet = Collective()

--- a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
@@ -579,7 +579,7 @@ class FleetTranspiler(Fleet):
                block.append_op(
                    type='recv_save',
                    attrs={
-                        "trainer_id": self._role_maker.worker_id(),
+                        "trainer_id": self._role_maker.worker_index(),
                        "shape": var.shape,
                        "slice_shapes":
                        [",".join([str(i) for i in var.shape])],

--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
@@ -329,7 +329,7 @@ class CompileTimeStrategy(object):

                is_distributed = True if param_name in distibuted_varnames else False

-                ctx = self.build_ctx(grad, self.grad_var_mapping, True, False,
+                ctx = self.build_ctx(grad, self.grad_var_mapping, True, True,
                                     True, is_distributed)
                send_ctx[ctx.var_name()] = ctx


--- a/python/paddle/fluid/incubate/fleet/utils/fs.py
+++ b/python/paddle/fluid/incubate/fleet/utils/fs.py
@@ -45,6 +45,10 @@ class FSTimeOut(Exception):
    pass


+class FSShellCmdAborted(ExecuteError):
+    pass
+
+
 class FS(object):
    @abc.abstractmethod
    def ls_dir(self, fs_path):
@@ -87,7 +91,7 @@ class FS(object):
        raise NotImplementedError

    @abc.abstractmethod
-    def mv(self, fs_src_path, fs_dst_path):
+    def mv(self, fs_src_path, fs_dst_path, overwrite=False, test_exists=False):
        raise NotImplementedError

    @abc.abstractmethod
@@ -98,6 +102,10 @@ class FS(object):
    def list_dirs(self, fs_path):
        raise NotImplementedError

+    @abc.abstractmethod
+    def touch(self, fs_path, exist_ok=True):
+        raise NotImplementedError
+

 class LocalFS(FS):
    def ls_dir(self, fs_path):
@@ -138,13 +146,21 @@ class LocalFS(FS):
    def is_exist(self, fs_path):
        return os.path.exists(fs_path)

-    def touch(self, fs_path):
-        return Path(fs_path).touch()
+    def touch(self, fs_path, exist_ok=True):
+        if self.is_exist(fs_path):
+            if exist_ok:
+                return
+            raise FSFileExistsError
+
+        return Path(fs_path).touch(exist_ok=True)

-    def mv(self, src_path, dst_path):
+    def mv(self, src_path, dst_path, overwrite=False, test_exists=False):
        if not self.is_exist(src_path):
            raise FSFileNotExistsError

+        if overwrite and self.is_exist(dst_path):
+            self.delete(dst_path)
+
        if self.is_exist(dst_path):
            raise FSFileExistsError


--- a/python/paddle/fluid/incubate/fleet/utils/hdfs.py
+++ b/python/paddle/fluid/incubate/fleet/utils/hdfs.py
@@ -26,8 +26,8 @@ import time
 import logging
 import six
 from . import fs
-from .fs import FS, LocalFS, FSFileExistsError, FSFileNotExistsError, ExecuteError, FSTimeOut
-import paddle.fluid as fluid
+from .fs import FS, LocalFS, FSFileExistsError, FSFileNotExistsError, ExecuteError, FSTimeOut, FSShellCmdAborted
+from paddle.fluid import core
 import functools

 from pathlib import PurePosixPath, Path
@@ -36,21 +36,39 @@ import shutil
 __all__ = ["HDFSClient"]


-def _handle_errors(f):
-    def handler(*args, **kwargs):
-        start = time.time()
-        while True:
-            try:
-                return f(*args, **kwargs)
-            except ExecuteError as e:
-                o = args[0]
+def _handle_errors(max_time_out=None):
+    def decorator(f):
+        @functools.wraps(f)
+        def handler(*args, **kwargs):
+            o = args[0]
+            time_out = max_time_out
+            if time_out is None:
                time_out = float(o._time_out) / 1000.0
-                inter = float(o._sleep_inter) / 1000.0
-                if time.time() - start >= time_out:
-                    raise FSTimeOut
-                time.sleep(inter)
+            else:
+                time_out /= 1000.0
+            inter = float(o._sleep_inter) / 1000.0
+
+            start = time.time()
+            last_print_time = start
+            while True:
+                try:
+                    return f(*args, **kwargs)
+                #important: only ExecuteError need to retry
+                except ExecuteError as e:
+                    if time.time() - start >= time_out:
+                        raise FSTimeOut("args:{} timeout:{}".format(
+                            args, time.time() - start))
+
+                    time.sleep(inter)

-    return functools.wraps(f)(handler)
+                if time.time() - last_print_time > 30:
+                    print("hadoop operator timeout:args:{} timeout:{}".format(
+                        args, time.time() - start))
+                    last_print_time = time.time()
+
+        return handler
+
+    return decorator


 class HDFSClient(FS):
@@ -72,6 +90,7 @@ class HDFSClient(FS):
        if configs:
            for k, v in six.iteritems(configs):
                config_command = '-D%s=%s' % (k, v)
+                self.pre_commands.append(config_command)

        self._time_out = time_out
        self._sleep_inter = sleep_inter
@@ -80,17 +99,22 @@ class HDFSClient(FS):
            r'\s?responseErrorMsg\s?\:.*, errorCode\:\s?[0-9]+, path\:')

    def _run_cmd(self, cmd, redirect_stderr=False):
-        ret, output = fluid.core.shell_execute_cmd(cmd, 0, 0, redirect_stderr)
-        return int(ret), output.splitlines()
-
+        exe_cmd = "{} -{}".format(self._base_cmd, cmd)
+        ret, output = core.shell_execute_cmd(exe_cmd, 0, 0, redirect_stderr)
+        ret = int(ret)
+        if ret == 134:
+            raise FSShellCmdAborted(cmd)
+        return ret, output.splitlines()
+
+    @_handle_errors()
    def list_dirs(self, fs_path):
        if not self.is_exist(fs_path):
            return []

-        dirs, _ = self.ls_dir(fs_path)
+        dirs, files = self._ls_dir(fs_path)
        return dirs

-    @_handle_errors
+    @_handle_errors()
    def ls_dir(self, fs_path):
        """	
        list directory under fs_path, and only give the pure name, not include the fs_path	
@@ -98,11 +122,14 @@ class HDFSClient(FS):
        if not self.is_exist(fs_path):
            return [], []

-        cmd = "{} -ls {}".format(self._base_cmd, fs_path)
+        return self._ls_dir(fs_path)
+
+    def _ls_dir(self, fs_path):
+        cmd = "ls {}".format(fs_path)
        ret, lines = self._run_cmd(cmd)

        if ret != 0:
-            raise ExecuteError
+            raise ExecuteError(cmd)

        dirs = []
        files = []
@@ -111,9 +138,6 @@ class HDFSClient(FS):
            if len(arr) != 8:
                continue

-            if fs_path not in arr[7]:
-                continue
-
            p = PurePosixPath(arr[7])
            if arr[0][0] == 'd':
                dirs.append(p.name)
@@ -130,18 +154,20 @@ class HDFSClient(FS):

        return None

-    @_handle_errors
+    @_handle_errors()
    def is_dir(self, fs_path):
        if not self.is_exist(fs_path):
            return False

-        cmd = "{} -test -d {}".format(
-            self._base_cmd, fs_path, redirect_stderr=True)
+        return self._is_dir(fs_path)
+
+    def _is_dir(self, fs_path):
+        cmd = "test -d {}".format(fs_path, redirect_stderr=True)
        ret, lines = self._run_cmd(cmd)
        if ret:
            # other error
-            if self._test_match(lines) != None:
-                raise ExecuteError
+            if self._test_match(lines):
+                raise ExecuteError(cmd)

            return False

@@ -151,94 +177,155 @@ class HDFSClient(FS):
        if not self.is_exist(fs_path):
            return False

-        return not self.is_dir(fs_path)
+        return not self._is_dir(fs_path)

-    @_handle_errors
+    @_handle_errors()
    def is_exist(self, fs_path):
-        cmd = "{} -ls {} ".format(self._base_cmd, fs_path)
+        cmd = "ls {} ".format(fs_path)
        ret, out = self._run_cmd(cmd, redirect_stderr=True)
        if ret != 0:
            for l in out:
                if "No such file or directory" in l:
                    return False
-            raise ExecuteError
+            raise ExecuteError(cmd)

        return True

-    @_handle_errors
+    # can't retry
    def upload(self, local_path, fs_path):
        if self.is_exist(fs_path):
-            raise FSFileExistsError
+            raise FSFileExistsError("{} exists".format(fs_path))

        local = LocalFS()
        if not local.is_exist(local_path):
-            raise FSFileNotExistsError
-
-        cmd = "{} -put {} {}".format(self._base_cmd, local_path, fs_path)
-        ret, lines = self._run_cmd(cmd)
-        if ret != 0:
-            raise ExecuteError
-
-    @_handle_errors
+            raise FSFileNotExistsError("{} not exists".format(local_path))
+
+        return self._try_upload(local_path, fs_path)
+
+    @_handle_errors()
+    def _try_upload(self, local_path, fs_path):
+        cmd = "put {} {}".format(local_path, fs_path)
+        ret = 0
+        try:
+            ret, lines = self._run_cmd(cmd)
+            if ret != 0:
+                raise ExecuteError(cmd)
+        except Exception as e:
+            self.delete(fs_path)
+            raise e
+
+    # can't retry
     def download(self, fs_path, local_path):
-        if self.is_exist(local_path):
-            raise FSFileExistsError
+        if LocalFS().is_exist(local_path):  # local target: probe local disk
+            raise FSFileExistsError("{} exists".format(local_path))

         if not self.is_exist(fs_path):
-            raise FSFileNotExistsError
-
-        cmd = "{} -get {} {}".format(self._base_cmd, fs_path, local_path)
-        ret, lines = self._run_cmd(cmd)
-        if ret != 0:
-            raise ExecuteError
-
-    @_handle_errors
+            raise FSFileNotExistsError("{} not exists".format(fs_path))
+
+        return self._try_download(fs_path, local_path)
+
+    @_handle_errors()
+    def _try_download(self, fs_path, local_path):
+        cmd = "get {} {}".format(fs_path, local_path)
+        ret = 0
+        try:
+            ret, lines = self._run_cmd(cmd)
+            if ret != 0:
+                raise ExecuteError(cmd)
+        except Exception as e:
+            local_fs = LocalFS()
+            local_fs.delete(local_path)
+            raise e
+
+    @_handle_errors()
    def mkdirs(self, fs_path):
        if self.is_exist(fs_path):
            return

-        cmd = "{} -mkdir {}".format(self._base_cmd, fs_path)
-        ret, lines = self._run_cmd(cmd)
+        out_hdfs = False
+
+        cmd = "mkdir {} ".format(fs_path)
+        ret, out = self._run_cmd(cmd, redirect_stderr=True)
        if ret != 0:
-            raise ExecuteError
+            for l in out:
+                if "No such file or directory" in l:
+                    out_hdfs = True
+                    break
+            if not out_hdfs:
+                raise ExecuteError(cmd)
+
+        if out_hdfs and not self.is_exist(fs_path):
+            cmd = "mkdir -p {}".format(fs_path)
+            ret, lines = self._run_cmd(cmd)
+            if ret != 0:
+                raise ExecuteError(cmd)
+
+    def mv(self, fs_src_path, fs_dst_path, overwrite=False, test_exists=True):
+        if overwrite and self.is_exist(fs_dst_path):
+            self.delete(fs_dst_path)

-    @_handle_errors
-    def mv(self, fs_src_path, fs_dst_path, test_exists=True):
         if test_exists:
             if not self.is_exist(fs_src_path):
-                raise FSFileNotExistsError
+                raise FSFileNotExistsError("{} is not exists".format(
+                    fs_src_path))

             if self.is_exist(fs_dst_path):
-                raise FSFileExistsError
+                raise FSFileExistsError("{} exists already".format(
+                    fs_dst_path))  # name the destination that blocks the move
+
+        return self._try_mv(fs_src_path, fs_dst_path)
+
+    @_handle_errors()
+    def _try_mv(self, fs_src_path, fs_dst_path):
+        cmd = "mv {} {}".format(fs_src_path, fs_dst_path)
+        ret = 0
+        try:
+            ret, _ = self._run_cmd(cmd)
+            if ret != 0:
+                raise ExecuteError(cmd)
+        except Exception as e:
+            if not self.is_exist(fs_src_path) and \
+                    self.is_exist(fs_dst_path):
+                return
+            raise e

-        cmd = "{} -mv {} {}".format(self._base_cmd, fs_src_path, fs_dst_path)
-        ret, _ = self._run_cmd(cmd)
-        if ret != 0:
-            raise ExecuteError
-
-    @_handle_errors
    def _rmr(self, fs_path):
-        cmd = "{} -rmr {}".format(self._base_cmd, fs_path)
+        cmd = "rmr {}".format(fs_path)
        ret, _ = self._run_cmd(cmd)
        if ret != 0:
-            raise ExecuteError
+            raise ExecuteError(cmd)

-    @_handle_errors
    def _rm(self, fs_path):
-        cmd = "{} -rm {}".format(self._base_cmd, fs_path)
+        cmd = "rm {}".format(fs_path)
        ret, _ = self._run_cmd(cmd)
        if ret != 0:
-            raise ExecuteError
+            raise ExecuteError(cmd)

+    @_handle_errors()
    def delete(self, fs_path):
        if not self.is_exist(fs_path):
            return

-        is_dir = self.is_dir(fs_path)
+        is_dir = self._is_dir(fs_path)
        if is_dir:
            return self._rmr(fs_path)

        return self._rm(fs_path)

+    def touch(self, fs_path, exist_ok=True):
+        if self.is_exist(fs_path):
+            if exist_ok:
+                return
+            raise FSFileExistsError
+
+        return self._touchz(fs_path)
+
+    @_handle_errors()
+    def _touchz(self, fs_path):
+        cmd = "touchz {}".format(fs_path)
+        ret, _ = self._run_cmd(cmd)
+        if ret != 0:
+            raise ExecuteError(cmd)  # carry the failing command like sibling helpers
+
    def need_upload_download(self):
        return True
--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
@@ -25,8 +25,7 @@ from ..layer_helper import LayerHelper
 from ..data_feeder import check_variable_and_dtype

 __all__ = [
-    'deprecated', 'generate_layer_fn', 'generate_activation_fn', 'autodoc',
-    'templatedoc'
+    'generate_layer_fn', 'generate_activation_fn', 'autodoc', 'templatedoc'
 ]


@@ -82,8 +81,9 @@ def _generate_doc_string_(op_proto,
    buf.write(escape_math(op_proto.comment))
    buf.write('\nArgs:\n')
    for each_input in op_proto.inputs:
-        line_begin = '    {0}: '.format(_convert_(each_input.name))
+        line_begin = '    {0}'.format(_convert_(each_input.name))
        buf.write(line_begin)
+        buf.write(" (Tensor): ")
        buf.write(escape_math(each_input.comment))
        if each_input.duplicable:
            buf.write("  Duplicatable.")
@@ -125,6 +125,8 @@ def _generate_doc_string_(op_proto,
        for each_opt in op_proto.outputs:
            if not each_opt.intermediate:
                break
+        buf.write(_convert_(each_opt.name))
+        buf.write(' (Tensor): ')
        buf.write(escape_math(each_opt.comment))

    return buf.getvalue()
@@ -275,50 +277,11 @@ def generate_activation_fn(op_type):
    func.__doc__ = _generate_doc_string_(
        op_proto,
        additional_args_lines=[
-            "name(str, optional): The default value is None.  Normally there is no need for user to set this property.  For more information, please refer to :ref:`api_guide_Name` ."
+            "name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`."
        ])
-    func.__doc__ = func.__doc__ + """
-
-Return type
-  Variable
-
-Examples:
-    .. code-block:: python
-
-        import paddle
-        import numpy as np
-
-        paddle.enable_imperative()
-        x_data = np.array([1, 2, 3, 4]).astype(np.float32)
-        x = paddle.imperative.to_variable(x_data)
-        res = paddle.%s(x)
-        print(res.numpy())
-""" % op_type
    return func


-def deprecated(func_or_class):
-    """
-    Deprecated warning decorator. It will result a warning message.
-    Should be used before class or function, member function
-    """
-
-    @functools.wraps(func)
-    def func_wrapper(*args, **kwargs):
-        """
-        Wrap func with deprecated warning
-        """
-        warnings.simplefilter('always', DeprecationWarning)  # turn off filter
-        warnings.warn(
-            "Call to deprecated function {}.".format(func.__name__),
-            category=DeprecationWarning,
-            stacklevel=2)
-        warnings.simplefilter('default', DeprecationWarning)  # reset filter
-        return func(*args, **kwargs)
-
-    return func_wrapper
-
-
 def autodoc(comment=""):
    def __impl__(func):
        func.__doc__ = _generate_doc_string_(OpProtoHolder.instance(
@@ -384,3 +347,14 @@ def templatedoc(op_type=None):
        return func

    return __impl__
+
+
+def add_sample_code(func, sample_code):
+    """
+    Append sample code to the docstring of a dynamically generated function.
+
+    Args:
+       func: The function whose ``__doc__`` the sample code is appended to.
+       sample_code: Sample code section in rst format.
+    """
+    func.__doc__ = (func.__doc__ or "") + sample_code  # __doc__ may be None
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -6200,7 +6200,7 @@ def squeeze(input, axes, name=None):
            Out.shape = [1,3,5]

    Args:
-        input (Variable): The input Tensor. Support data type: float16, float32, float64, int8, int32, int64.
+        input (Variable): The input Tensor. Supported data type: float32, float64, bool, int8, int32, int64.
                          axes (list): One integer or List of integers, indicating the dimensions to be squeezed.
                          Axes range is :math:`[-rank(input), rank(input))`.
                          If axes is negative, :math:`axes=axes+rank(input)`.
@@ -6226,8 +6226,9 @@ def squeeze(input, axes, name=None):
    helper = LayerHelper("squeeze", **locals())
    check_variable_and_dtype(
        input, 'input',
-        ['float16', 'float32', 'float64', 'int8', 'int32', 'int64'], 'squeeze')
-    check_type(axes, 'axes', (list, tuple), 'squeeze')
+        ['float16', 'float32', 'float64', 'bool', 'int8', 'int32', 'int64'],
+        'squeeze')
+    check_type(axes, 'axis/axes', (list, tuple), 'squeeze')
    out = helper.create_variable_for_type_inference(dtype=input.dtype)
    x_shape = helper.create_variable_for_type_inference(dtype=input.dtype)
    helper.append_op(
@@ -6254,12 +6255,12 @@ def unsqueeze(input, axes, name=None):
      then Unsqueezed tensor with axes=[0, 4] has shape [1, 3, 4, 5, 1].

    Args:
-        input (Variable): The input Tensor to be unsqueezed. It is a N-D Tensor of data types float32, float64, int32.
+        input (Variable): The input Tensor to be unsqueezed. Supported data type: float32, float64, bool, int8, int32, int64.
        axes (int|list|tuple|Variable): Indicates the dimensions to be inserted. The data type is ``int32`` . If ``axes`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. If ``axes`` is an Variable, it should be an 1-D Tensor .
        name (str|None): Name for this layer.

    Returns:
-        Variable: Output unsqueezed Tensor, with data type being float32, float64, int32, int64.
+        Variable: Unsqueezed Tensor, with the same data type as input.

    Examples:
        .. code-block:: python
@@ -6269,10 +6270,15 @@ def unsqueeze(input, axes, name=None):
            y = fluid.layers.unsqueeze(input=x, axes=[1])

    """
-    if not isinstance(axes, (int, list, tuple, Variable)):
-        raise TypeError(
-            "The type of 'axes' in unsqueeze must be int, list, tuple or Variable, but "
-            "received %s." % (type(axes)))
+    if in_dygraph_mode():
+        out, _ = core.ops.unsqueeze2(input, 'axes', axes)
+        return out
+
+    check_type(axes, 'axis/axes', (int, list, tuple, Variable), 'unsqueeze')
+    check_variable_and_dtype(
+        input, 'input',
+        ['float16', 'float32', 'float64', 'bool', 'int8', 'int32', 'int64'],
+        'unsqueeze')
    helper = LayerHelper("unsqueeze2", **locals())
    inputs = {"X": input}
    attrs = {}
@@ -9966,7 +9972,7 @@ def stack(x, axis=0, name=None):
                                     must be the same. Supposing input is N dims
                                     Tensors :math:`[d_0, d_1, ..., d_{n-1}]`, the output is N+1 dims
                                     Tensor :math:`[d_0, d_1, d_{axis-1}, len(x), d_{axis}, ..., d_{n-1}]`.
-                                     Support data types: float32, float64, int32, int64.
+                                     Supported data types: float32, float64, int32, int64.
        axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is :math:`[-(R+1), R+1)`.
                              R is the first tensor of inputs. If ``axis`` < 0, :math:`axis=axis+rank(x[0])+1`.
                              The default value of axis is 0.
@@ -11963,7 +11969,7 @@ for func in [
        ],
        skip_attrs_set={
            "x_data_format", "y_data_format", "axis", "use_quantizer",
-            "Scale_x", "Scale_y", "Scale_out"
+            "mkldnn_data_type", "Scale_x", "Scale_y", "Scale_out"
        }) + """\n""" + str(func.__doc__)

 for func in []:

--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -14,7 +14,7 @@

 from __future__ import print_function
 import os
-from .layer_function_generator import generate_layer_fn, generate_activation_fn
+from .layer_function_generator import generate_layer_fn, generate_activation_fn, add_sample_code
 from .. import core
 from ..framework import convert_np_dtype_to_dtype_, Variable
 from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype
@@ -61,6 +61,363 @@ __all__ += __activations_noattr__
 for _OP in set(__activations_noattr__):
    globals()[_OP] = generate_activation_fn(_OP)

+add_sample_code(globals()["sigmoid"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        import paddle.nn.functional as F
+        paddle.enable_imperative()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.imperative.to_variable(x_data)
+        out = F.sigmoid(x)
+        print(out.numpy())
+        # [0.40131234 0.450166   0.52497919 0.57444252]
+
+""")
+
+add_sample_code(globals()["logsigmoid"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        import paddle.nn.functional as F
+        paddle.enable_imperative()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.imperative.to_variable(x_data)
+        out = F.logsigmoid(x)
+        print(out.numpy())
+        # [-0.91301525 -0.79813887 -0.64439666 -0.55435524]
+
+""")
+
+add_sample_code(globals()["exp"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.enable_imperative()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.imperative.to_variable(x_data)
+        out = paddle.exp(x)
+        print(out.numpy())
+        # [0.67032005 0.81873075 1.10517092 1.34985881]
+
+""")
+
+add_sample_code(globals()["tanh"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.enable_imperative()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.imperative.to_variable(x_data)
+        out = paddle.tanh(x)
+        print(out.numpy())
+        # [-0.37994896 -0.19737532  0.09966799  0.29131261]
+
+""")
+
+add_sample_code(globals()["atan"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.enable_imperative()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.imperative.to_variable(x_data)
+        out = paddle.atan(x)
+        print(out.numpy())
+        # [-0.38050638 -0.19739556  0.09966865  0.29145679]
+
+""")
+
+add_sample_code(globals()["tanh_shrink"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        import paddle.nn.functional as F
+        paddle.enable_imperative()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.imperative.to_variable(x_data)
+        out = F.tanh_shrink(x)
+        print(out.numpy())
+        # [-0.02005104 -0.00262468  0.00033201  0.00868739]
+
+""")
+
+add_sample_code(globals()["sqrt"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.enable_imperative()
+
+        x_data = np.array([0.1, 0.2, 0.3, 0.4])
+        x = paddle.imperative.to_variable(x_data)
+        out = paddle.sqrt(x)
+        print(out.numpy())
+        # [0.31622777 0.4472136  0.54772256 0.63245553]
+
+""")
+
+add_sample_code(globals()["rsqrt"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.enable_imperative()
+
+        x_data = np.array([0.1, 0.2, 0.3, 0.4])
+        x = paddle.imperative.to_variable(x_data)
+        out = paddle.rsqrt(x)
+        print(out.numpy())
+        # [3.16227766 2.23606798 1.82574186 1.58113883]
+
+""")
+
+add_sample_code(globals()["abs"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.enable_imperative()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.imperative.to_variable(x_data)
+        out = paddle.abs(x)
+        print(out.numpy())
+        # [0.4 0.2 0.1 0.3]
+
+""")
+
+add_sample_code(globals()["ceil"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.enable_imperative()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.imperative.to_variable(x_data)
+        out = paddle.ceil(x)
+        print(out.numpy())
+        # [-0. -0.  1.  1.]
+
+""")
+
+add_sample_code(globals()["floor"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.enable_imperative()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.imperative.to_variable(x_data)
+        out = paddle.floor(x)
+        print(out.numpy())
+        # [-1. -1.  0.  0.]
+
+""")
+
+add_sample_code(globals()["cos"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.enable_imperative()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.imperative.to_variable(x_data)
+        out = paddle.cos(x)
+        print(out.numpy())
+        # [0.92106099 0.98006658 0.99500417 0.95533649]
+
+""")
+
+add_sample_code(globals()["acos"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.enable_imperative()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.imperative.to_variable(x_data)
+        out = paddle.acos(x)
+        print(out.numpy())
+        # [1.98231317 1.77215425 1.47062891 1.26610367]
+
+""")
+
+add_sample_code(globals()["sin"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.enable_imperative()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.imperative.to_variable(x_data)
+        out = paddle.sin(x)
+        print(out.numpy())
+        # [-0.38941834 -0.19866933  0.09983342  0.29552021]
+
+""")
+
+add_sample_code(globals()["asin"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.enable_imperative()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.imperative.to_variable(x_data)
+        out = paddle.asin(x)
+        print(out.numpy())
+        # [-0.41151685 -0.20135792  0.10016742  0.30469265]
+
+""")
+
+add_sample_code(globals()["cosh"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.enable_imperative()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.imperative.to_variable(x_data)
+        out = paddle.cosh(x)
+        print(out.numpy())
+        # [1.08107237 1.02006676 1.00500417 1.04533851]
+
+""")
+
+add_sample_code(globals()["sinh"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.enable_imperative()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.imperative.to_variable(x_data)
+        out = paddle.sinh(x)
+        print(out.numpy())
+        # [-0.41075233 -0.201336    0.10016675  0.30452029]
+
+""")
+
+add_sample_code(globals()["round"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.enable_imperative()
+
+        x_data = np.array([-0.5, -0.2, 0.6, 1.5])
+        x = paddle.imperative.to_variable(x_data)
+        out = paddle.round(x)
+        print(out.numpy())
+        # [-1. -0.  1.  2.]
+
+""")
+
+add_sample_code(globals()["reciprocal"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.enable_imperative()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.imperative.to_variable(x_data)
+        out = paddle.reciprocal(x)
+        print(out.numpy())
+        # [-2.5        -5.         10.          3.33333333]
+
+""")
+
+add_sample_code(globals()["square"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        paddle.enable_imperative()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.imperative.to_variable(x_data)
+        out = paddle.square(x)
+        print(out.numpy())
+        # [0.16 0.04 0.01 0.09]
+
+""")
+
+add_sample_code(globals()["softplus"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        import paddle.nn.functional as F
+        paddle.enable_imperative()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.imperative.to_variable(x_data)
+        out = F.softplus(x)
+        print(out.numpy())
+        # [0.51301525 0.59813887 0.74439666 0.85435524]
+
+""")
+
+add_sample_code(globals()["softsign"], r"""
+Examples:
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+        import paddle.nn.functional as F
+        paddle.enable_imperative()
+
+        x_data = np.array([-0.4, -0.2, 0.1, 0.3])
+        x = paddle.imperative.to_variable(x_data)
+        out = F.softsign(x)
+        print(out.numpy())
+        # [-0.28571429 -0.16666667  0.09090909  0.23076923]
+
+""")
+
 __all__ += ['softshrink']

 _softshrink_ = generate_layer_fn('softshrink')

--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -685,8 +685,9 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None):
    """

    attrs = {'force_cpu': force_cpu}
+    dtype = convert_dtype(dtype)
    if not isinstance(value, Variable):
-        if convert_dtype(dtype) in ['int64', 'int32']:
+        if dtype in ['int64', 'int32']:
            attrs['str_value'] = str(int(value))
        else:
            attrs['str_value'] = str(float(value))
@@ -697,7 +698,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None):
            out = _varbase_creator(dtype=dtype)

        if isinstance(value, Variable):
-            if convert_dtype(dtype) in ['int64', 'int32']:
+            if dtype in ['int64', 'int32']:
                attrs['str_value'] = str(int(value.numpy()))
            else:
                attrs['str_value'] = str(float(value.numpy()))
@@ -712,6 +713,8 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None):
    helper = LayerHelper("fill_constant", **locals())
    inputs = {}
    if isinstance(value, Variable):
+        if convert_dtype(value.dtype) != dtype:
+            value = cast(value, dtype)
        inputs['ValueTensor'] = value

    check_dtype(dtype, 'dtype',

--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -86,6 +86,10 @@ if(WIN32)
    LIST(REMOVE_ITEM TEST_OPS test_ref_by_trainer_id_op)
 endif()

+
+LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint)
+LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint2)
+LIST(REMOVE_ITEM TEST_OPS test_checkpoint_saver)
 if(APPLE OR WIN32)
    LIST(REMOVE_ITEM TEST_OPS test_hdfs)
    LIST(REMOVE_ITEM TEST_OPS test_fs_interface)
@@ -190,10 +194,11 @@ function(bash_test_modules TARGET_NAME)
    endif()

    set(options SERIAL)
-    set(oneValueArgs "")
-    set(multiValueArgs MODULES DEPS ENVS LABELS)
+    set(oneValueArgs TIMEOUT START_BASH)
+    set(multiValueArgs DEPS ENVS LABELS)
    cmake_parse_arguments(bash_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

+
    set(timeout 350)
    if(${bash_test_modules_TIMEOUT})
        set(timeout ${bash_test_modules_TIMEOUT})
@@ -204,13 +209,13 @@ function(bash_test_modules TARGET_NAME)
            COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python 
            TEST_TARGET_NAME=${TARGET_NAME} TEST_TIMEOUT=${timeout} ${bash_test_modules_ENVS}
            WITH_COVERAGE=ON COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
-            bash ${CMAKE_CURRENT_BINARY_DIR}/${bash_test_modules_MODULES}
+            bash ${CMAKE_CURRENT_BINARY_DIR}/${bash_test_modules_START_BASH}
            WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
    else()
        add_test(NAME ${TARGET_NAME}
            COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python 
            TEST_TARGET_NAME=${TARGET_NAME} TEST_TIMEOUT=${timeout} ${bash_test_modules_ENVS}
-            bash ${CMAKE_CURRENT_BINARY_DIR}/${bash_test_modules_MODULES}
+            bash ${CMAKE_CURRENT_BINARY_DIR}/${bash_test_modules_START_BASH}
            WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
    endif()

@@ -397,15 +402,16 @@ if(WITH_DISTRIBUTE)
    if(NOT APPLE)
        if(WITH_GPU)
            # NOTE. test_launch only work in gpu collective mode
-            bash_test_modules(test_launch MODULES test_launch.sh  ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+            bash_test_modules(test_launch START_BASH test_launch.sh  ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
            py_test_modules(test_fleet_checkpoint MODULES test_fleet_checkpoint)
        endif()
-        bash_test_modules(test_launch_ps MODULES test_launch_ps.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
-        bash_test_modules(test_fleet_launch MODULES test_fleet_launch.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+
+        bash_test_modules(test_launch_ps START_BASH test_launch_ps.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+        bash_test_modules(test_fleet_launch START_BASH test_fleet_launch.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})

        set(dist_ut_port 20001)
        foreach(TEST_OP ${DIST_TEST_OPS})
-            bash_test_modules(${TEST_OP} MODULES dist_test.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}")
+            bash_test_modules(${TEST_OP} START_BASH dist_test.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}")
            MATH(EXPR dist_ut_port "${dist_ut_port}+50")
        endforeach(TEST_OP)
    endif(NOT APPLE)
@@ -441,6 +447,12 @@ if(NOT WIN32)
    set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450)
 endif()

+if(NOT APPLE AND NOT WIN32)
+    bash_test_modules(test_auto_checkpoint START_BASH dist_test.sh TIMEOUT 600)
+    bash_test_modules(test_auto_checkpoint2 START_BASH dist_test.sh TIMEOUT 600)
+    bash_test_modules(test_checkpoint_saver START_BASH dist_test.sh TIMEOUT 600)
+endif()
+
 add_subdirectory(sequence)
 add_subdirectory(dygraph_to_static)


--- a/python/paddle/fluid/tests/unittests/auto_checkpoint_utils.py
+++ b/python/paddle/fluid/tests/unittests/auto_checkpoint_utils.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
+import os
+import sys
+
+from paddle.fluid.incubate.fleet.utils.fs import LocalFS
+from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
+import paddle.fluid.incubate.checkpoint.auto_checkpoint as acp
+from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel
+from paddle.fluid.framework import program_guard
+from paddle.fluid import unique_name
+
+import numpy as np
+from paddle.io import Dataset, BatchSampler, DataLoader
+
+# Number of batches produced by one pass of the sample-list generator.
+BATCH_NUM = 20
+# Number of (image, label) samples placed in each batch.
+BATCH_SIZE = 16
+
+#IMAGE_SIZE = 128
+# Output width of the fc layer in the test network (number of classes).
+CLASS_NUM = 10
+
+USE_GPU = False  # whether to run the model on GPU
+# Device list matching USE_GPU; the helpers below only use places[0].
+places = fluid.cuda_places() if USE_GPU else fluid.cpu_places()
+
+# Module-level logger; stays None until get_logger() is called.
+logger = None
+
+
+def get_logger():
+    """Create the auto-checkpoint logger, cache it in ``logger``, return it.
+
+    The logger comes from ``acp._get_logger(20)``.
+    NOTE(review): 20 presumably is the ``logging.INFO`` level -- confirm
+    against ``acp._get_logger``.
+    """
+    global logger
+    acp_logger = acp._get_logger(20)
+    logger = acp_logger
+    return acp_logger
+
+
+def get_random_images_and_labels(image_shape, label_shape):
+    """Create one random ``(image, label)`` sample.
+
+    Args:
+        image_shape: shape of the float32 image array; values are drawn
+            uniformly from [0, 1) by ``np.random.random``.
+        label_shape: shape of the int64 label array.
+
+    Returns:
+        tuple: ``(image, label)`` numpy arrays with dtypes float32 and int64.
+    """
+    image = np.random.random(size=image_shape).astype('float32')
+    # Draw real class ids in [0, CLASS_NUM). Casting np.random.random() output
+    # (always < 1.0) to int64 would truncate every label to 0.
+    label = np.random.randint(
+        0, CLASS_NUM, size=label_shape).astype('int64')
+    return image, label
+
+
+def sample_list_generator_creator():
+    """Return a generator function that yields random sample lists.
+
+    Each call of the returned function yields ``BATCH_NUM`` lists, every one
+    holding ``BATCH_SIZE`` ``[image, label]`` pairs built by
+    ``get_random_images_and_labels([16, 16], [1])``.
+    """
+
+    def __reader__():
+        for _batch in range(BATCH_NUM):
+            # One batch: BATCH_SIZE freshly generated [image, label] samples.
+            yield [
+                list(get_random_images_and_labels([16, 16], [1]))
+                for _sample in range(BATCH_SIZE)
+            ]
+
+    return __reader__
+
+
+class AutoCheckpointBase(unittest.TestCase):
+    """Shared fixture helpers for the auto-checkpoint unit tests."""
+
+    def _init_env(self,
+                  exe,
+                  main_prog,
+                  startup_prog,
+                  minimize=True,
+                  iterable=True):
+        """Build a small classifier and its data loader inside ``main_prog``.
+
+        Args:
+            exe: executor used to run ``startup_prog`` when ``minimize`` is set.
+            main_prog: program that receives the network.
+            startup_prog: program that receives the initialisation ops.
+            minimize: when True, call ``sgd.minimize(loss)``, compile
+                ``main_prog`` with data parallelism and run ``startup_prog``;
+                when False, none of these happen and ``compiled`` is None.
+            iterable: forwarded to ``DataLoader.from_generator``.
+
+        Returns:
+            tuple: ``(compiled, loader, sgd, loss, image, label)``.
+        """
+        with program_guard(main_prog, startup_prog):
+            # Network: image -> fc(CLASS_NUM) -> softmax cross entropy -> mean.
+            image = fluid.data(
+                name='image', shape=[-1, 16, 16], dtype='float32')
+            label = fluid.data(name='label', shape=[-1, 1], dtype='int64')
+
+            logits = fluid.layers.fc(image, size=CLASS_NUM)
+            loss = fluid.layers.reduce_mean(
+                fluid.layers.softmax_with_cross_entropy(logits, label))
+            sgd = fluid.optimizer.SGD(learning_rate=1e-3)
+
+            compiled = None
+            if minimize:
+                sgd.minimize(loss)
+                compiled = fluid.CompiledProgram(
+                    main_prog).with_data_parallel(loss_name=loss.name)
+
+            loader = fluid.io.DataLoader.from_generator(
+                feed_list=[image, label],
+                capacity=64,
+                use_double_buffer=True,
+                iterable=iterable)
+            loader.set_sample_list_generator(sample_list_generator_creator(),
+                                             places[0])
+
+        if minimize:
+            exe.run(startup_prog)
+
+        return compiled, loader, sgd, loss, image, label
+
+    def _generate(self):
+        """Return ``(exe, main_prog, startup_prog)`` with fresh programs."""
+        main_prog, startup_prog = fluid.Program(), fluid.Program()
+        exe = fluid.Executor(places[0])
+
+        return exe, main_prog, startup_prog
+
+    def _reset_generator(self):
+        """Reset the unique-name generators and the acp module globals."""
+        unique_name.generator = unique_name.UniqueNameGenerator()
+        acp.generator = unique_name.UniqueNameGenerator()
+        acp.g_acp_type = None
+        acp.g_checker = acp.AutoCheckpointChecker()
+        acp.g_program_attr = {}
+
+    def _clear_envs(self):
+        """Remove PADDLE_RUNNING_ENV from the environment if it is set."""
+        if "PADDLE_RUNNING_ENV" in os.environ:
+            del os.environ["PADDLE_RUNNING_ENV"]
+
+    def _readd_envs(self):
+        """Set PADDLE_RUNNING_ENV back to the auto-checkpoint value."""
+        os.environ.update({"PADDLE_RUNNING_ENV": "PADDLE_EDL_AUTO_CHECKPOINT"})
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py
@@ -51,13 +51,9 @@ def func_error_in_compile_time_2(x):
 @declarative
 def func_error_in_runtime(x, iter_num=3):
    x = fluid.dygraph.to_variable(x)
-    a = []
-    iter_num = fluid.layers.fill_constant(
-        shape=[1], value=iter_num, dtype="int32")
-    for i in range(iter_num):
-        a.append(b)
-    a = fluid.layers.concat(a, axis=0)
-    return a
+    two = fluid.layers.fill_constant(shape=[1], value=2, dtype="int32")
+    x = fluid.layers.reshape(x, shape=[1, two])
+    return x


 class TestErrorInCompileTime(unittest.TestCase):
@@ -118,7 +114,6 @@ class TestErrorInCompileTime2(TestErrorInCompileTime):
             ]


-# TODO(liym27): Consider the case that op_callstack when error raised from c++ code
 class TestErrorInRuntime(TestErrorInCompileTime):
    def set_func(self):
        self.func = func_error_in_runtime
@@ -126,10 +121,26 @@ class TestErrorInRuntime(TestErrorInCompileTime):
    def set_exception_type(self):
        self.exception_type = EnforceNotMet

-    def test(self):
-        with fluid.dygraph.guard():
-            with self.assertRaises(self.exception_type) as cm:
-                self.func(self.input)
+    def set_message(self):
+        self.expected_message = \
+            [
+                'File "{}", line 55, in func_error_in_runtime'.format(self.filepath),
+                'x = fluid.layers.reshape(x, shape=[1, two])'
+            ]
+
+    def _test_create_message(self, error_data):
+        self.filepath = inspect.getfile(unwrap(self.func))
+        self.set_message()
+
+        with self.assertRaises(ValueError):
+            error_data.create_message()
+
+        error_data.in_runtime = False
+        error_message = error_data.create_message()
+
+        self.assertIn('In user code:', error_message)
+        for m in self.expected_message:
+            self.assertIn(m, error_message)


 if __name__ == '__main__':

--- a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py
@@ -30,16 +30,8 @@ class TestMKLDNNReluDim2(TestRelu):

        self.attrs = {"use_mkldnn": True}

-    def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output(check_dygraph=False)
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.007, check_dygraph=False)
+    def init_dtype(self):
+        self.dtype = np.float32


 class TestMKLDNNLeakyReluDim2(TestLeakyRelu):
@@ -48,16 +40,8 @@ class TestMKLDNNLeakyReluDim2(TestLeakyRelu):

        self.attrs = {"use_mkldnn": True}

-    def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output(check_dygraph=False)
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.007, check_dygraph=False)
+    def init_dtype(self):
+        self.dtype = np.float32


 class TestMKLDNNGeluDim2(TestActivation):
@@ -92,16 +76,8 @@ class TestMKLDNNTanhDim2(TestTanh):

        self.attrs = {"use_mkldnn": True}

-    def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output(check_dygraph=False)
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.007, check_dygraph=False)
+    def init_dtype(self):
+        self.dtype = np.float32


 class TestMKLDNNSqrtDim2(TestSqrt):
@@ -110,16 +86,8 @@ class TestMKLDNNSqrtDim2(TestSqrt):

        self.attrs = {"use_mkldnn": True}

-    def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output(check_dygraph=False)
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.007, check_dygraph=False)
+    def init_dtype(self):
+        self.dtype = np.float32


 class TestMKLDNNAbsDim2(TestAbs):
@@ -127,16 +95,8 @@ class TestMKLDNNAbsDim2(TestAbs):
        super(TestMKLDNNAbsDim2, self).setUp()
        self.attrs = {"use_mkldnn": True}

-    def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output(check_dygraph=False)
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.007, check_dygraph=False)
+    def init_dtype(self):
+        self.dtype = np.float32


 class TestMKLDNNSwishDim2(TestSwish):
@@ -151,15 +111,8 @@ class TestMKLDNNSwishDim2(TestSwish):
        self.outputs = {'Out': out}
        self.attrs = {"use_mkldnn": True, "beta": beta}

-    def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output()
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad(['X'], 'Out')
+    def init_dtype(self):
+        self.dtype = np.float32


 class TestMKLDNNSigmoidDim2(TestSigmoid):
@@ -181,16 +134,8 @@ class TestMKLDNNReluDim4(TestRelu):
        self.outputs = {'Out': out}
        self.attrs = {"use_mkldnn": True}

-    def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output(check_dygraph=False)
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.007, check_dygraph=False)
+    def init_dtype(self):
+        self.dtype = np.float32


 class TestMKLDNNLeakyReluDim4(TestLeakyRelu):
@@ -206,16 +151,8 @@ class TestMKLDNNLeakyReluDim4(TestLeakyRelu):
        self.outputs = {'Out': out}
        self.attrs = {"use_mkldnn": True}

-    def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output(check_dygraph=False)
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.007, check_dygraph=False)
+    def init_dtype(self):
+        self.dtype = np.float32


 class TestMKLDNNGeluDim4(TestActivation):
@@ -254,17 +191,6 @@ class TestMKLDNNTanhDim4(TestTanh):
        self.outputs = {'Out': np.tanh(self.inputs['X'])}
        self.attrs = {"use_mkldnn": True}

-    def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output(check_dygraph=False)
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.007, check_dygraph=False)
-

 class TestMKLDNNSqrtDim4(TestSqrt):
    def setUp(self):
@@ -276,17 +202,6 @@ class TestMKLDNNSqrtDim4(TestSqrt):
        self.outputs = {'Out': np.sqrt(self.inputs['X'])}
        self.attrs = {"use_mkldnn": True}

-    def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output(check_dygraph=False)
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.007, check_dygraph=False)
-

 class TestMKLDNNAbsDim4(TestAbs):
    def setUp(self):
@@ -299,23 +214,15 @@ class TestMKLDNNAbsDim4(TestAbs):
        self.outputs = {'Out': np.abs(self.inputs['X'])}
        self.attrs = {"use_mkldnn": True}

-    def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output(check_dygraph=False)
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.007, check_dygraph=False)
+    def init_dtype(self):
+        self.dtype = np.float32


 class TestMKLDNNSwishDim4(TestSwish):
    def setUp(self):
        super(TestMKLDNNSwishDim4, self).setUp()

-        x = np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
+        x = np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype(self.dtype)
        beta = 2.3
        out = x * expit(beta * x)

@@ -323,15 +230,8 @@ class TestMKLDNNSwishDim4(TestSwish):
        self.outputs = {'Out': out}
        self.attrs = {"use_mkldnn": True, "beta": beta}

-    def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output()
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad(['X'], 'Out')
+    def init_dtype(self):
+        self.dtype = np.float32


 class TestMKLDNNSigmoidDim4(TestSigmoid):

--- a/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
+import os
+import sys
+
+from paddle.fluid.incubate.fleet.utils.fs import LocalFS
+from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
+import paddle.fluid.incubate.checkpoint.auto_checkpoint as acp
+from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel
+from paddle.fluid.framework import program_guard
+from paddle.fluid import unique_name
+
+import numpy as np
+from paddle.io import Dataset, BatchSampler, DataLoader
+
+from paddle.fluid.tests.unittests.auto_checkpoint_utils import AutoCheckpointBase, get_logger
+
+logger = get_logger()
+
+
+class AutoCheckPointACLBase(AutoCheckpointBase):
+    """Reusable train/save/load scenarios for the auto-checkpoint tests.
+
+    The class defines no ``test_*`` methods itself; subclasses call the
+    ``_run_*`` / ``_not_use_train`` helpers from their own tests.
+    """
+
+    def setUp(self):
+        """Snapshot ``os.environ`` and export the auto-checkpoint settings."""
+        get_logger()
+        logger.info("enter tests")
+
+        # Kept so tearDown can restore the environment exactly.
+        self._old_environ = dict(os.environ)
+        proc_env = {
+            "PADDLE_RUNNING_ENV": "PADDLE_EDL_AUTO_CHECKPOINT",
+            "PADDLE_TRAINER_ID": "0",
+            "PADDLE_RUNNING_PLATFORM": "PADDLE_CLOUD",
+            "PADDLE_JOB_ID": "test_job_auto",
+            "PADDLE_EDL_HDFS_HOME": "/usr/local/hadoop-2.7.7",
+            "PADDLE_EDL_HDFS_NAME": "",
+            "PADDLE_EDL_HDFS_UGI": "",
+            "PADDLE_EDL_HDFS_CHECKPOINT_PATH": "auto_checkpoint",
+            "PADDLE_EDL_ONLY_FOR_CE_TEST": "1",
+            "PADDLE_EDL_FS_CACHE": ".auto_checkpoint_test",
+            "PADDLE_EDL_SAVE_CHECKPOINT_INTER": "0"
+        }
+        os.environ.update(proc_env)
+
+    def tearDown(self):
+        """Restore the environment captured in ``setUp``."""
+        os.environ.clear()
+        os.environ.update(self._old_environ)
+
+    def _run_normal(self):
+        """Train without ``acp.train_epoch_range`` and check acp stays idle.
+
+        Runs three passes over the loader, asserting before, during and after
+        that ``acp.g_acp_type`` and ``acp._get_train_epoch_range()`` are None,
+        then serializes and deserializes the program through ``PaddleModel``.
+        """
+        exe, main_prog, startup_prog = self._generate()
+
+        save_dir = "./run_save_model"
+        fs = LocalFS()
+
+        fs.delete(save_dir)
+        logger.info("begin _run_normal")
+
+        compiled, data_loader, optimizer, loss, image, label = self._init_env(
+            exe, main_prog, startup_prog)
+        for _ in range(3):
+            self.assertEqual(acp._get_train_epoch_range(), None)
+            self.assertEqual(acp.g_acp_type, None)
+            for data in data_loader():
+                self.assertEqual(acp.g_acp_type, None)
+                self.assertEqual(acp._get_train_epoch_range(), None)
+                exe.run(compiled, feed=data, fetch_list=[loss])
+
+        self.assertEqual(acp.g_acp_type, None)
+        self.assertEqual(acp._get_train_epoch_range(), None)
+
+        m1 = PaddleModel(exe, compiled)
+        m1.serialize(save_dir)
+
+        m2 = PaddleModel(exe, compiled)
+        m2.deserialize(save_dir)
+
+        logger.info("end _run_normal")
+        fs.delete(save_dir)
+
+    def _not_use_train(self):
+        """Check ``acp.train_epoch_range(3, 0)`` yields epochs 0, 1, 2.
+
+        NOTE(review): callers visible in this file clear PADDLE_RUNNING_ENV
+        before calling this helper, so it presumably covers the case where
+        auto checkpoint is disabled -- confirm against ``acp``.
+        """
+        logger.info("begin _not_use_train")
+        exe, main_prog, startup_prog = self._generate()
+
+        compiled, data_loader, optimizer, loss, image, label = \
+            self._init_env(exe, main_prog, startup_prog)
+
+        epochs = []
+        for i in acp.train_epoch_range(3, 0):
+            epochs.append(i)
+            for data in data_loader():
+                exe.run(compiled, feed=data, fetch_list=[loss])
+
+        self.assertEqual(epochs, [0, 1, 2])
+        logger.info("end _not_use_train")
+
+    def _run_save_0(self, break_epoch_no=None):
+        """Train inside ``acp.train_epoch_range(3, 0)``, optionally stop early.
+
+        Args:
+            break_epoch_no: epoch index after which the loop is left with
+                ``break``; None lets all epochs run.
+
+        After every epoch the active epoch range must track exactly one
+        executor status; after the loop no epoch range may remain, and the
+        last epoch seen must be 2 (or ``break_epoch_no`` when given).
+        """
+        logger.info("begin _run_save_0")
+        fs = LocalFS()
+        save_dir = "./run_save_0"
+        fs.delete(save_dir)
+
+        exe, main_prog, startup_prog = self._generate()
+
+        compiled, data_loader, optimizer, loss, image, label = \
+            self._init_env(exe, main_prog, startup_prog)
+
+        # Stays 0 if the range yields nothing, so the final check then fails.
+        i = 0
+        for i in acp.train_epoch_range(3, 0):
+            o = acp._get_train_epoch_range()
+
+            for data in data_loader():
+                exe.run(compiled, feed=data, fetch_list=[loss])
+
+            self.assertEqual(len(o._exe_status), 1)
+
+            if break_epoch_no is not None and i == break_epoch_no:
+                break
+
+        # A unittest assertion is used instead of a bare `assert`, which is
+        # stripped when Python runs with -O.
+        self.assertIsNone(acp._get_train_epoch_range(),
+                          "now train epoch must not exits now")
+        if break_epoch_no is None:
+            self.assertEqual(i, 2)
+        else:
+            self.assertEqual(i, break_epoch_no)
+
+        fs.delete(save_dir)
+        logger.info("end _run_save_0")
+
+    def _run_load_0(self, break_epoch_no=None):
+        """Re-enter ``acp.train_epoch_range(3, 0)`` and check the epochs seen.
+
+        Args:
+            break_epoch_no: selects the expected epoch list: None -> [2],
+                0 -> [0, 1, 2], 1 -> [1, 2], 2 -> [2]. Any other value skips
+                the epoch-list check.
+
+        NOTE(review): the expectations presumably describe resuming from the
+        state left by an earlier ``_run_save_0`` call -- confirm with callers.
+        """
+        logger.info("begin _run_load_0")
+        exe, main_prog, startup_prog = self._generate()
+
+        fs = LocalFS()
+        save_dir = "./run_load_0"
+        fs.delete(save_dir)
+
+        compiled, data_loader, optimizer, loss, image, label = self._init_env(
+            exe, main_prog, startup_prog)
+
+        # Stays 0 if the range yields nothing, so assertEqual(i, 2) then fails.
+        i = 0
+
+        epochs = []
+        for i in acp.train_epoch_range(3, 0):
+            epochs.append(i)
+
+            for data in data_loader():
+                exe.run(compiled, feed=data, fetch_list=[loss])
+
+        o = acp._get_train_epoch_range()
+        self.assertTrue(o == None, "now train epoch must not exits now")
+        self.assertEqual(i, 2)
+
+        if break_epoch_no is not None:
+            if break_epoch_no == 0:
+                self.assertEqual(epochs, [0, 1, 2])
+            elif break_epoch_no == 1:
+                self.assertEqual(epochs, [1, 2])
+            elif break_epoch_no == 2:
+                self.assertEqual(epochs, [2])
+        else:
+            self.assertEqual(epochs, [2])
+
+        fs.delete(save_dir)
+        # Fixed: this closing message previously said "begin _run_load_0".
+        logger.info("end _run_load_0")
+
+
+class AutoCheckpointTest(AutoCheckPointACLBase):
+    # Concrete auto-checkpoint test cases. setUp exports its own job id,
+    # checkpoint path and fs cache ("..._1"); tearDown is inherited.
+    def setUp(self):
+        get_logger()
+        logger.info("enter tests")
+
+        # Snapshot of the environment, restored by the inherited tearDown.
+        self._old_environ = dict(os.environ)
+        proc_env = {
+            "PADDLE_RUNNING_ENV": "PADDLE_EDL_AUTO_CHECKPOINT",
+            "PADDLE_TRAINER_ID": "0",
+            "PADDLE_RUNNING_PLATFORM": "PADDLE_CLOUD",
+            "PADDLE_JOB_ID": "test_job_auto_1",
+            "PADDLE_EDL_HDFS_HOME": "/usr/local/hadoop-2.7.7",
+            "PADDLE_EDL_HDFS_NAME": "",
+            "PADDLE_EDL_HDFS_UGI": "",
+            "PADDLE_EDL_HDFS_CHECKPOINT_PATH": "auto_checkpoint_1",
+            "PADDLE_EDL_ONLY_FOR_CE_TEST": "1",
+            "PADDLE_EDL_FS_CACHE": ".auto_checkpoint_test_1",
+            "PADDLE_EDL_SAVE_CHECKPOINT_INTER": "0"
+        }
+        os.environ.update(proc_env)
+
+    def test_normal(self):
+        # Plain training with PADDLE_RUNNING_ENV removed; the variable is set
+        # again afterwards.
+        logger.info("begin test_normal")
+        checker = acp._get_checker()
+
+        fs = HDFSClient(checker.hdfs_home, None)
+
+        # Delete the checkpoint path before the run.
+        fs.delete(checker.hdfs_checkpoint_path)
+        self._clear_envs()
+        self._reset_generator()
+        self._run_normal()
+        self._readd_envs()
+        logger.info("end test_normal")
+
+    def test_basic(self):
+        # Checks the checker reflects the environment from setUp, then runs
+        # the save scenario followed by the load scenario.
+        logger.info("begin test_basic")
+        checker = acp._get_checker()
+        self.assertEqual(checker.run_env, "PADDLE_EDL_AUTO_CHECKPOINT")
+        self.assertEqual(checker.platform, "PADDLE_CLOUD")
+        self.assertEqual(checker.save_checkpoint_inter, 0)
+        print(checker)
+
+        fs = HDFSClient(checker.hdfs_home, None)
+
+        fs.delete(checker.hdfs_checkpoint_path)
+        self._reset_generator()
+        self._run_save_0()
+
+        # Generators are reset again before the load scenario.
+        self._reset_generator()
+        self._run_load_0()
+
+        logger.info("end test_basic")
+
+    def test_not_use(self):
+        # train_epoch_range with PADDLE_RUNNING_ENV removed must still yield
+        # epochs 0, 1, 2 (asserted inside _not_use_train).
+        logger.info("begin test_not_use")
+
+        self._clear_envs()
+        self._reset_generator()
+        self._not_use_train()
+        self._readd_envs()
+
+        logger.info("end test_not_use")
+
+    def test_multiple(self):
+        # Two independently built programs run by one executor inside a single
+        # train_epoch_range; the epoch range must track two executor statuses.
+        checker = acp._get_checker()
+        fs = HDFSClient(checker.hdfs_home, None)
+        fs.delete(checker.hdfs_checkpoint_path)
+        self._reset_generator()
+
+        logger.info("begin test_multiple")
+        fs = LocalFS()
+        save_dir = "./run_save_0"
+        fs.delete(save_dir)
+
+        exe, main_prog1, startup_prog1 = self._generate()
+        # Only the programs of the second _generate() call are used; its
+        # executor is discarded so both programs share `exe`.
+        _, main_prog2, startup_prog2 = self._generate()
+
+        compiled1, data_loader1, optimizer1, loss1, image1, label1 = \
+            self._init_env(exe, main_prog1, startup_prog1)
+
+        compiled2, data_loader2, optimizer2, loss2, image2, label2 = \
+            self._init_env(exe, main_prog2, startup_prog2)
+
+        o = None
+        epochs = []
+        for i in acp.train_epoch_range(3, 0):
+            for data in data_loader1():
+                fetch = exe.run(compiled1, feed=data, fetch_list=[loss1])
+
+            for data in data_loader2():
+                fetch = exe.run(compiled2, feed=data, fetch_list=[loss2])
+
+            o = acp._get_train_epoch_range()
+            self.assertEqual(len(o._exe_status), 2)
+            print(o._exe_status)
+            epochs.append(i)
+
+        # After the loop no epoch range may remain and all epochs were seen.
+        o = acp._get_train_epoch_range()
+        self.assertTrue(o == None, "now train epoch must not exits now")
+        self.assertEqual(i, 2)
+        self.assertEqual(epochs, [0, 1, 2])
+
+        fs.delete(save_dir)
+        logger.info("end test_multiple")
+
+    def test_distributed_basic(self):
+        # Same flow as the save scenario, but the optimizer is wrapped by the
+        # collective fleet optimizer and fleet.main_program is executed.
+        checker = acp._get_checker()
+        fs = HDFSClient(checker.hdfs_home, None)
+        fs.delete(checker.hdfs_checkpoint_path)
+        self._reset_generator()
+
+        logger.info("begin test_distributed_basic")
+        fs = LocalFS()
+        save_dir = "./run_save_0"
+        fs.delete(save_dir)
+
+        #basic
+        exe, main_prog, startup_prog = self._generate()
+
+        # minimize=False: _init_env neither minimizes nor runs startup_prog;
+        # both are done below after the optimizer has been wrapped by fleet.
+        compiled, data_loader, optimizer, loss, image, label = \
+            self._init_env(exe, main_prog, startup_prog, minimize=False)
+
+        #fleet
+        os.environ["TRAINING_ROLE"] = "TRAINER"
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"
+
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+
+        with fluid.program_guard(main_prog, startup_prog):
+            dist_optimizer = fleet.distributed_optimizer(optimizer)
+            dist_optimizer.minimize(loss)
+
+        exe.run(startup_prog)
+
+        o = None
+        i = 0
+        name = None
+        for i in acp.train_epoch_range(3, 0):
+            o = acp._get_train_epoch_range()
+            name = o.name
+            # NOTE(review): the message says "_run_save_0" although this is
+            # test_distributed_basic; looks copied from that helper.
+            logger.info("_run_save_0 name:{} epoch_no:{}".format(o.name, i))
+
+            for data in data_loader():
+                fetch = exe.run(fleet.main_program,
+                                feed=data,
+                                fetch_list=[loss])
+
+            self.assertEqual(len(o._exe_status), 1)
+
+        o = acp._get_train_epoch_range()
+        assert o == None, "now train epoch must not exits now"
+        self.assertEqual(i, 2)
+
+        fs.delete(save_dir)
+
+        logger.info("end test_distributed_basic")
+
+    def test_checker(self):
+        # Appears intended to check AutoCheckpointChecker construction when
+        # PADDLE_JOB_ID is missing from the environment.
+        # NOTE(review): `AutoCheckpointChecker` is not among the imports at
+        # the top of this file (only the `acp` module is), so the call below
+        # raises NameError, which the broad `except Exception` swallows. The
+        # AssertionError from `self.assertFalse(True)` would be swallowed the
+        # same way, so as written this test cannot fail. Confirm how
+        # `acp.AutoCheckpointChecker()` behaves without PADDLE_JOB_ID before
+        # tightening it.
+        os.environ.pop("PADDLE_JOB_ID", None)
+        try:
+            checker = AutoCheckpointChecker()
+            self.assertFalse(True)
+        except Exception as e:
+            pass
+        os.environ["PADDLE_JOB_ID"] = "test_job_auto_1"
+
+
+# Run the test cases in this module when the file is executed as a script.
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
+import os
+import sys
+
+from paddle.fluid.incubate.fleet.utils.fs import LocalFS
+from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
+import paddle.fluid.incubate.checkpoint.auto_checkpoint as acp
+from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel
+from paddle.fluid.framework import program_guard
+from paddle.fluid import unique_name
+
+import numpy as np
+from paddle.io import Dataset, BatchSampler, DataLoader
+
+from paddle.fluid.tests.unittests.auto_checkpoint_utils import AutoCheckpointBase, get_logger
+from paddle.fluid.tests.unittests.test_auto_checkpoint import AutoCheckPointACLBase
+
+logger = get_logger()
+
+
+class AutoCheckpointTest2(AutoCheckPointACLBase):
+    """Runs ``_run_save_0`` then ``_run_load_0`` for several break epochs.
+
+    ``_reset_generator``, ``_run_save_0`` and ``_run_load_0`` are not
+    defined in this class; presumably they are inherited from
+    ``AutoCheckPointACLBase`` -- confirm against that class.
+    """
+
+    def setUp(self):
+        """Snapshot ``os.environ`` and export the auto-checkpoint settings."""
+        get_logger()
+        logger.info("enter tests")
+
+        # Copy of the environment taken before this test modifies it.
+        # NOTE(review): nothing in this class restores it; presumably the
+        # base class does -- confirm.
+        self._old_environ = dict(os.environ)
+        proc_env = {
+            "PADDLE_RUNNING_ENV": "PADDLE_EDL_AUTO_CHECKPOINT",
+            "PADDLE_TRAINER_ID": "0",
+            "PADDLE_RUNNING_PLATFORM": "PADDLE_CLOUD",
+            "PADDLE_JOB_ID": "test_job_auto_2",
+            "PADDLE_EDL_HDFS_HOME": "/usr/local/hadoop-2.7.7",
+            "PADDLE_EDL_HDFS_NAME": "",
+            "PADDLE_EDL_HDFS_UGI": "",
+            "PADDLE_EDL_HDFS_CHECKPOINT_PATH": "auto_checkpoint_2",
+            "PADDLE_EDL_ONLY_FOR_CE_TEST": "1",
+            "PADDLE_EDL_FS_CACHE": ".auto_checkpoint_test_2",
+            "PADDLE_EDL_SAVE_CHECKPOINT_INTER": "0"
+        }
+        os.environ.update(proc_env)
+
+    def test_corner_epoch_no(self):
+        """Run one save pass and one load pass for break epochs 0, 1, 2."""
+        logger.info("begin test_corner_epoch_no")
+        checker = acp._get_checker()
+        fs = HDFSClient(checker.hdfs_home, None)
+
+        try:
+            for i in range(3):
+                # Every round starts from an empty checkpoint directory.
+                fs.delete(checker.hdfs_checkpoint_path)
+                self._reset_generator()
+                self._run_save_0(break_epoch_no=i)
+                self._reset_generator()
+                self._run_load_0(break_epoch_no=i)
+        finally:
+            # Remove the checkpoint directory even when a round raises, so
+            # a failing run does not leave it behind.
+            fs.delete(checker.hdfs_checkpoint_path)
+        logger.info("end test_corner_epoch_no")
+
+
+# Run the tests in this module through unittest when executed as a script.
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_checkpoint_saver.py
+++ b/python/paddle/fluid/tests/unittests/test_checkpoint_saver.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.fluid as fluid
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
+from paddle.fluid.incubate.checkpoint.auto_checkpoint import ExeTrainStatus
+from paddle.fluid.incubate.checkpoint.checkpoint_saver import CheckpointSaver
+import os
+import sys
+
+from paddle.fluid.incubate.fleet.utils.fs import LocalFS
+from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
+from paddle.fluid.incubate.checkpoint.checkpoint_saver import CheckpointSaver
+
+
+class CheckpointerSaverTest(unittest.TestCase):
+    """Drives ``CheckpointSaver`` against directories made via HDFSClient."""
+
+    def test(self):
+        """Count checkpoint numbers before and after adding prefixed dirs."""
+        client = HDFSClient("/usr/local/hadoop-2.7.7", None)
+        root = "./checkpointsaver_test"
+        client.delete(root)
+
+        saver = CheckpointSaver(client)
+
+        # Directories whose names lack the "__paddle_checkpoint__" prefix;
+        # the assertion below expects none of them to be counted.
+        for template in ("{}/exe.exe", "{}/exe.1", "{}/exe"):
+            client.mkdirs(template.format(root))
+
+        self.assertEqual(len(saver.get_checkpoint_no(root)), 0)
+
+        # Two prefixed directories; the assertion below expects exactly
+        # one checkpoint number to be reported for them.
+        for template in ("{}/__paddle_checkpoint__.0",
+                         "{}/__paddle_checkpoint__.exe"):
+            client.mkdirs(template.format(root))
+
+        self.assertEqual(len(saver.get_checkpoint_no(root)), 1)
+
+        # The cleanup routine is invoked twice in a row on the same path.
+        for _ in range(2):
+            saver.clean_redundant_checkpoints(root)
+
+        client.delete(root)
+
+
+# Run the tests in this module through unittest when executed as a script.
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_desc_clone.py
+++ b/python/paddle/fluid/tests/unittests/test_desc_clone.py
@@ -170,7 +170,8 @@ def program_equal(a, b):
                        k))
                    return False
            assert (len(a.blocks) == len(b.blocks))
-
+        elif k == '_auto_checkpoint_name':
+            continue
        elif (v != b.__dict__[k]):
            raise ValueError("In program_equal not equal:{0}\n".format(k))


--- a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
--- a/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py
--- a/python/paddle/fluid/tests/unittests/test_fs_interface.py
+++ b/python/paddle/fluid/tests/unittests/test_fs_interface.py
@@ -15,7 +15,7 @@
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet, TrainStatus
+from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
 import os
 import sys
 import inspect
@@ -38,6 +38,8 @@ class FSTest(unittest.TestCase):
                func(a)
            elif len(args) == 3:
                func(a, a)
+            elif len(args) == 5:
+                func(a, a, a, a)
            print("args:", args, len(args), "func:", func)
            self.assertFalse(True)
        except NotImplementedError as e:

--- a/python/paddle/fluid/tests/unittests/test_hdfs.py
+++ b/python/paddle/fluid/tests/unittests/test_hdfs.py
--- a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -248,7 +248,7 @@ def zeros(shape, dtype=None, name=None):
          
          # shape is a Tensor
          shape = paddle.fill_constant(shape=[2], dtype='int32', value=2)
-          data3 = paddle.ones(shape=shape, dtype='int32') 
+          data3 = paddle.zeros(shape=shape, dtype='int32') 
          # [[0 0]
          #  [0 0]]
    """

--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -178,6 +178,7 @@ packages=['paddle',
          'paddle.fluid.incubate',
          'paddle.fluid.incubate.data_generator',
          'paddle.fluid.incubate.fleet',
+          'paddle.fluid.incubate.checkpoint',
          'paddle.fluid.incubate.fleet.base',
          'paddle.fluid.incubate.fleet.parameter_server',
          'paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler',