Commit c6e0ee6d authored by: J jingqinghe
......@@ -386,7 +386,7 @@ function(cc_test_run TARGET_NAME)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
# No unit test should exceed 2 minutes.
if (APPLE OR WIN32)
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150)
else()
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 120)
endif()
......@@ -748,7 +748,7 @@ function(py_test TARGET_NAME)
endif()
if (APPLE OR WIN32)
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150)
else()
# No unit test should exceed 2 minutes in Linux.
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 120)
......
......@@ -138,12 +138,17 @@ function(op_library TARGET)
# And for detail pybind information, please see generated paddle/pybind/pybind.h.
file(READ ${TARGET}.cc TARGET_CONTENT)
string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}")
string(REGEX MATCH "REGISTER_OPERATOR\\([a-z0-9_]*," one_register "${multi_register}")
# [ \t\r\n]* is used for blank characters
string(REGEX MATCH "REGISTER_OPERATOR\\([ \t\r\n]*[a-z0-9_]*," one_register "${multi_register}")
if (one_register STREQUAL "")
string(REPLACE "_op" "" TARGET "${TARGET}")
else ()
string(REPLACE "REGISTER_OPERATOR(" "" TARGET "${one_register}")
string(REPLACE "," "" TARGET "${TARGET}")
# [ \t\r\n]+ is used for blank characters.
# Here we use '+' instead of '*' since it is a REPLACE operation.
string(REGEX REPLACE "[ \t\r\n]+" "" TARGET "${TARGET}")
endif()
# pybind USE_NO_KERNEL_OP
......
......@@ -102,6 +102,8 @@ if(WITH_MKLDNN)
pass_library(conv_concat_relu_mkldnn_fuse_pass inference DIR mkldnn)
pass_library(conv_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn)
pass_library(scale_matmul_fuse_pass inference DIR mkldnn)
pass_library(cpu_bfloat16_placement_pass inference DIR mkldnn)
pass_library(cpu_bfloat16_pass inference DIR mkldnn)
pass_library(fc_mkldnn_pass inference DIR mkldnn)
pass_library(cpu_quantize_placement_pass base DIR mkldnn)
pass_library(cpu_quantize_pass inference DIR mkldnn)
......@@ -162,4 +164,6 @@ endif()
cc_test(test_cpu_quantize_squash_pass SRCS mkldnn/cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor)
cc_test(test_reshape_transpose_matmul_mkldnn_fuse_pass SRCS mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc DEPS reshape_transpose_matmul_mkldnn_fuse_pass)
cc_test(test_matmul_transpose_reshape_fuse_pass SRCS mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc DEPS matmul_transpose_reshape_fuse_pass)
cc_test(test_cpu_bfloat16_placement_pass SRCS mkldnn/cpu_bfloat16_placement_pass_tester.cc DEPS cpu_bfloat16_placement_pass)
cc_test(test_cpu_bfloat16_pass SRCS mkldnn/cpu_bfloat16_pass_tester.cc DEPS cpu_bfloat16_pass)
endif ()
......@@ -1892,6 +1892,82 @@ PDNode *patterns::QuantizePlacement::operator()(
return op;
}
PDNode *patterns::Bfloat16Placement::operator()(
const std::unordered_set<std::string> &bfloat16_enabled_op_types) {
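// When bfloat16_enabled_op_types is empty (the default pass attribute),
// supported_op_types stays empty and the pattern matches no operators.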
std::unordered_set<std::string> supported_op_types =
std::unordered_set<std::string>();
if (!bfloat16_enabled_op_types.empty()) {
supported_op_types = bfloat16_enabled_op_types;
}
auto *op = pattern->NewNode(op_repr())->assert_is_ops(supported_op_types);
return op;
}
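// Matches a bfloat16 operator whose producer and consumer both run in
// float32: fp32 op -> prev_out -> bf16 op -> op_out -> fp32 op.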
PDNode *patterns::OrphanedBfloat16::operator()() {
auto *prev_op = pattern->NewNode(prev_op_repr())->assert_is_op();
prev_op->assert_more([&](Node *node) {
return node->Op()->GetAttrIfExists<std::string>("mkldnn_data_type") ==
"float32";
});
auto *prev_out = pattern->NewNode(prev_out_repr())->AsOutput();
auto *op = pattern->NewNode(op_repr())->assert_is_op();
op->assert_more([&](Node *node) {
return node->Op()->GetAttrIfExists<std::string>("mkldnn_data_type") ==
"bfloat16";
});
auto *op_out = pattern->NewNode(op_out_repr())->AsOutput();
auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op();
next_op->assert_more([&](Node *node) {
return node->Op()->GetAttrIfExists<std::string>("mkldnn_data_type") ==
"float32";
});
prev_op->LinksTo({prev_out});
op->LinksFrom({prev_out}).LinksTo({op_out});
next_op->LinksFrom({op_out});
return next_op;
}
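// Matches a bfloat16 operator whose output is consumed by a non-bfloat16
// operator, i.e. the last operator of a bfloat16 subgraph.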
PDNode *patterns::LastBfloat16Ops::operator()() {
auto *op = pattern->NewNode(op_repr())->assert_is_op();
op->assert_more([&](Node *node) {
return node->Op()->GetAttrIfExists<std::string>("mkldnn_data_type") ==
"bfloat16";
});
auto *op_out = pattern->NewNode(op_out_repr())->AsOutput();
auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op();
next_op->assert_more([&](Node *node) {
return node->Op()->GetAttrIfExists<std::string>("mkldnn_data_type") !=
"bfloat16";
});
op->LinksTo({op_out});
next_op->LinksFrom({op_out});
return next_op;
}
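// Matches a bfloat16 operator whose input comes from a non-bfloat16
// operator, i.e. the first operator of a bfloat16 subgraph.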
PDNode *patterns::FirstBfloat16Ops::operator()() {
auto *prev_op = pattern->NewNode(prev_op_repr())->assert_is_op();
prev_op->assert_more([&](Node *node) {
return node->Op()->GetAttrIfExists<std::string>("mkldnn_data_type") !=
"bfloat16";
});
auto *op_in = pattern->NewNode(op_in_repr())->AsOutput();
auto *op = pattern->NewNode(op_repr())->assert_is_op();
op->assert_more([&](Node *node) {
return node->Op()->GetAttrIfExists<std::string>("mkldnn_data_type") ==
"bfloat16";
});
prev_op->LinksTo({op_in});
op->LinksFrom({op_in});
return op;
}
PDNode *patterns::MKLDNNInPlace::operator()() {
const std::unordered_set<std::string> &supported_op_types = {
"abs",
......
......@@ -1129,6 +1129,47 @@ struct QuantizePlacement : public PatternBase {
PATTERN_DECL_NODE(op);
};
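// Matches operators listed in the bfloat16_enabled_op_types set, which are
// candidates for bfloat16 placement.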
struct Bfloat16Placement : public PatternBase {
Bfloat16Placement(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "bfloat16_placement") {}
PDNode* operator()(
const std::unordered_set<std::string>& bfloat16_enabled_op_types);
PATTERN_DECL_NODE(op);
};
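// Matches a bfloat16 operator placed between two float32 operators.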
struct OrphanedBfloat16 : public PatternBase {
OrphanedBfloat16(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "orphaned_bfloat16") {}
PDNode* operator()();
PATTERN_DECL_NODE(prev_op);
PATTERN_DECL_NODE(prev_out);
PATTERN_DECL_NODE(op);
PATTERN_DECL_NODE(op_out);
PATTERN_DECL_NODE(next_op);
};
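// Matches the last operator of a bfloat16 subgraph.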
struct LastBfloat16Ops : public PatternBase {
LastBfloat16Ops(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "last_bfloat16_ops") {}
PDNode* operator()();
PATTERN_DECL_NODE(op);
PATTERN_DECL_NODE(op_out);
PATTERN_DECL_NODE(next_op);
};
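// Matches the first operator of a bfloat16 subgraph.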
struct FirstBfloat16Ops : public PatternBase {
FirstBfloat16Ops(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "first_bfloat16_ops") {}
PDNode* operator()();
PATTERN_DECL_NODE(prev_op);
PATTERN_DECL_NODE(op_in);
PATTERN_DECL_NODE(op);
};
// Pattern used for enforcing in-place computation for DNNL ops that
// support it: softmax, batch_norm and layer_norm.
struct MKLDNNInPlace : public PatternBase {
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h"
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
#include "paddle/fluid/string/pretty_log.h"
namespace paddle {
namespace framework {
namespace ir {
using string::PrettyLogDetail;
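// Removes the edge a->b by erasing b from a's outputs and a from b's inputs.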
void UnlinkNodes(ir::Node* a, ir::Node* b) {
a->outputs.erase(std::remove(a->outputs.begin(), a->outputs.end(), b),
a->outputs.end());
b->inputs.erase(std::remove(b->inputs.begin(), b->inputs.end(), a),
b->inputs.end());
}
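// Inserts a quantize op in front of each bfloat16 operator that consumes a
// non-bfloat16 operator's output; conv2d (which converts its input itself)
// and inputs already produced by a quantize op are skipped.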
void CPUBFloat16Pass::SetInputDataType(ir::Graph* graph) const {
GraphPatternDetector gpd;
patterns::FirstBfloat16Ops bfloat16_ops{gpd.mutable_pattern(),
"first_bfloat16_ops"};
bfloat16_ops();
int quantize_counter = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, bfloat16_ops);
GET_IR_NODE_FROM_SUBGRAPH(op_in, op_in, bfloat16_ops);
GET_IR_NODE_FROM_SUBGRAPH(op, op, bfloat16_ops);
if (op->Op()->Type() != "conv2d" && prev_op->Op()->Type() != "quantize") {
VarDesc quantize_out_desc(patterns::PDNodeName("quantize", "out"));
auto* quantize_out_node = g->CreateVarNode(&quantize_out_desc);
// create a quantize op node
OpDesc q_desc;
q_desc.SetType("quantize");
q_desc.SetInput("Input", std::vector<std::string>({op_in->Name()}));
q_desc.SetOutput("Output",
std::vector<std::string>({quantize_out_node->Name()}));
q_desc.SetAttr("Scale", 1.f);
q_desc.SetAttr("bfloat16", true);
q_desc.SetAttr("output_format", Has("data_layout")
? Get<std::string>("data_layout")
: "NCHW");
auto quantize_op = g->CreateOpNode(&q_desc); // OpDesc will be copied.
std::string op_input_name;
for (auto name : op->Op()->InputNames()) {
for (auto input_name : op->Op()->Input(name)) {
if (input_name == op_in->Name()) op_input_name = name;
}
}
PADDLE_ENFORCE_NE(
op_input_name.empty(), true,
platform::errors::NotFound(
"Operator should take the previous operator's output as its input."));
op->Op()->SetInput(op_input_name,
std::vector<std::string>({quantize_out_node->Name()}));
UnlinkNodes(op_in, op);
IR_NODE_LINK_TO(op_in, quantize_op);
IR_NODE_LINK_TO(quantize_op, quantize_out_node);
IR_NODE_LINK_TO(quantize_out_node, op);
quantize_counter++;
}
};
gpd(graph, handler);
PrettyLogDetail("--- added %d quantize op before bfloat16 op",
quantize_counter);
}
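// Converts each trailing bfloat16 operator's output back to float32, either
// by setting force_fp32_output where the op supports it or by appending a
// dequantize op; prior_box is left untouched.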
void CPUBFloat16Pass::SetOutputDataType(ir::Graph* graph) const {
GraphPatternDetector gpd;
patterns::LastBfloat16Ops bfloat16_ops{gpd.mutable_pattern(),
"last_bfloat16_ops"};
bfloat16_ops();
int force_fp32_counter = 0, dequantize_counter = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
GET_IR_NODE_FROM_SUBGRAPH(op, op, bfloat16_ops);
GET_IR_NODE_FROM_SUBGRAPH(op_out, op_out, bfloat16_ops);
GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, bfloat16_ops);
if ((op->Op()->HasAttr("force_fp32_output") ||
op->Op()->HasProtoAttr("force_fp32_output")) &&
!op->Op()->GetAttrIfExists<bool>("fuse_residual_connection")) {
op->Op()->SetAttr("force_fp32_output", true);
force_fp32_counter++;
} else if (op->Op()->Type() != "prior_box") {
// Create dequantize input variable
VarDesc dequantize_in_desc(patterns::PDNodeName("dequantize", "in"));
auto* dequantize_in_node = g->CreateVarNode(&dequantize_in_desc);
// create a dequantize op node for output.
OpDesc deq_desc;
deq_desc.SetType("dequantize");
deq_desc.SetInput("Input",
std::vector<std::string>({dequantize_in_node->Name()}));
deq_desc.SetOutput("Output", std::vector<std::string>({op_out->Name()}));
deq_desc.SetAttr("Scale", 1.0f);
auto dequantize_op = g->CreateOpNode(&deq_desc);
std::string op_output_name;
for (auto name : op->Op()->OutputNames()) {
for (auto output_name : op->Op()->Output(name)) {
if (output_name == op_out->Name()) op_output_name = name;
}
}
PADDLE_ENFORCE_NE(
op_output_name.empty(), true,
platform::errors::NotFound(
"Operator should produce the output consumed by the next operator."));
op->Op()->SetOutput(op_output_name, std::vector<std::string>(
{dequantize_in_node->Name()}));
UnlinkNodes(op, op_out);
IR_NODE_LINK_TO(op, dequantize_in_node);
IR_NODE_LINK_TO(dequantize_in_node, dequantize_op);
IR_NODE_LINK_TO(dequantize_op, op_out);
dequantize_counter++;
}
};
gpd(graph, handler);
PrettyLogDetail("--- added %d dequantize op and used %d force_fp32_output",
dequantize_counter, force_fp32_counter);
}
void CPUBFloat16Pass::ApplyImpl(ir::Graph* graph) const {
SetInputDataType(graph);
SetOutputDataType(graph);
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(cpu_bfloat16_pass, paddle::framework::ir::CPUBFloat16Pass);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
class CPUBFloat16Pass : public Pass {
protected:
void SetInputDataType(ir::Graph* graph) const;
void SetOutputDataType(ir::Graph* graph) const;
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h"
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/imperative/type_defs.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace framework {
namespace ir {
void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs, bool use_mkldnn,
const std::string& mkldnn_data_type = "float32",
const bool force_fp32_output = false) {
auto* op = prog->MutableBlock(0)->AppendOp();
op->SetType(type);
op->SetAttr("use_mkldnn", use_mkldnn);
op->SetAttr("name", name);
if (type == "conv2d") {
op->SetInput("Input", {inputs[0]});
op->SetOutput("Output", {outputs[0]});
op->SetAttr("mkldnn_data_type", mkldnn_data_type);
op->SetAttr("force_fp32_output", force_fp32_output);
} else if (type == "pool2d" || type == "transpose2" || type == "reshape2" ||
type == "dropout") {
op->SetInput("X", {inputs[0]});
op->SetOutput("Out", {outputs[0]});
op->SetAttr("mkldnn_data_type", mkldnn_data_type);
} else if (type == "fc") {
op->SetInput("Input", {inputs[0]});
op->SetOutput("Out", {outputs[0]});
op->SetAttr("mkldnn_data_type", mkldnn_data_type);
} else if (type == "concat") {
op->SetInput("X", inputs);
op->SetOutput("Out", outputs);
op->SetAttr("mkldnn_data_type", mkldnn_data_type);
} else if (type == "matmul" || type == "elementwise_add") {
op->SetInput("X", {inputs[0]});
if (inputs.size() > 1) op->SetInput("Y", {inputs[1]});
op->SetOutput("Out", {outputs[0]});
op->SetAttr("mkldnn_data_type", mkldnn_data_type);
}
}
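// Applies cpu_bfloat16_pass to the graph and records the node counts before
// and after the pass runs.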
void PreparePass(std::unique_ptr<ir::Graph>* graph, const ProgramDesc& prog,
const std::initializer_list<std::string> variable_names,
int* original_nodes_num, int* current_nodes_num) {
auto pass = PassRegistry::Instance().Get("cpu_bfloat16_pass");
*original_nodes_num = (*graph)->Nodes().size();
(*graph).reset(pass->Apply((*graph).release()));
*current_nodes_num = (*graph)->Nodes().size();
}
static const std::initializer_list<std::string> variable_names{
"z", "a", "b", "c", "d", "e", "f", "g", "h", "i"};
ProgramDesc BuildProgramDesc(bool use_mkldnn) {
ProgramDesc prog;
for (auto& v : variable_names) {
prog.MutableBlock(0)->Var(v);
}
SetOp(&prog, "dropout", "Dropout1", {"z"}, {"a"}, use_mkldnn, "float32");
SetOp(&prog, "conv2d", "Conv1", {"a"}, {"b"}, use_mkldnn, "bfloat16");
SetOp(&prog, "pool2d", "Pool1", {"b"}, {"c"}, use_mkldnn, "bfloat16");
SetOp(&prog, "conv2d", "Conv1", {"c"}, {"d"}, use_mkldnn, "bfloat16");
SetOp(&prog, "dropout", "Dropout2", {"d"}, {"e"}, use_mkldnn, "float32");
SetOp(&prog, "transpose2", "Transpose1", {"e"}, {"f"}, use_mkldnn,
"bfloat16");
SetOp(&prog, "reshape2", "Reshape1", {"f"}, {"g"}, use_mkldnn, "bfloat16");
SetOp(&prog, "concat", "Concat1", {"g"}, {"h"}, use_mkldnn, "bfloat16");
SetOp(&prog, "dropout", "Dropout3", {"h"}, {"i"}, use_mkldnn, "float32");
return prog;
}
void MainTest(const ProgramDesc& prog, int conv_count, int pool_count,
int transpose_count, int quant_count, int dequant_count,
int added_nodes_count) {
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
int original_nodes_num, current_nodes_num;
PreparePass(&graph, prog, variable_names, &original_nodes_num,
&current_nodes_num);
int quantize_nodes_count = 0;
int dequantize_nodes_count = 0;
int conv2d_nodes_count = 0;
int pool2d_nodes_count = 0;
int transpose2_nodes_count = 0;
for (auto* node : graph->Nodes()) {
if (node->IsOp()) {
auto* op = node->Op();
if (op->Type() == "conv2d") {
conv2d_nodes_count++;
} else if (op->Type() == "pool2d") {
pool2d_nodes_count++;
} else if (op->Type() == "transpose2") {
transpose2_nodes_count++;
} else if (op->Type() == "quantize") {
quantize_nodes_count++;
} else if (op->Type() == "dequantize") {
dequantize_nodes_count++;
}
}
}
EXPECT_EQ(conv2d_nodes_count, conv_count);
EXPECT_EQ(pool2d_nodes_count, pool_count);
EXPECT_EQ(transpose2_nodes_count, transpose_count);
EXPECT_EQ(quantize_nodes_count, quant_count);
EXPECT_EQ(dequantize_nodes_count, dequant_count);
EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num);
}
TEST(CpuBfloat16Pass, quantize) {
bool use_mkldnn = true;
// 1 quantize + 1 dequantize
int added_nodes = 2;
MainTest(BuildProgramDesc(use_mkldnn), 2, 1, 1, 1, 2, added_nodes);
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(cpu_bfloat16_pass);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h"
#include <string>
#include <unordered_set>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
#include "paddle/fluid/string/pretty_log.h"
namespace paddle {
namespace framework {
namespace ir {
using string::PrettyLogDetail;
void CPUBfloat16PlacementPass::SetMkldnnDataType(
ir::Graph* graph, int* bfloat16_operators) const {
const auto& op_types_list =
Get<std::unordered_set<std::string>>("bfloat16_enabled_op_types");
// Set mkldnn_data_type to bfloat16 for all operators that are listed in
// the bfloat16_enabled_op_types set, i.e. those matched by the
// Bfloat16Placement pattern
GraphPatternDetector gpd;
patterns::Bfloat16Placement bfloat16_placement_pattern{gpd.mutable_pattern(),
"bfloat16_placement"};
bfloat16_placement_pattern(op_types_list);
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
GET_IR_NODE_FROM_SUBGRAPH(op, op, bfloat16_placement_pattern);
if ((op->Op()->HasAttr("mkldnn_data_type") ||
op->Op()->HasProtoAttr("mkldnn_data_type")) &&
!platform::HasOpINT8DataType(op->Op())) {
op->Op()->SetAttr("mkldnn_data_type", std::string("bfloat16"));
(*bfloat16_operators)++;
}
};
gpd(graph, handler);
}
void CPUBfloat16PlacementPass::RemoveOrphanedOperators(
ir::Graph* graph, int* bfloat16_operators) const {
// Find orphaned bfloat16 operators, i.e. those between two float32
// operators, and revert their mkldnn_data_type attribute to float32.
GraphPatternDetector gpd;
patterns::OrphanedBfloat16 orphaned_bfloat16_pattern{gpd.mutable_pattern(),
"orphaned_bfloat16"};
orphaned_bfloat16_pattern();
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
GET_IR_NODE_FROM_SUBGRAPH(op, op, orphaned_bfloat16_pattern);
op->Op()->SetAttr("mkldnn_data_type", std::string("float32"));
(*bfloat16_operators)--;
};
gpd(graph, handler);
}
void CPUBfloat16PlacementPass::ApplyImpl(ir::Graph* graph) const {
int bfloat16_operators = 0;
SetMkldnnDataType(graph, &bfloat16_operators);
RemoveOrphanedOperators(graph, &bfloat16_operators);
PrettyLogDetail("--- marked %d operators to bfloat16 ",
bfloat16_operators);
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(cpu_bfloat16_placement_pass,
paddle::framework::ir::CPUBfloat16PlacementPass)
// a set of operator type names with bfloat16 support ("conv2d" etc.)
// the second param is the default value for this set
.DefaultPassAttr("bfloat16_enabled_op_types",
new std::unordered_set<std::string>());
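// A minimal usage sketch (mirroring the tester below; the enabled op types
// here are illustrative):
//
//   auto pass = framework::ir::PassRegistry::Instance().Get(
//       "cpu_bfloat16_placement_pass");
//   pass->Set("bfloat16_enabled_op_types",
//             new std::unordered_set<std::string>({"conv2d", "pool2d"}));
//   graph.reset(pass->Apply(graph.release()));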
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
/*
* Specifies which operators should be run on bfloat16.
*/
class CPUBfloat16PlacementPass : public Pass {
protected:
void SetMkldnnDataType(ir::Graph* graph, int* bfloat16_operators) const;
void RemoveOrphanedOperators(ir::Graph* graph, int* bfloat16_operators) const;
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
namespace paddle {
namespace framework {
namespace ir {
void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs,
const std::string& mkldnn_data_type = "float32") {
auto* op = prog->MutableBlock(0)->AppendOp();
op->SetType(type);
op->SetAttr("mkldnn_data_type", mkldnn_data_type);
if (type == "conv2d") {
op->SetAttr("name", name);
op->SetInput("Input", {inputs[0]});
} else if (type == "relu") {
op->SetInput("X", inputs);
} else if (type == "concat") {
op->SetAttr("axis", 1);
op->SetInput("X", {inputs[0], inputs[1]});
} else if (type == "pool2d") {
op->SetInput("X", {inputs[0]});
} else {
FAIL() << "Unexpected operator type.";
}
op->SetOutput("Out", {outputs[0]});
}
// operator mkldnn_data_type
// ---------------------------------------
// (a,b)->concat->c float32
// c->conv->f float32
// f->relu->g float32
// g->pool->h float32
// h->conv->k float32
// k->pool->l float32
ProgramDesc BuildProgramDesc() {
ProgramDesc prog;
for (auto& v :
std::vector<std::string>({"a", "b", "c", "f", "g", "h", "k", "l"})) {
prog.MutableBlock(0)->Var(v);
}
SetOp(&prog, "concat", "concat1", {"a", "b"}, {"c"});
SetOp(&prog, "conv2d", "conv1", {"c"}, {"f"});
SetOp(&prog, "relu", "relu1", {"f"}, {"g"});
SetOp(&prog, "pool2d", "pool1", {"g"}, {"h"});
SetOp(&prog, "conv2d", "conv2", {"h"}, {"k"});
SetOp(&prog, "pool2d", "pool2", {"k"}, {"l"});
return prog;
}
void MainTest(std::initializer_list<std::string> bfloat16_enabled_op_types,
unsigned expected_bfloat16_data_type_count) {
auto prog = BuildProgramDesc();
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
auto pass = PassRegistry::Instance().Get("cpu_bfloat16_placement_pass");
pass->Set("bfloat16_enabled_op_types",
new std::unordered_set<std::string>(bfloat16_enabled_op_types));
graph.reset(pass->Apply(graph.release()));
unsigned bfloat16_data_type_count = 0;
for (auto* node : graph->Nodes()) {
if (node->IsOp()) {
if (platform::HasOpBFLOAT16DataType(node->Op())) {
++bfloat16_data_type_count;
}
}
}
EXPECT_EQ(bfloat16_data_type_count, expected_bfloat16_data_type_count);
}
void DefaultAttrTest(unsigned expected_bfloat16_data_type_count) {
auto prog = BuildProgramDesc();
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
auto pass = PassRegistry::Instance().Get("cpu_bfloat16_placement_pass");
graph.reset(pass->Apply(graph.release()));
unsigned bfloat16_data_type_count = 0;
for (auto* node : graph->Nodes()) {
if (node->IsOp()) {
if (platform::HasOpBFLOAT16DataType(node->Op())) {
++bfloat16_data_type_count;
}
}
}
EXPECT_EQ(bfloat16_data_type_count, expected_bfloat16_data_type_count);
}
TEST(Bfloat16PlacementPass, enable_all) {
MainTest({"conv2d", "pool2d", "relu", "concat"}, 6);
}
TEST(Bfloat16PlacementPass, enabled_conv_and_pool) {
// 2 conv2d + 2 pool2d - 1 orphaned conv2d
MainTest({"conv2d", "pool2d"}, 3);
}
TEST(Bfloat16PlacementPass, default_attr_value) { DefaultAttrTest(0); }
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(cpu_bfloat16_placement_pass);
......@@ -20,6 +20,7 @@
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle {
namespace framework {
......@@ -145,3 +146,11 @@ void TransposeFlattenConcatFusePass::ApplyImpl(ir::Graph *graph) const {
REGISTER_PASS(transpose_flatten_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass);
REGISTER_PASS_CAPABILITY(transpose_flatten_concat_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("transpose", 0)
.EQ("transpose2", 0)
.EQ("flatten", 0)
.EQ("concat", 0)
.EQ("fusion_transpose_flatten_concat", 0));
......@@ -69,7 +69,8 @@ class OpInfo {
const OpCreator& Creator() const {
PADDLE_ENFORCE_NOT_NULL(creator_,
"Operator's Creator has not been registered");
platform::errors::NotFound(
"Operator's Creator has not been registered."));
return creator_;
}
......@@ -79,11 +80,12 @@ class OpInfo {
std::string type = proto_ ? proto_->type() : "unknown";
PADDLE_ENFORCE_NOT_NULL(
grad_op_maker_,
platform::errors::NotFound(
"Operator %s's GradOpMaker has not been "
"registered.\nPlease check whether %s_op has "
"grad_op.\nIf not, please set stop_gradient to True "
"registered.\nPlease check whether (%s) operator has "
"gradient operator.\nIf not, please set stop_gradient to be True "
"for its input and output variables using var.stop_gradient=True.",
type.c_str(), type.c_str());
type.c_str(), type.c_str()));
return grad_op_maker_;
}
......@@ -100,11 +102,12 @@ class OpInfo {
std::string type = proto_ ? proto_->type() : "unknown";
PADDLE_ENFORCE_NOT_NULL(
dygraph_grad_op_maker_,
platform::errors::NotFound(
"Operator %s's DygraphGradOpMaker has not been "
"registered.\nPlease check whether %s_op has "
"grad_op.\nIf not, please set stop_gradient to True "
"registered.\nPlease check whether (%s) operator has "
"gradient operator.\nIf not, please set stop_gradient to be True "
"for its input and output variables using var.stop_gradient=True.",
type.c_str(), type.c_str());
type.c_str(), type.c_str()));
return dygraph_grad_op_maker_;
}
......@@ -130,14 +133,17 @@ class OpInfoMap {
}
void Insert(const std::string& type, const OpInfo& info) {
PADDLE_ENFORCE(!Has(type), "Operator %s has been registered", type);
PADDLE_ENFORCE_NE(Has(type), true,
platform::errors::AlreadyExists(
"Operator (%s) has been registered.", type));
map_.insert({type, info});
}
const OpInfo& Get(const std::string& type) const {
auto op_info_ptr = GetNullable(type);
PADDLE_ENFORCE_NOT_NULL(op_info_ptr, "Operator %s has not been registered",
type);
PADDLE_ENFORCE_NOT_NULL(
op_info_ptr,
platform::errors::NotFound("Operator (%s) is not registered.", type));
return *op_info_ptr;
}
......
......@@ -33,10 +33,18 @@ size_t OpKernelType::Hash::operator()(const OpKernelType& key) const {
cur_loc += OpKernelType::kLibBits;
int customized_value = key.customized_type_value_;
PADDLE_ENFORCE(customized_value < (1 << OpKernelType::kCustomizeBits));
PADDLE_ENFORCE_LT(customized_value, (1 << OpKernelType::kCustomizeBits),
platform::errors::Unavailable(
"Too many custom OpKernel attribute values, expected "
"maximum value is %d, received value is %d.",
(1 << OpKernelType::kCustomizeBits), customized_value));
customized_value = customized_value << cur_loc;
cur_loc += OpKernelType::kCustomizeBits;
PADDLE_ENFORCE(cur_loc < 64);
PADDLE_ENFORCE_LT(cur_loc, 64,
platform::errors::Unavailable(
"Too many OpKernel attribute values, expected maximum "
"value is 64, received value is %d.",
cur_loc));
std::hash<int> hasher;
return hasher(place + data_type + data_layout + library_type +
......
......@@ -43,7 +43,9 @@ OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddOutput(
void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() {
std::unordered_set<std::string> names;
auto checker = [&](const std::string& name) {
PADDLE_ENFORCE(!names.count(name), "[%s] is duplicated", name);
PADDLE_ENFORCE_EQ(
names.count(name), 0,
platform::errors::AlreadyExists("Attribute [%s] is duplicated.", name));
names.insert(name);
};
for (auto& attr : proto_->attrs()) {
......
......@@ -54,9 +54,10 @@ class Registrar {
template <typename... ARGS>
struct OperatorRegistrar : public Registrar {
explicit OperatorRegistrar(const char* op_type) {
if (OpInfoMap::Instance().Has(op_type)) {
PADDLE_THROW("'%s' is registered more than once.", op_type);
}
PADDLE_ENFORCE_EQ(
OpInfoMap::Instance().Has(op_type), false,
platform::errors::AlreadyExists(
"Operator '%s' is registered more than once.", op_type));
static_assert(sizeof...(ARGS) != 0,
"OperatorRegistrar should be invoked at least by OpClass");
OpInfo info;
......
......@@ -58,7 +58,8 @@ class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
AddInput("input", "input of cosine op").AsDuplicable();
AddOutput("output", "output of cosine op").AsIntermediate();
auto my_checker = [](int i) {
PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!");
PADDLE_ENFORCE_EQ(i % 2, 0, platform::errors::InvalidArgument(
"'test_attr' must be even!"));
};
AddAttr<int>("test_attr", "a simple test attribute")
.AddCustomChecker(my_checker);
......
......@@ -152,10 +152,10 @@ class OpVersionRegistrar {
return instance;
}
OpVersion& Register(const std::string& op_type) {
if (op_version_map_.find(op_type) != op_version_map_.end()) {
PADDLE_THROW("'%s' is registered in operator version more than once.",
op_type);
}
PADDLE_ENFORCE_EQ(
op_version_map_.find(op_type), op_version_map_.end(),
platform::errors::AlreadyExists(
"'%s' is registered in operator version more than once.", op_type));
op_version_map_.insert({op_type, OpVersion()});
return op_version_map_[op_type];
}
......
This diff is collapsed.
......@@ -495,9 +495,9 @@ TEST(IndicateVarDataTypeTest, other) {
EXPECT_TRUE(
ex_msg.find(
"The Input Variable(Other) of "
"indicate_other_data_type_test Op used to "
"(indicate_other_data_type_test) Operator used to "
"determine kernel data type "
"is empty or not LoDTensor or SelectedRows or LoDTensorArray") !=
"is empty or not LoDTensor or SelectedRows or LoDTensorArray.") !=
std::string::npos);
}
ASSERT_TRUE(caught);
......
......@@ -20,7 +20,10 @@ namespace framework {
void ReaderBase::ReadNext(std::vector<LoDTensor> *out) {
std::lock_guard<std::mutex> lock(mu_);
PADDLE_ENFORCE_EQ(status_, ReaderStatus::kRunning);
PADDLE_ENFORCE_EQ(status_, ReaderStatus::kRunning,
platform::errors::Unavailable(
"The current reader has stopped running and cannot "
"continue to read the next batch of data."));
ReadNextImpl(out);
}
......
......@@ -32,17 +32,21 @@ struct RWLock {
~RWLock() { pthread_rwlock_destroy(&lock_); }
inline void RDLock() {
PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0,
"acquire read lock failed");
PADDLE_ENFORCE_EQ(
pthread_rwlock_rdlock(&lock_), 0,
platform::errors::External("The pthread failed to acquire read lock."));
}
inline void WRLock() {
PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0,
"acquire write lock failed");
platform::errors::External(
"The pthread failed to acquire write lock."));
}
inline void UNLock() {
PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed");
PADDLE_ENFORCE_EQ(
pthread_rwlock_unlock(&lock_), 0,
platform::errors::External("The pthread failed to unlock."));
}
private:
......
......@@ -33,7 +33,8 @@ void CheckInStreamState(std::istream& istre, size_t length) {
VLOG(5) << "Can't read [" << length << "] from file"
<< "file seems breakem";
PADDLE_THROW("Model load error, file seems breaken");
PADDLE_THROW(platform::errors::Unavailable(
"Model load failed, istream state error."));
}
}
......@@ -58,10 +59,11 @@ size_t ReadTensorNumber(std::istream& istre) {
sizeof(char) * tensor_number_mark.size());
std::string str_read_tensor_number_mark(tensor_number_mark_buffer,
tensor_number_mark.size());
PADDLE_ENFORCE_EQ(
tensor_number_mark, str_read_tensor_number_mark,
"Tensor number mark not match, expect [%s], but read from file is [%]",
tensor_number_mark, str_read_tensor_number_mark);
PADDLE_ENFORCE_EQ(tensor_number_mark, str_read_tensor_number_mark,
platform::errors::InvalidArgument(
"Tensor number mark does not match, expect mark is "
"[%s], but the mark read from file is [%s].",
tensor_number_mark, str_read_tensor_number_mark));
size_t tensor_number = 0;
istre.read(reinterpret_cast<char*>(&tensor_number), sizeof(tensor_number));
......@@ -79,10 +81,11 @@ std::string ReadTensorName(std::istream& istre) {
std::string str_read_tensor_name_mark(name_mark_buffer,
tensor_name_mark.size());
PADDLE_ENFORCE_EQ(
tensor_name_mark, str_read_tensor_name_mark,
"Tensor name mark not match, expect [%s], but read from file is [%]",
tensor_name_mark, str_read_tensor_name_mark);
PADDLE_ENFORCE_EQ(tensor_name_mark, str_read_tensor_name_mark,
platform::errors::InvalidArgument(
"Tensor name mark does not match, expect mark is [%s], "
"but the mark read from file is [%s].",
tensor_name_mark, str_read_tensor_name_mark));
size_t tensor_name_length = 0;
istre.read(reinterpret_cast<char*>(&tensor_name_length),
......@@ -117,16 +120,18 @@ bool SaveStaticNameListToDisk(
for (size_t i = 0; i < vec_tensor_name_list.size(); ++i) {
auto var_ptr = scope.FindVar(vec_tensor_name_list[i]);
PADDLE_ENFORCE_NE(
var_ptr, nullptr,
"Variable find error, when save model, can't not find vairable [%s], "
"Please make sure you have run StartUpProgram",
vec_tensor_name_list[i]);
PADDLE_ENFORCE_NOT_NULL(
var_ptr, platform::errors::NotFound("Variable (%s) is not found when "
"saving model, please make sure "
"that exe.run(startup_program) has "
"been executed.",
vec_tensor_name_list[i]));
Tensor* tensor = var_ptr->GetMutable<LoDTensor>();
PADDLE_ENFORCE_EQ(tensor->IsInitialized(), true,
"Paramter [%s] not initialzed,"
"Please make sure you have run StartUpProgram",
vec_tensor_name_list[i]);
platform::errors::PreconditionNotMet(
"Paramter [%s] is not initialzed, please make sure "
"that exe.run(startup_program) has been executed.",
vec_tensor_name_list[i]));
map_tensor[vec_tensor_name_list[i]] = tensor;
}
......@@ -145,9 +150,10 @@ bool SaveDygraphVarBaseListToDisk(
Tensor* tensor = var_ptr->GetMutable<LoDTensor>();
PADDLE_ENFORCE_EQ(tensor->IsInitialized(), true,
"Paramter [%s] not initialzed,"
"Please make sure you have run StartUpProgram",
vec_var_base_list[i]->Name());
platform::errors::PreconditionNotMet(
"Paramter [%s] is not initialzed, please make sure "
"that exe.run(startup_program) has been executed.",
vec_var_base_list[i]->Name()));
map_tensor[vec_var_base_list[i]->Name()] = tensor;
}
......@@ -185,34 +191,41 @@ bool LoadStaticNameListFromDisk(
for (size_t i = 0; i < vec_tensor_name_list.size(); ++i) {
auto it = map_load_tensor.find(vec_tensor_name_list[i]);
PADDLE_ENFORCE(it != map_load_tensor.end(),
"Paramete not found in Model file, "
"Can not find [%s] in model file [%s]",
vec_tensor_name_list[i], file_name);
PADDLE_ENFORCE_NE(it, map_load_tensor.end(),
platform::errors::NotFound(
"Parameter (%s) not found in model file (%s).",
vec_tensor_name_list[i], file_name));
auto var_ptr = scope.FindVar(vec_tensor_name_list[i]);
PADDLE_ENFORCE_NE(
var_ptr, nullptr,
"Parameter not created, when load model, can't not find parameter [%s] "
"please make sure you have run StartUpProgram",
vec_tensor_name_list[i]);
PADDLE_ENFORCE_NOT_NULL(
var_ptr,
platform::errors::PreconditionNotMet(
"Parameter (%s) is not created when loading model, "
"please make sure that exe.run(startup_program) has been executed.",
vec_tensor_name_list[i]));
Tensor* tensor = var_ptr->GetMutable<LoDTensor>();
PADDLE_ENFORCE_NE(tensor, nullptr,
"Paramter [%s] not initialzed "
"please make sure you have run startUpProgram",
vec_tensor_name_list[i]);
PADDLE_ENFORCE_NOT_NULL(
tensor,
platform::errors::PreconditionNotMet(
"Paramter [%s] is not initialzed, "
"please make sure that exe.run(startup_program) has been executed.",
vec_tensor_name_list[i]));
PADDLE_ENFORCE_EQ(tensor->IsInitialized(), true,
"Paramter [%s] not initialzed "
"please make sure you have run StartUpProgram",
vec_tensor_name_list[i]);
platform::errors::PreconditionNotMet(
"Paramter [%s] is not initialzed, "
"please make sure that exe.run(startup_program) has "
"been executed.v",
vec_tensor_name_list[i]));
PADDLE_ENFORCE_EQ(
tensor->dims(), it->second->dims(),
"Shape not matching: the Program requires a parameter with a shape of "
"(%s), "
"while the loaded parameter (namely [ %s ]) has a shape of (%s).",
tensor->dims(), vec_tensor_name_list[i], it->second->dims());
platform::errors::InvalidArgument(
"Shape does not match, the program requires a parameter with a "
"shape of "
"(%s), while the loaded parameter (namely [ %s ]) has a shape of "
"(%s).",
tensor->dims(), vec_tensor_name_list[i], it->second->dims()));
TensorCopySync(*(it->second.get()), tensor->place(), tensor);
......@@ -239,9 +252,9 @@ bool SaveTensorToDisk(const std::string& file_name,
MkDirRecursively(DirName(file_name).c_str());
std::ofstream fout(file_name, std::ios::binary);
if (!fout) {
PADDLE_THROW("File open error. Can not open file [%s]", file_name);
}
PADDLE_ENFORCE_EQ(
fout.is_open(), true,
platform::errors::Unavailable("File (%s) open failed.", file_name));
// first 256 byte for reserve for fulture upgrade
char* kReserveBuffer = new char[model_file_reserve_size];
......@@ -292,9 +305,8 @@ bool SaveTensorToDisk(const std::string& file_name,
TensorCopySync(*tensor, platform::CPUPlace(), &temp);
data_ptr = temp.data<void>();
#else
PADDLE_THROW(
"Tensor is in CUDA device, but paddle not compile with CUDA, this "
"should not happen");
PADDLE_THROW(platform::errors::Unavailable(
"Tensor is in CUDA device, but paddle not compiled with CUDA."));
#endif
}
fout.write(static_cast<const char*>(data_ptr),
......@@ -302,8 +314,9 @@ bool SaveTensorToDisk(const std::string& file_name,
}
if (!fout) {
PADDLE_THROW("Model save failed, data write to model file [%s] error",
file_name);
PADDLE_THROW(platform::errors::Unavailable(
"Model save failed, error when writing data into model file [%s].",
file_name));
}
fout.close();
......@@ -316,9 +329,9 @@ bool LoadTensorFromDisk(
std::map<std::string, std::shared_ptr<Tensor>>* map_tensor) {
std::ifstream fin(file_name, std::ios::binary);
if (!fin) {
PADDLE_THROW("File open error. Can not open model file [%s]", file_name);
}
PADDLE_ENFORCE_EQ(
fin.is_open(), true,
platform::errors::Unavailable("File (%s) open failed.", file_name));
ReadReserveBuffer(fin);
......@@ -331,7 +344,8 @@ bool LoadTensorFromDisk(
uint32_t version;
fin.read(reinterpret_cast<char*>(&version), sizeof(version));
CheckInStreamState(fin, sizeof(version));
PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
PADDLE_ENFORCE_EQ(version, 0U, platform::errors::InvalidArgument(
"Only version 0 tensor is supported."));
proto::VarType::TensorDesc desc;
{
// int32_t size
......@@ -344,7 +358,7 @@ bool LoadTensorFromDisk(
CheckInStreamState(fin, sizeof(size));
PADDLE_ENFORCE_EQ(
desc.ParseFromArray(buf.get(), size), true,
platform::errors::InvalidArgument("Cannot parse tensor desc"));
platform::errors::InvalidArgument("Parse tensor desc failed."));
}
{ // read tensor
......
......@@ -113,7 +113,9 @@ void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows,
// the 1st field, unit32_t version for SelectedRows
uint32_t version;
is.read(reinterpret_cast<char*>(&version), sizeof(version));
PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
PADDLE_ENFORCE_EQ(version, 0U,
platform::errors::InvalidArgument(
"Only version 0 SelectedRows is supported."));
}
{
// the 2st field, rows information
......@@ -155,24 +157,27 @@ int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown,
auto iter = id_to_index_.find(key);
if (iter == id_to_index_.end()) {
rwlock_->UNLock();
if (!auto_grown) {
PADDLE_THROW("key %d not found", key);
}
PADDLE_ENFORCE_EQ(
auto_grown, true,
platform::errors::NotFound("Input key(%lld) is not found.", key));
rwlock_->WRLock();
auto map_size = id_to_index_.size();
auto vector_size = rows_.size();
if (map_size != vector_size) {
rwlock_->UNLock();
PADDLE_THROW(
"id_to_index_ size %d should have the same size with rows_ %d",
map_size, vector_size);
PADDLE_THROW(platform::errors::InvalidArgument(
"Row map size(%zu) should be equal to rows size(%zu).", map_size,
vector_size));
}
auto write_iter = id_to_index_.find(key);
if (write_iter == id_to_index_.end()) {
int row_num = rows_.size();
if (row_num == value_->dims()[0]) {
rwlock_->UNLock();
PADDLE_THROW("selected rows is full, then length exceed %d", row_num);
PADDLE_THROW(platform::errors::InvalidArgument(
"Selected rows is full, then length exceed the length of first "
"dimension (%d).",
row_num));
}
// key logic to put a key into id_to_index_
rows_.push_back(key);
......@@ -203,15 +208,20 @@ void SelectedRows::SyncIndex() {
void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value,
bool auto_grown, bool is_test) {
PADDLE_ENFORCE(value->IsInitialized(),
"The value tensor should be initialized.");
PADDLE_ENFORCE_EQ(value->IsInitialized(), true,
platform::errors::InvalidArgument(
"The value tensor is not initialized."));
if (ids.numel() == 0) {
VLOG(3) << "keys is empty, please check data!";
} else {
int64_t value_width = value_->numel() / value_->dims()[0];
PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0],
"output tensor should have the same shape with table "
"except the dims[0].");
PADDLE_ENFORCE_EQ(
value_width, value->numel() / value->dims()[0],
platform::errors::InvalidArgument(
"Output tensor should have the same shape with table "
"except the first dimmension, excepted value width not counting "
"the first dimension is %d, actual value width is %d.",
value_width, value->numel() / value->dims()[0]));
for (int i = 0; i < ids.numel(); ++i) {
auto id = ids.data<int64_t>()[i];
int64_t index = AutoGrownIndex(id, auto_grown, is_test);
......
......@@ -82,7 +82,8 @@ class SelectedRows {
int64_t Index(int64_t key) const {
auto it = std::find(rows_.begin(), rows_.end(), key);
if (it == rows_.end()) {
PADDLE_THROW("id %s not in table", key);
PADDLE_THROW(platform::errors::NotFound(
"Input id (%lld) is not in current rows table.", key));
}
return static_cast<int64_t>(std::distance(rows_.begin(), it));
}
......
......@@ -25,20 +25,22 @@ namespace framework {
std::vector<DDim> InferShapeContext::GetReaderDims(
const std::string &name) const {
const std::vector<std::string> &arg_names = Inputs(name);
PADDLE_ENFORCE_EQ(
arg_names.size(), 1UL,
"Reader input '%s' should hold one element, but now it holds %d", name,
arg_names.size());
PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
platform::errors::InvalidArgument(
"Reader input '%s' should hold one element, but now it "
"holds %d elements.",
name, arg_names.size()));
return this->GetRepeatedDims(arg_names[0]);
}
void InferShapeContext::SetReaderDims(const std::string &name,
const std::vector<DDim> &dims) {
const std::vector<std::string> &arg_names = Outputs(name);
PADDLE_ENFORCE_EQ(
arg_names.size(), 1UL,
"Reader output '%s' should hold one element, but now it holds %d", name,
arg_names.size());
PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
platform::errors::InvalidArgument(
"Reader output '%s' should hold one element, but now "
"it holds %d elements.",
name, arg_names.size()));
return this->SetRepeatedDims(arg_names[0], dims);
}
......
......@@ -94,9 +94,17 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place);
auto dst_cpu_place = BOOST_GET_CONST(platform::CPUPlace, dst_place);
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), true);
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx_place), true,
platform::errors::PreconditionNotMet(
"Context place error, excepted GPUPlace, but actually %s.",
ctx_place));
auto ctx_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx_place);
PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place,
platform::errors::Unavailable(
"Source place and context place do not match, source "
"place is %s, context place is %s.",
src_gpu_place, ctx_gpu_place));
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
......@@ -106,9 +114,17 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
auto src_cpu_place = BOOST_GET_CONST(platform::CPUPlace, src_place);
auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place);
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), true);
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx_place), true,
platform::errors::PreconditionNotMet(
"Context place error, excepted GPUPlace, but actually %s.",
ctx_place));
auto ctx_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx_place);
PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place,
platform::errors::Unavailable(
"Destination place and context place do not match, "
"destination place is %s, context place is %s.",
dst_gpu_place, ctx_gpu_place));
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
......@@ -164,7 +180,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place);
auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place);
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), true);
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx_place), true,
platform::errors::PreconditionNotMet(
"Context place error, excepted GPUPlace, but actually %s.",
ctx_place));
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
if (platform::is_same_place(src_place, dst_place)) {
......@@ -180,12 +200,14 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
stream);
} else {
PADDLE_THROW("ctx is not belong to dst_gpu_place or src_gpu_place.");
PADDLE_THROW(platform::errors::Unavailable(
"Context place dose not match the source and destination place."));
}
}
}
else { // NOLINT
PADDLE_THROW("Copy from %s to %s is not supported.", src_place, dst_place);
PADDLE_THROW(platform::errors::Unimplemented(
"Copying from %s to %s is not supported.", src_place, dst_place));
}
#endif
}
......@@ -298,7 +320,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
nullptr);
}
else { // NOLINT
PADDLE_THROW("Copy from %s to %s is not supported.", src_place, dst_place);
PADDLE_THROW(platform::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
}
......@@ -832,7 +855,9 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
void* GetDstPtrByDLDataType(DLDataType type, framework::Tensor* dst,
const platform::Place& dst_place) {
// vector types not currently supported
PADDLE_ENFORCE_LE(type.lanes, 1, "vector types not currently supported");
PADDLE_ENFORCE_LE(type.lanes, 1,
platform::errors::Unimplemented(
"Vector type is not supported currently."));
switch (type.bits) {
case 8:
......@@ -840,32 +865,37 @@ void* GetDstPtrByDLDataType(DLDataType type, framework::Tensor* dst,
return static_cast<void*>(dst->mutable_data<int8_t>(dst_place));
if (type.code == kDLUInt)
return static_cast<void*>(dst->mutable_data<uint8_t>(dst_place));
PADDLE_THROW("There is no this type.code <%d> when type.bits is <%d>.",
type.code, type.bits);
PADDLE_THROW(platform::errors::Unimplemented(
"DLDataType code <%d> is illegal when DLDataType.bits is <%d>.",
type.code, type.bits));
case 16:
if (type.code == kDLInt)
return static_cast<void*>(dst->mutable_data<int16_t>(dst_place));
if (type.code == kDLFloat)
return static_cast<void*>(
dst->mutable_data<paddle::platform::float16>(dst_place));
PADDLE_THROW("There is no this type.code <%d> when type.bits is <%d>.",
type.code, type.bits);
PADDLE_THROW(platform::errors::Unimplemented(
"DLDataType code <%d> is illegal when DLDataType.bits is <%d>.",
type.code, type.bits));
case 32:
if (type.code == kDLInt)
return static_cast<void*>(dst->mutable_data<int32_t>(dst_place));
if (type.code == kDLFloat)
return static_cast<void*>(dst->mutable_data<float>(dst_place));
PADDLE_THROW("There is no this type.code <%d> when type.bits is <%d>.",
type.code, type.bits);
PADDLE_THROW(platform::errors::Unimplemented(
"DLDataType code <%d> is illegal when DLDataType.bits is <%d>.",
type.code, type.bits));
case 64:
if (type.code == kDLInt)
return static_cast<void*>(dst->mutable_data<int64_t>(dst_place));
if (type.code == kDLFloat)
return static_cast<void*>(dst->mutable_data<double>(dst_place));
PADDLE_THROW("There is no this type.code <%d> when type.bits is <%d>.",
type.code, type.bits);
PADDLE_THROW(platform::errors::Unimplemented(
"DLDataType code <%d> is illegal when DLDataType.bits is <%d>.",
type.code, type.bits));
default:
PADDLE_THROW("Unsupport type.bits %d", type.bits);
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported DLDataType.bits %d.", type.bits));
}
}
......
......@@ -183,7 +183,11 @@ void TensorToVector(const Tensor& src, std::vector<T>* dst) {
dst->resize(src.numel());
auto dst_ptr = static_cast<void*>(dst->data());
PADDLE_ENFORCE_EQ(platform::is_cpu_place(src.place()), true);
PADDLE_ENFORCE_EQ(
platform::is_cpu_place(src.place()), true,
platform::errors::InvalidArgument(
"The input tensor should be CPU device, but actually it is in %s.",
src.place()));
memory::Copy(dst_place, dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, size);
......
......@@ -27,8 +27,9 @@ Analyzer::Analyzer() {}
void Analyzer::Run(Argument *argument) { RunAnalysis(argument); }
void Analyzer::RunAnalysis(Argument *argument) {
PADDLE_ENFORCE(argument->analysis_passes_valid(),
"analsis_passes is not valid in the argument.");
PADDLE_ENFORCE_EQ(argument->analysis_passes_valid(), true,
platform::errors::InvalidArgument(
"analsis_passes is not valid in the argument."));
const bool disable_logs = argument->disable_logs();
for (auto &pass : argument->analysis_passes()) {
if (!disable_logs) {
......@@ -38,7 +39,8 @@ void Analyzer::RunAnalysis(Argument *argument) {
continue;
auto *ptr = PassRegistry::Global().Retreive(pass);
PADDLE_ENFORCE_NOT_NULL(ptr, "no analysis pass called %s", pass);
PADDLE_ENFORCE_NOT_NULL(ptr, platform::errors::PreconditionNotMet(
"no analysis pass called %s", pass));
ptr->Run(argument);
}
}
......
......@@ -75,9 +75,14 @@ void TestWord2vecPrediction(const std::string& model_path) {
std::vector<PaddleTensor> outputs;
CHECK(predictor->Run(slots, &outputs));
PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
PADDLE_ENFORCE_EQ(outputs.size(), 1UL,
platform::errors::PreconditionNotMet(
"Output size should be 1, but got %d", outputs.size()));
// Check the output buffer size and result of each tid.
PADDLE_ENFORCE_EQ(outputs.front().data.length(), 33168UL);
PADDLE_ENFORCE_EQ(outputs.front().data.length(), 33168UL,
platform::errors::PreconditionNotMet(
"Output's data length should be 33168 but got %d",
outputs.front().data.length()));
float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815,
0.000932706};
const size_t num_elements = outputs.front().data.length() / sizeof(float);
......
......@@ -79,7 +79,9 @@ struct Argument {
#define DECL_ARGUMENT_FIELD(field__, Field, type__) \
public: \
type__& field__() { \
PADDLE_ENFORCE(Has(#field__), "There is no such field"); \
PADDLE_ENFORCE_EQ( \
Has(#field__), true, \
platform::errors::PreconditionNotMet("There is no such field")); \
return field__##_; \
} \
void Set##Field(const type__& x) { \
......@@ -98,8 +100,11 @@ struct Argument {
#define DECL_ARGUMENT_UNIQUE_FIELD(field__, Field, type__) \
public: \
type__& field__() { \
PADDLE_ENFORCE_NOT_NULL(field__##_); \
PADDLE_ENFORCE(Has(#field__)); \
PADDLE_ENFORCE_NOT_NULL(field__##_, platform::errors::PreconditionNotMet( \
"filed should not be null.")); \
PADDLE_ENFORCE_EQ( \
Has(#field__), true, \
platform::errors::PreconditionNotMet("There is no such field")); \
return *static_cast<type__*>(field__##_.get()); \
} \
void Set##Field(type__* x) { \
......@@ -113,11 +118,15 @@ struct Argument {
} \
DECL_ARGUMENT_FIELD_VALID(field__); \
type__* field__##_ptr() { \
PADDLE_ENFORCE(Has(#field__)); \
PADDLE_ENFORCE_EQ( \
Has(#field__), true, \
platform::errors::PreconditionNotMet("There is no such field")); \
return static_cast<type__*>(field__##_.get()); \
} \
type__* Release##Field() { \
PADDLE_ENFORCE(Has(#field__)); \
PADDLE_ENFORCE_EQ( \
Has(#field__), true, \
platform::errors::PreconditionNotMet("There is no such field")); \
valid_fields_.erase(#field__); \
return static_cast<type__*>(field__##_.release()); \
} \
......@@ -227,8 +236,10 @@ struct Argument {
};
#define ARGUMENT_CHECK_FIELD(argument__, fieldname__) \
PADDLE_ENFORCE(argument__->Has(#fieldname__), \
"the argument field [%s] should be set", #fieldname__);
PADDLE_ENFORCE_EQ( \
argument__->Has(#fieldname__), true, \
platform::errors::PreconditionNotMet( \
"the argument field [%s] should be set", #fieldname__));
} // namespace analysis
} // namespace inference
......
......@@ -73,12 +73,15 @@ struct DataTypeNamer {
template <typename T>
const std::string &repr() const {
auto x = std::type_index(typeid(T));
PADDLE_ENFORCE(dic_.count(x), "unknown type for representation");
PADDLE_ENFORCE_GT(dic_.count(x), 0, platform::errors::PreconditionNotMet(
"unknown type for representation"));
return dic_.at(x);
}
const std::string &repr(const std::type_index &type) const { // NOLINT
PADDLE_ENFORCE(dic_.count(type), "unknown type for representation");
PADDLE_ENFORCE_GT(dic_.count(type), 0,
platform::errors::PreconditionNotMet(
"unknown type for representation"));
return dic_.at(type);
}
......@@ -116,7 +119,9 @@ template <typename T>
class OrderedRegistry {
public:
T *Register(const std::string &name, T *x) {
PADDLE_ENFORCE(!dic_.count(name), "duplicate key [%s]", name);
PADDLE_ENFORCE_EQ(dic_.count(name), 0,
platform::errors::PreconditionNotMet(
"There exists duplicate key [%s]", name));
dic_[name] = elements_.size();
elements_.emplace_back(std::unique_ptr<T>(x));
return elements_.back().get();
......@@ -136,14 +141,20 @@ class OrderedRegistry {
template <typename T>
T &GetFromScope(const framework::Scope &scope, const std::string &name) {
framework::Variable *var = scope.FindVar(name);
PADDLE_ENFORCE(var != nullptr);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::PreconditionNotMet(
"The var which name is %s should not be nullptr.", name));
return *var->GetMutable<T>();
}
static framework::proto::ProgramDesc LoadProgramDesc(
const std::string &model_path) {
std::ifstream fin(model_path, std::ios::in | std::ios::binary);
PADDLE_ENFORCE(fin.is_open(), "Cannot open file %s", model_path);
PADDLE_ENFORCE_EQ(
fin.is_open(), true,
platform::errors::NotFound(
"Cannot open file %s, please confirm whether the file exists",
model_path));
fin.seekg(0, std::ios::end);
std::string buffer(fin.tellg(), ' ');
fin.seekg(0, std::ios::beg);
......@@ -188,10 +199,12 @@ static std::string GetDirRoot(const std::string &path) {
static std::string GetOrCreateModelOptCacheDir(const std::string &model_root) {
std::string opt_cache_dir = model_root + "/_opt_cache/";
if (!PathExists(opt_cache_dir)) {
PADDLE_ENFORCE(MKDIR(opt_cache_dir.c_str()) != -1,
PADDLE_ENFORCE_NE(
MKDIR(opt_cache_dir.c_str()), -1,
platform::errors::PreconditionNotMet(
"Can not create optimize cache directory: %s, Make sure you "
"have permission to write",
opt_cache_dir);
opt_cache_dir));
}
return opt_cache_dir;
}
......
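A usage sketch for the two helpers above (variable name and model path hypothetical); both now raise typed errors instead of bare enforces:

framework::Scope scope;
scope.Var("fc_0.w_0")->GetMutable<framework::LoDTensor>();
// Throws PreconditionNotMet if the variable is missing from the scope.
auto& weight = GetFromScope<framework::LoDTensor>(scope, "fc_0.w_0");
// Throws NotFound if the file cannot be opened.
auto program_proto = LoadProgramDesc("/path/to/model/__model__");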
......@@ -38,7 +38,9 @@ IRPassManager::IRPassManager(Argument *argument) {
graph_ = std::unique_ptr<Graph>(new Graph(argument->main_program()));
if (argument->Has("scope")) {
auto *scope_ptr = argument->scope_ptr();
PADDLE_ENFORCE(scope_ptr);
PADDLE_ENFORCE_NOT_NULL(scope_ptr,
platform::errors::PreconditionNotMet(
"The scope ptr should not be nullptr."));
graph_->SetNotOwned(framework::ir::kParamScopeAttr, scope_ptr);
}
......@@ -101,13 +103,17 @@ void IRPassManager::CreatePasses(Argument *argument,
std::string optim_cache_dir = argument->optim_cache_dir();
bool int8_valid =
!(model_from_memory && optim_cache_dir.empty() && enable_int8);
PADDLE_ENFORCE(int8_valid,
PADDLE_ENFORCE_EQ(
int8_valid, true,
platform::errors::PreconditionNotMet(
"When you are in TRT INT8 mode, and load model from "
"memory, you should set optim_cache_dir using "
"config.SetOptimCacheDir()");
PADDLE_ENFORCE(!(model_from_memory && use_static_engine),
"config.SetOptimCacheDir()"));
PADDLE_ENFORCE_EQ(
!(model_from_memory && use_static_engine), true,
platform::errors::PreconditionNotMet(
"When you are using Paddle-TRT, and also using load model "
"from memory, you should set the use_static to false.");
"from memory, you should set the use_static to false."));
if (!optim_cache_dir.empty()) {
pass->Set("model_opt_cache_dir", new std::string(optim_cache_dir));
......
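The two preconditions above encode one rule each; unpacked as a sketch (static_ok is a hypothetical name introduced only for illustration):

// Rule 1: TRT INT8 with a model loaded from memory requires an optimization
// cache dir, so invalidity is exactly:
//   model_from_memory && enable_int8 && optim_cache_dir.empty()
bool int8_valid =
    !(model_from_memory && optim_cache_dir.empty() && enable_int8);
// Rule 2: model-from-memory and a static TRT engine are mutually exclusive.
bool static_ok = !(model_from_memory && use_static_engine);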
......@@ -123,7 +123,9 @@ void RenameAndGetOutputs(
auto add_block_var = [&](const std::string &graph_arg,
const std::string &block_arg) {
auto arg_var_node = graph_var_map.find(graph_arg);
PADDLE_ENFORCE(arg_var_node != graph_var_map.end());
PADDLE_ENFORCE_NE(arg_var_node, graph_var_map.end(),
platform::errors::InvalidArgument(
"Can not find %s in graph_var_map", graph_arg));
auto *var_t = block_desc->Var(block_arg);
var_t->SetShape(arg_var_node->second->Var()->GetShape());
var_t->SetDataType(arg_var_node->second->Var()->GetDataType());
......@@ -133,7 +135,10 @@ void RenameAndGetOutputs(
framework::proto::OpDesc *op = block_desc->Op(index)->Proto();
framework::OpDesc op_desc(*op, nullptr);
auto correspond_node = subgraph_nodes[index];
PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type(),
platform::errors::PreconditionNotMet(
"We should get %s, but get %s", op->type(),
correspond_node->Name()));
std::unordered_map<std::string, size_t> var2id;
std::unordered_map<std::string, framework::ir::Node *> in_vars;
......
......@@ -97,7 +97,9 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
std::vector<std::string> *repetitive_params) const {
auto *op_desc = node->Op();
auto &subgraph = *framework::ir::Agent(node).subgraph();
PADDLE_ENFORCE(!subgraph.empty());
PADDLE_ENFORCE_EQ(subgraph.empty(), false,
platform::errors::PreconditionNotMet(
"The subgraph should not be empty."));
framework::ProgramDesc *program_desc =
Get<framework::ProgramDesc *>("program");
......@@ -194,12 +196,17 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
// to Tensor.
std::vector<std::string> output_mapping;
for (auto name : output_names) {
PADDLE_ENFORCE(output_name_map.count(name) != 0);
PADDLE_ENFORCE_NE(output_name_map.count(name), 0,
platform::errors::PreconditionNotMet(
"The output_name_map should have %s", name));
output_mapping.push_back(output_name_map[name]);
}
PADDLE_ENFORCE(!output_mapping.empty());
PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
"the block has no var-desc");
PADDLE_ENFORCE_EQ(output_mapping.empty(), false,
platform::errors::PreconditionNotMet(
"The output_mapping should not be empty."));
PADDLE_ENFORCE_EQ(
!block_desc.Proto()->vars().empty(), true,
platform::errors::PreconditionNotMet("the block has no var-desc"));
// Set attrs
op_desc->SetType("tensorrt_engine");
......
......@@ -13,6 +13,8 @@
// limitations under the License.
#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
#include <memory>
#include <utility>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/ir_pass_manager.h"
......@@ -31,7 +33,10 @@ void IrAnalysisPass::RunImpl(Argument* argument) {
// Apply passes.
IRPassManager the_ir_manager(argument);
graph = the_ir_manager.Apply(std::move(graph));
PADDLE_ENFORCE_GT(graph->Nodes().size(), 0);
PADDLE_ENFORCE_GT(
graph->Nodes().size(), 0,
platform::errors::PreconditionNotMet(
"The graph nodes size should be greater than 0, but got 0"));
argument->SetMainGraph(graph.release());
CollectFusionStatis(argument);
}
......
......@@ -31,7 +31,9 @@ void IrGraphBuildPass::RunImpl(Argument *argument) {
if (!argument->scope_valid()) {
argument->SetScope(new framework::Scope);
}
PADDLE_ENFORCE(argument->use_gpu_valid());
PADDLE_ENFORCE_EQ(argument->use_gpu_valid(), true,
platform::errors::PreconditionNotMet(
"The use_gpu field should be valid"));
// The load program should run on the same device with the inference program,
// so that the parameters will on the same device, or they will keep copying
......@@ -51,14 +53,17 @@ void IrGraphBuildPass::RunImpl(Argument *argument) {
argument->model_from_memory_valid() && argument->model_from_memory());
argument->SetMainProgram(program.release());
} else {
PADDLE_THROW(
"either model_dir or (program path and parameter path) should be set.");
PADDLE_THROW(platform::errors::PreconditionNotMet(
"either model_dir or (program path and parameter path) should be "
"set."));
}
auto graph = std::unique_ptr<Graph>(new Graph(argument->main_program()));
argument->SetMainGraph(graph.release());
auto *scope_ptr = argument->scope_ptr();
PADDLE_ENFORCE(scope_ptr);
PADDLE_ENFORCE_NOT_NULL(scope_ptr,
platform::errors::PreconditionNotMet(
"The scope ptr should not be nullptr."));
argument->main_graph().SetNotOwned(framework::ir::kParamScopeAttr, scope_ptr);
}
......
......@@ -31,7 +31,8 @@ void IrInferCleanGraphPass::RunImpl(Argument* argument) {
std::unordered_set<const framework::ir::Node*> invalid_nodes;
int valid_op = 0;
for (auto* node : graph.Nodes()) {
PADDLE_ENFORCE_NOT_NULL(node);
PADDLE_ENFORCE_NOT_NULL(node, platform::errors::PreconditionNotMet(
"The node should not be nullptr."));
if (is_valid_node(node)) {
invalid_nodes.insert(node);
} else if (node->IsOp()) {
......
......@@ -23,8 +23,12 @@ namespace inference {
namespace analysis {
void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
PADDLE_ENFORCE(argument->scope_valid());
PADDLE_ENFORCE(argument->use_gpu_valid());
PADDLE_ENFORCE_EQ(
argument->scope_valid(), true,
platform::errors::PreconditionNotMet("The scope field should be valid"));
PADDLE_ENFORCE_EQ(argument->use_gpu_valid(), true,
platform::errors::PreconditionNotMet(
"The use_gpu field should be valid"));
platform::Place place;
......@@ -40,7 +44,9 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
LOG(INFO) << "Sync params from CPU to GPU";
PADDLE_ENFORCE(argument->gpu_device_id_valid());
PADDLE_ENFORCE_EQ(argument->gpu_device_id_valid(), true,
platform::errors::PreconditionNotMet(
"The gpu_device_id field should be valid"));
place = platform::CUDAPlace(argument->gpu_device_id());
auto *scope = argument->scope_ptr();
......@@ -56,7 +62,8 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
continue;
}
auto *var = scope->FindLocalVar(var_name);
PADDLE_ENFORCE(var != nullptr);
PADDLE_ENFORCE_NOT_NULL(var, platform::errors::PreconditionNotMet(
"The var should not be nullptr"));
if (var->IsType<framework::LoDTensor>() ||
var->IsType<framework::Tensor>()) {
auto *t = var->GetMutable<framework::LoDTensor>();
......
......@@ -224,7 +224,9 @@ void UpdateOpDescsByReuse(
// modify the graph
for (auto input_node : node->inputs) {
PADDLE_ENFORCE(input_node->IsVar());
PADDLE_ENFORCE_EQ(input_node->IsVar(), true,
platform::errors::PreconditionNotMet(
"The input node should be a variable."));
std::string input_node_name = input_node->Name();
if (reuse_table.count(input_node_name) &&
reuse_table.at(input_node_name) != input_node_name) {
......@@ -246,7 +248,9 @@ void UpdateOpDescsByReuse(
// modify the graph
for (auto out_node : node->outputs) {
PADDLE_ENFORCE(out_node->IsVar());
PADDLE_ENFORCE_EQ(out_node->IsVar(), true,
platform::errors::PreconditionNotMet(
"The output node should be a variable."));
std::string out_node_name = out_node->Name();
if (reuse_table.count(out_node_name) &&
reuse_table.at(out_node_name) != out_node_name) {
......
......@@ -230,7 +230,8 @@ void AnalysisConfig::EnableMkldnnBfloat16() {
MkldnnQuantizerConfig *AnalysisConfig::mkldnn_quantizer_config() const {
PADDLE_ENFORCE_NOT_NULL(mkldnn_quantizer_config_,
"MkldnnQuantizer was not enabled yet.");
platform::errors::PreconditionNotMet(
"MkldnnQuantizer was not enabled yet."));
return mkldnn_quantizer_config_.get();
}
......
......@@ -169,7 +169,8 @@ bool AnalysisPredictor::PrepareScope(
if (parent_scope) {
PADDLE_ENFORCE_NOT_NULL(
parent_scope,
"Both program and parent_scope should be set in Clone mode.");
platform::errors::PreconditionNotMet(
"Both program and parent_scope should be set in Clone mode."));
scope_ = parent_scope;
status_is_cloned_ = true;
} else {
......@@ -235,7 +236,9 @@ bool AnalysisPredictor::PrepareExecutor() {
executor_->Prepare(sub_scope_, *inference_program_, 0,
config_.use_feed_fetch_ops_);
PADDLE_ENFORCE_NOT_NULL(sub_scope_);
PADDLE_ENFORCE_NOT_NULL(sub_scope_,
platform::errors::PreconditionNotMet(
"The sub_scope should not be nullptr."));
return true;
}
......@@ -297,7 +300,8 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
timer.tic();
// set feed variable
framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get();
PADDLE_ENFORCE_NOT_NULL(scope, "The scope should not be nullptr.");
PADDLE_ENFORCE_NOT_NULL(scope, platform::errors::PreconditionNotMet(
"The scope should not be nullptr."));
if (!SetFeed(inputs, scope)) {
LOG(ERROR) << "fail to set feed";
return false;
......@@ -399,7 +403,11 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
outputs->resize(fetches_.size());
for (size_t i = 0; i < fetches_.size(); ++i) {
int idx = BOOST_GET_CONST(int, fetches_[i]->GetAttr("col"));
PADDLE_ENFORCE((size_t)idx == i);
PADDLE_ENFORCE_EQ(
static_cast<size_t>(idx), i,
platform::errors::InvalidArgument(
"Fetch op's col attr(%d) should be equal to the index(%d)", idx,
i));
framework::FetchType &fetch_var =
framework::GetFetchVariable(*scope, "fetch", idx);
auto &fetch = BOOST_GET(framework::LoDTensor, fetch_var);
......@@ -435,10 +443,12 @@ void AnalysisPredictor::PrepareArgument() {
if (!config_.model_dir().empty()) {
argument_.SetModelDir(config_.model_dir());
} else {
PADDLE_ENFORCE(
!config_.params_file().empty(),
"Either model_dir or (param_file, prog_file) should be set.");
PADDLE_ENFORCE(!config_.prog_file().empty());
PADDLE_ENFORCE_EQ(config_.params_file().empty(), false,
platform::errors::PreconditionNotMet(
"Either model_dir or param_file should be set."));
PADDLE_ENFORCE_EQ(config_.prog_file().empty(), false,
platform::errors::PreconditionNotMet(
"Either model_dir or prog_file should be set."));
std::string dir = inference::analysis::GetDirRoot(config_.prog_file());
argument_.SetModelProgramPath(config_.prog_file());
......@@ -503,7 +513,9 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
PrepareArgument();
Analyzer().Run(&argument_);
PADDLE_ENFORCE(argument_.scope_valid());
PADDLE_ENFORCE_EQ(
argument_.scope_valid(), true,
platform::errors::InvalidArgument("The argument scope should be valid."));
VLOG(5) << "to prepare executor";
ARGUMENT_CHECK_FIELD((&argument_), ir_analyzed_program);
inference_program_.reset(
......@@ -525,8 +537,10 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
FLAGS_minloglevel = 2; // GLOG_ERROR
}
VLOG(3) << "create AnalysisConfig";
PADDLE_ENFORCE(config.is_valid(),
"Note: Each config can only be used for one predictor.");
PADDLE_ENFORCE_EQ(
config.is_valid(), true,
platform::errors::InvalidArgument(
"Note: Each config can only be used for one predictor."));
if (config.use_gpu()) {
static std::once_flag gflags_initialized;
......@@ -623,7 +637,9 @@ bool AnalysisPredictor::MkldnnQuantize() {
}
void AnalysisPredictor::PrepareFeedFetch() {
PADDLE_ENFORCE_NOT_NULL(sub_scope_);
PADDLE_ENFORCE_NOT_NULL(sub_scope_,
platform::errors::InvalidArgument(
"The sub_scope should not be nullptr."));
CreateFeedFetchVar(sub_scope_);
for (auto *op : inference_program_->Block(0).AllOps()) {
if (op->Type() == "feed") {
......@@ -646,7 +662,8 @@ void AnalysisPredictor::PrepareFeedFetch() {
}
void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) {
PADDLE_ENFORCE_NOT_NULL(scope);
PADDLE_ENFORCE_NOT_NULL(scope, platform::errors::InvalidArgument(
"The scope should not be nullptr."));
auto *var = scope->Var("feed");
var->GetMutable<framework::FeedList>();
var = scope->Var("fetch");
......@@ -667,7 +684,8 @@ AnalysisPredictor::GetInputTensorShape() {
std::vector<std::string> names = GetInputNames();
for (std::string name : names) {
auto *var = inference_program_->Block(0).FindVar(name);
PADDLE_ENFORCE_NOT_NULL(var, "input %s does not exist.", name);
PADDLE_ENFORCE_NOT_NULL(var, platform::errors::PreconditionNotMet(
"Input %s does not exist.", name));
input_shapes[name] = var->GetShape();
}
return input_shapes;
......@@ -683,7 +701,11 @@ std::vector<std::string> AnalysisPredictor::GetOutputNames() {
std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
const std::string &name) {
PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
PADDLE_ENFORCE_NOT_NULL(
executor_->scope()->FindVar(name),
platform::errors::PreconditionNotMet(
"The variable named %s is not found in the scope of the exector.",
name));
std::unique_ptr<ZeroCopyTensor> res(
new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
res->input_or_output_ = true;
......@@ -700,7 +722,11 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
const std::string &name) {
PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
PADDLE_ENFORCE_NOT_NULL(
executor_->scope()->FindVar(name),
platform::errors::PreconditionNotMet(
"he variable named %s is not found in the scope of the exector.",
name));
std::unique_ptr<ZeroCopyTensor> res(
new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
res->input_or_output_ = false;
......@@ -761,8 +787,11 @@ bool AnalysisPredictor::LoadProgramDesc() {
std::string pb_content;
// Read binary
std::ifstream fin(filename, std::ios::in | std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fin.is_open()), "Cannot open file %s",
filename);
PADDLE_ENFORCE_EQ(
static_cast<bool>(fin.is_open()), true,
platform::errors::NotFound(
"Cannot open file %s, please confirm whether the file is normal.",
filename));
fin.seekg(0, std::ios::end);
pb_content.resize(fin.tellg());
fin.seekg(0, std::ios::beg);
......@@ -779,7 +808,8 @@ bool AnalysisPredictor::LoadProgramDesc() {
bool AnalysisPredictor::LoadParameters() {
PADDLE_ENFORCE_NOT_NULL(inference_program_.get(),
"The inference program should be loaded first.");
platform::errors::PreconditionNotMet(
"The inference program should be loaded first."));
const auto &global_block = inference_program_->MutableBlock(0);
......@@ -855,8 +885,9 @@ void AnalysisPredictor::ClearIntermediateTensor() {
#if PADDLE_WITH_TENSORRT
bool AnalysisPredictor::SaveTrtCalibToDisk() {
PADDLE_ENFORCE(config_.tensorrt_engine_enabled(),
"This func can be invoked only in trt mode");
PADDLE_ENFORCE_EQ(config_.tensorrt_engine_enabled(), true,
platform::errors::PreconditionNotMet(
"This func can be invoked only in trt mode"));
auto &block = inference_program_->Block(0);
for (auto &op_desc : block.AllOps()) {
if (op_desc->Type() == "tensorrt_engine") {
......
......@@ -62,9 +62,9 @@ PaddleBuf &PaddleBuf::operator=(const PaddleBuf &other) {
if (other.length() && other.data())
memcpy(data_, other.data(), other.length());
else if (other.length())
PADDLE_THROW(
PADDLE_THROW(platform::errors::InvalidArgument(
"Invalid argument, null pointer data with length %u is passed",
other.length());
other.length()));
length_ = other.length();
memory_owned_ = true;
......@@ -92,7 +92,8 @@ void PaddleBuf::Resize(size_t length) {
length_ = length;
memory_owned_ = true;
} else {
PADDLE_THROW("The memory is allocated externally, can not Resized");
PADDLE_THROW(platform::errors::PreconditionNotMet(
"The memory is allocated externally, can not Resized"));
}
}
......@@ -105,7 +106,11 @@ void PaddleBuf::Reset(void *data, size_t length) {
void PaddleBuf::Free() {
if (memory_owned_ && data_) {
PADDLE_ENFORCE_GT(length_, 0UL);
PADDLE_ENFORCE_GT(
length_, 0UL,
platform::errors::PreconditionNotMet(
"The memory used in PaddleBuf %d should be greater than 0",
length_));
delete[] static_cast<char *>(data_);
data_ = nullptr;
length_ = 0;
......
......@@ -87,7 +87,9 @@ bool NativePaddlePredictor::Init(
if (parent_scope) {
scope_ = parent_scope;
sub_scope_ = &(parent_scope->NewScope());
PADDLE_ENFORCE_NOT_NULL(sub_scope_, "create sub scope fail");
PADDLE_ENFORCE_NOT_NULL(sub_scope_,
platform::errors::PreconditionNotMet(
"The sub_scope should not be nullptr."));
} else {
paddle::framework::InitDevices(false);
scope_.reset(new paddle::framework::Scope());
......@@ -182,7 +184,10 @@ std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone() {
std::unique_ptr<PaddlePredictor> cls(new NativePaddlePredictor(config_));
// Hot fix the bug that result diff in multi-thread.
// TODO(Superjomn) re-implement a real clone here.
PADDLE_ENFORCE_NOT_NULL(dynamic_cast<NativePaddlePredictor *>(cls.get()));
PADDLE_ENFORCE_NOT_NULL(
dynamic_cast<NativePaddlePredictor *>(cls.get()),
platform::errors::PreconditionNotMet(
"Dynamic_cast from PaddlePredictor to NativePaddlePredictor failed"));
if (!dynamic_cast<NativePaddlePredictor *>(cls.get())->Init(nullptr)) {
LOG(ERROR) << "fail to call Init";
return nullptr;
......@@ -224,8 +229,13 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
return false;
}
PADDLE_ENFORCE_NOT_NULL(input_ptr);
PADDLE_ENFORCE_NOT_NULL(inputs[i].data.data());
PADDLE_ENFORCE_NOT_NULL(input_ptr,
platform::errors::InvalidArgument(
"The input_ptr should not be nullptr."));
PADDLE_ENFORCE_NOT_NULL(
inputs[i].data.data(),
platform::errors::InvalidArgument(
"The data of input tensor should not be null."));
if (platform::is_cpu_place(place_)) {
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
......@@ -241,7 +251,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
platform::CPUPlace(), inputs[i].data.data(),
inputs[i].data.length(), dev_ctx->stream());
#else
PADDLE_THROW("Not compile with CUDA, should not reach here.");
PADDLE_THROW(platform::errors::Unavailable(
"Not compile with CUDA, should not reach here."));
#endif
}
......@@ -287,7 +298,11 @@ bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs,
outputs->resize(fetchs_.size());
for (size_t i = 0; i < fetchs_.size(); ++i) {
int idx = BOOST_GET_CONST(int, fetchs_[i]->GetAttr("col"));
PADDLE_ENFORCE((size_t)idx == i);
PADDLE_ENFORCE_EQ(
static_cast<size_t>(idx), i,
platform::errors::InvalidArgument(
"Fetch op's col attr(%d) should be equal to the index(%d)", idx,
i));
framework::FetchType &fetch_var =
framework::GetFetchVariable(*scope, "fetch", idx);
auto fetch = BOOST_GET_CONST(framework::LoDTensor, fetch_var);
......@@ -318,10 +333,15 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
VLOG(3) << "create NativePaddlePredictor";
if (config.use_gpu) {
// 1. GPU memory
PADDLE_ENFORCE_GE(
config.fraction_of_gpu_memory, 0.f,
"fraction_of_gpu_memory in the config should be set to range (0., 1.]");
PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
PADDLE_ENFORCE_GE(config.fraction_of_gpu_memory, 0.f,
platform::errors::InvalidArgument(
"fraction_of_gpu_memory in the config should be set "
"to range (0., 1.]"));
PADDLE_ENFORCE_GE(config.device, 0,
platform::errors::PreconditionNotMet(
"Invalid device id %d, the device id should be "
"greater than or equal to 0.",
config.device));
std::vector<std::string> flags;
if (config.fraction_of_gpu_memory >= 0.0f ||
config.fraction_of_gpu_memory <= 0.95f) {
......@@ -336,7 +356,9 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
std::unique_ptr<PaddlePredictor> predictor(new NativePaddlePredictor(config));
PADDLE_ENFORCE_NOT_NULL(
dynamic_cast<NativePaddlePredictor *>(predictor.get()));
dynamic_cast<NativePaddlePredictor *>(predictor.get()),
platform::errors::PreconditionNotMet(
"Dynamic_cast from PaddlePredictor to NativePaddlePredictor failed"));
if (!dynamic_cast<NativePaddlePredictor *>(predictor.get())->Init(nullptr)) {
return nullptr;
}
......
......@@ -112,16 +112,19 @@ static T convert(const std::string &item,
std::string message =
"invalid_argument exception when try to convert : " + item;
LOG(ERROR) << message;
PADDLE_THROW(message);
PADDLE_THROW(platform::errors::InvalidArgument(
"invalid_argument exception when try to convert %s.", item));
} catch (std::out_of_range &e) {
std::string message =
"out_of_range exception when try to convert : " + item;
LOG(ERROR) << message;
PADDLE_THROW(message);
PADDLE_THROW(platform::errors::InvalidArgument(
"out_of_range exception when try to convert %s.", item));
} catch (...) {
std::string message = "unexpected exception when try to convert " + item;
LOG(ERROR) << message;
PADDLE_THROW(message);
PADDLE_THROW(platform::errors::InvalidArgument(
"unexpected exception when try to convert %s.", item));
}
return res;
}
......@@ -353,7 +356,8 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
double batch_latency, int epoch = 1,
const framework::proto::VarType::Type data_type =
framework::proto::VarType::FP32) {
PADDLE_ENFORCE_GT(batch_size, 0, "Non-positive batch size.");
PADDLE_ENFORCE_GT(batch_size, 0, platform::errors::InvalidArgument(
"Non-positive batch size."));
double sample_latency = batch_latency / batch_size;
LOG(INFO) << "====== threads: " << num_threads << ", thread id: " << tid
<< " ======";
......
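The per-sample figure derived above is plain division; a worked example with hypothetical numbers:

// batch_size = 4, batch_latency = 20.0 ms
// sample_latency = batch_latency / batch_size = 20.0 / 4 = 5.0 ms per sample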
......@@ -62,9 +62,12 @@ bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() {
if (scales_.find(var_name) != scales_.end()) continue;
auto* var = predictor_.sub_scope_->FindVar(var_name);
PADDLE_ENFORCE(var, "%s is not in the scope", var_name);
PADDLE_ENFORCE(var->IsType<LoDTensor>(),
"Only support lod tensor now.");
PADDLE_ENFORCE_NOT_NULL(var,
platform::errors::PreconditionNotMet(
"%s is not in the scope", var_name));
PADDLE_ENFORCE_EQ(var->IsType<LoDTensor>(), true,
platform::errors::PreconditionNotMet(
"Only support lod tensor now."));
LoDTensor* var_tensor = var->GetMutable<LoDTensor>();
// force unsigned type if already know it
......@@ -82,9 +85,11 @@ bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() {
} else if (op->Type() == "transpose2" ||
op->Type() == "reshape2" || op->Type() == "pool2d") {
auto input_var_name = op->Input("X")[0];
PADDLE_ENFORCE(scales_.find(input_var_name) != scales_.end(),
PADDLE_ENFORCE_NE(
scales_.find(input_var_name), scales_.end(),
platform::errors::PreconditionNotMet(
"Input scales must be calculated before the "
"output scales to infer if output is unsigned.");
"output scales to infer if output is unsigned."));
if (scales_.find(input_var_name) != scales_.end()) {
scales_[var_name] = scales_[input_var_name];
}
......@@ -94,10 +99,11 @@ bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() {
is_unsigned = true;
double min_scale = std::numeric_limits<double>::max();
for (auto input_var_name : op->Input("X")) {
PADDLE_ENFORCE(
scales_.find(input_var_name) != scales_.end(),
PADDLE_ENFORCE_NE(
scales_.find(input_var_name), scales_.end(),
platform::errors::PreconditionNotMet(
"Input scales must be calculated before the "
"output scales to infer if output is unsigned.");
"output scales to infer if output is unsigned."));
is_unsigned = is_unsigned && scales_[input_var_name].first;
min_scale = std::min(
min_scale,
......@@ -132,11 +138,12 @@ void AnalysisPredictor::MkldnnQuantizer::CalculateSingleScale(
auto rule = qconfig_->scale_algo(op_type_name, conn_name);
if (rule == ScaleAlgo::NONE) return;
PADDLE_ENFORCE(
var_tensor.numel() > 0,
PADDLE_ENFORCE_GT(
var_tensor.numel(), 0,
platform::errors::InvalidArgument(
"MkldnnQuantizer: LoDTensor of variable %s for quantization of op "
"%s of connection %s should not be empty.",
var_name, op_type_name, conn_name);
var_name, op_type_name, conn_name));
switch (rule) {
case ScaleAlgo::MAX:
......@@ -205,10 +212,11 @@ AnalysisPredictor::MkldnnQuantizer::GetKLScalingFactor(
float min_val = eigen_tensor.minCoeff();
bool is_positive = min_val >= 0.0f;
if (is_unsigned)
PADDLE_ENFORCE(
is_positive,
PADDLE_ENFORCE_EQ(
is_positive, true,
platform::errors::InvalidArgument(
"Tensor is claimed to be unsigned, but its min value (%f) is < 0.0",
min_val);
min_val));
int num_quantized_bins = 255;
......@@ -316,10 +324,11 @@ AnalysisPredictor::MkldnnQuantizer::GetMaxScalingFactor(
float max_abs = eigen_tensor.abs().maxCoeff();
float min_val = eigen_tensor.minCoeff();
if (is_unsigned)
PADDLE_ENFORCE(
min_val >= 0.0f,
PADDLE_ENFORCE_GE(
min_val, 0.0f,
platform::errors::InvalidArgument(
"Tensor is claimed to be unsigned, but its min value (%f) is < 0.0",
min_val);
min_val));
LoDTensor scale_tensor = CreateScaleTensor();
scale_tensor.data<double>()[0] = 1.0 / max_abs;
......@@ -330,16 +339,19 @@ AnalysisPredictor::MkldnnQuantizer::GetMaxScalingFactor(
std::pair<bool, LoDTensor>
AnalysisPredictor::MkldnnQuantizer::GetMaxChScalingFactor(
const LoDTensor& var_tensor, bool is_unsigned, bool is_transposed) const {
PADDLE_ENFORCE(var_tensor.dims().size() > 0, "Tensor dimension is empty.");
PADDLE_ENFORCE_GT(
var_tensor.dims().size(), 0,
platform::errors::InvalidArgument("Tensor dimension is empty."));
ConstEigenVectorArrayMap eigen_tensor{var_tensor.data<float>(),
var_tensor.numel(), 1};
float min_val = eigen_tensor.minCoeff();
if (is_unsigned)
PADDLE_ENFORCE(
min_val >= 0.0f,
PADDLE_ENFORCE_GE(
min_val, 0.0f,
platform::errors::InvalidArgument(
"Tensor is claimed to be unsigned, but its min value (%f) is < 0.0",
min_val);
min_val));
auto dims = var_tensor.dims();
constexpr int num_col_dims = 1;
......@@ -367,17 +379,19 @@ AnalysisPredictor::MkldnnQuantizer::Histogram(
const framework::LoDTensor& var_tensor, float min_val, float max_val,
size_t num_bins) const {
  PADDLE_ENFORCE_GT(num_bins, 0,
                    "MkldnnQuantizer: To calculate Histogram, num_bins (" +
                        std::to_string(num_bins) + ") must be positive.");
  PADDLE_ENFORCE_GT(
      var_tensor.numel(), 0,
      "MkldnnQuantizer: To calculate Histogram, the tensor must not be empty.");
  PADDLE_ENFORCE(max_val >= min_val,
                 "MkldnnQuantizer: To calculate Histogram, max_val (" +
                     std::to_string(max_val) +
                     ") must be greater or equal"
                     "to min_val (" +
                     std::to_string(min_val) + ").");
  PADDLE_ENFORCE_GT(num_bins, 0,
                    platform::errors::InvalidArgument(
                        "MkldnnQuantizer: To calculate Histogram, num_bins (" +
                        std::to_string(num_bins) + ") must be positive."));
  PADDLE_ENFORCE_GT(var_tensor.numel(), 0,
                    platform::errors::InvalidArgument(
                        "MkldnnQuantizer: To calculate Histogram, the tensor "
                        "must not be empty."));
  PADDLE_ENFORCE_GE(max_val, min_val,
                    platform::errors::InvalidArgument(
                        "MkldnnQuantizer: To calculate Histogram, max_val (" +
                        std::to_string(max_val) + ") must be greater or equal "
                        "to min_val (" +
                        std::to_string(min_val) + ")."));
ConstEigenVectorArrayMap eigen_tensor{var_tensor.data<float>(),
var_tensor.numel(), 1};
auto bin_width = std::abs(max_val - min_val) / num_bins;
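With the checks in place, the bin width is fixed by the range and the bin count; a worked example (values hypothetical):

// min_val = -2.0f, max_val = 6.0f, num_bins = 16
// bin_width = |6.0 - (-2.0)| / 16 = 0.5
// A value v is counted in bin (v - min_val) / bin_width (sketch; exact
// boundary handling follows the counting loop omitted from this hunk).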
......@@ -407,7 +421,8 @@ void AnalysisPredictor::MkldnnQuantizer::PrepareArgument() const {
auto graph = std::unique_ptr<Graph>(new Graph(arg.main_program()));
arg.SetMainGraph(graph.release());
auto* scope_ptr = arg.scope_ptr();
PADDLE_ENFORCE(scope_ptr);
PADDLE_ENFORCE_NOT_NULL(scope_ptr, platform::errors::PreconditionNotMet(
"The scope should not be nullptr."));
arg.main_graph().SetNotOwned(framework::ir::kParamScopeAttr, scope_ptr);
auto* builder = predictor_.config_.pass_builder();
......@@ -441,7 +456,9 @@ bool AnalysisPredictor::MkldnnQuantizer::RunQuantizePasses() const {
PrepareArgument();
auto& arg = predictor_.argument_;
Analyzer().Run(&arg);
PADDLE_ENFORCE(arg.scope_valid());
PADDLE_ENFORCE_EQ(
arg.scope_valid(), true,
platform::errors::PreconditionNotMet("The scope should be valid."));
VLOG(5) << "to prepare executor";
ARGUMENT_CHECK_FIELD((&arg), ir_analyzed_program);
predictor_.inference_program_.reset(
......@@ -456,7 +473,8 @@ bool AnalysisPredictor::MkldnnQuantizer::RunWarmup() const {
VLOG(3) << "Predictor: run a quantization warmup iteration";
auto warmup_data = qconfig_->warmup_data();
PADDLE_ENFORCE_NOT_NULL(warmup_data,
"Warmup data cannot be NULL in the config.");
platform::errors::PreconditionNotMet(
"Warmup data cannot be NULL in the config."));
PrettyLogH1("--- Running warmup iteration for quantization");
// Run the inference program
......@@ -469,7 +487,10 @@ bool AnalysisPredictor::MkldnnQuantizer::RunWarmup() const {
float AnalysisPredictor::MkldnnQuantizer::SafeEntropy(
std::vector<int> reference_distr_P, int P_sum,
std::vector<int> candidate_distr_Q, int Q_sum) const {
PADDLE_ENFORCE_EQ(reference_distr_P.size(), candidate_distr_Q.size());
PADDLE_ENFORCE_EQ(reference_distr_P.size(), candidate_distr_Q.size(),
platform::errors::InvalidArgument(
"The P size %d should be equal to Q size %d",
reference_distr_P.size(), candidate_distr_Q.size()));
float tmp_sum1 = 0;
float tmp_sum2 = 0;
for (size_t idx = 0; idx < reference_distr_P.size(); idx++) {
......@@ -479,10 +500,11 @@ float AnalysisPredictor::MkldnnQuantizer::SafeEntropy(
tmp_sum1 += 0;
tmp_sum2 += 0;
} else {
PADDLE_ENFORCE(q_idx != 0, "MkldnnQuantizer: Fatal error!, idx = " +
std::to_string(idx) +
" qindex = 0! p_idx = " +
std::to_string(p_idx));
PADDLE_ENFORCE_NE(
q_idx, 0,
platform::errors::PreconditionNotMet(
"MkldnnQuantizer: Fatal error!, idx = " + std::to_string(idx) +
" qindex = 0! p_idx = " + std::to_string(p_idx)));
}
tmp_sum1 += p_idx * (log(Q_sum * p_idx));
tmp_sum2 += p_idx * (log(P_sum * q_idx));
......
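What the two running sums compute, assuming the function ends by returning (tmp_sum1 - tmp_sum2) / P_sum as in the full source:

// With p_i = P_i / P_sum and q_i = Q_i / Q_sum:
//   (tmp_sum1 - tmp_sum2) / P_sum
//     = sum_i (P_i / P_sum) * log((Q_sum * P_i) / (P_sum * Q_i))
//     = sum_i p_i * log(p_i / q_i)  =  KL(p || q)
// This is why q_idx == 0 with p_idx != 0 is rejected above: it would make
// the divergence infinite.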
......@@ -231,6 +231,10 @@ void CpuPassStrategy::EnableMkldnnQuantizer() {
void CpuPassStrategy::EnableMkldnnBfloat16() {
#ifdef PADDLE_WITH_MKLDNN
if (!use_mkldnn_bfloat16_) {
passes_.push_back("cpu_bfloat16_placement_pass");
passes_.push_back("cpu_bfloat16_pass");
}
use_mkldnn_bfloat16_ = true;
#else
use_mkldnn_bfloat16_ = false;
......
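A usage sketch for the new pipeline entries (API calls as declared on AnalysisConfig; the model path is hypothetical, and MKLDNN is assumed to be enabled first):

AnalysisConfig config;
config.SetModel("/path/to/model");
config.EnableMKLDNN();
// Appends cpu_bfloat16_placement_pass and cpu_bfloat16_pass exactly once.
config.EnableMkldnnBfloat16();
auto predictor = CreatePaddlePredictor(config);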
......@@ -34,8 +34,11 @@ class ConcatOpConverter : public OpConverter {
itensors.push_back(engine_->GetITensor(input_name));
}
int axis = BOOST_GET_CONST(int, op_desc.GetAttr("axis"));
PADDLE_ENFORCE(axis > 0,
"The axis attr of Concat op should be large than 0 for trt");
PADDLE_ENFORCE_GT(axis, 0, platform::errors::InvalidArgument(
"The axis attr of Concat"
" op should be larger than 0 for trt. "
"But received %d.",
axis));
auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Concatenation, itensors.data(),
itensors.size());
......
......@@ -100,7 +100,9 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
auto* layer = fadd_layer(const_cast<nvinfer1::ITensor*>(X), n_output, n_input,
nv_ksize, weight, bias);
PADDLE_ENFORCE(layer != nullptr);
PADDLE_ENFORCE_NOT_NULL(layer,
platform::errors::Fatal("TensorRT create conv2d"
" layer error."));
layer->setStride(nv_strides);
layer->setPadding(nv_paddings);
layer->setNbGroups(groups);
......
......@@ -43,13 +43,30 @@ class ElementwiseWeightOpConverter : public OpConverter {
framework::OpDesc op_desc(op, nullptr);
VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer";
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
PADDLE_ENFORCE_EQ(
op_desc.Input("X").size(), 1,
platform::errors::InvalidArgument(
"The input op's Input(\"X\").size() "
"should equal to 1, but received Input(\"X\").size() = %u.",
op_desc.Input("X").size()));
PADDLE_ENFORCE_EQ(
op_desc.Input("Y").size(), 1,
platform::errors::InvalidArgument(
"The input op's Input(\"Y\").size() "
"should equal to 1, but received Input(\"Y\").size() = %u.",
op_desc.Input("Y").size())); // Y is a weight
PADDLE_ENFORCE_EQ(
op_desc.Output("Out").size(), 1,
platform::errors::InvalidArgument(
"The input op's Output(\"Out\").size() "
"should equal to 1, but reveceid Output(\"Out\").size() = %u.",
op_desc.Output("Out").size()));
auto* X = engine_->GetITensor(op_desc.Input("X").front());
auto* Y_v = scope.FindVar(op_desc.Input("Y").front());
PADDLE_ENFORCE_NOT_NULL(Y_v);
PADDLE_ENFORCE_NOT_NULL(
Y_v, platform::errors::NotFound("Variable %s not found in scope.",
op_desc.Input("Y").front().c_str()));
auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
float* weight_data = nullptr;
weight_data =
......@@ -176,9 +193,24 @@ class ElementwiseTensorOpConverter : public OpConverter {
framework::OpDesc op_desc(op, nullptr);
nvinfer1::ILayer* layer = nullptr;
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
PADDLE_ENFORCE_EQ(
op_desc.Input("X").size(), 1,
platform::errors::InvalidArgument(
"The input op's Input(\"X\").size() "
"should equal to 1, but received Input(\"X\").size() = %u.",
op_desc.Input("X").size()));
PADDLE_ENFORCE_EQ(
op_desc.Input("Y").size(), 1,
platform::errors::InvalidArgument(
"The input op's Input(\"Y\").size() "
"should equal to 1, but received Input(\"Y\").size() = %u.",
op_desc.Input("Y").size())); // Y is a weight
PADDLE_ENFORCE_EQ(
op_desc.Output("Out").size(), 1,
platform::errors::InvalidArgument(
"The input op's Output(\"Out\").size() "
"should equal to 1, but received Output(\"Out\").size() = %u.",
op_desc.Output("Out").size()));
auto* X = engine_->GetITensor(op_desc.Input("X").front());
auto* Y = engine_->GetITensor(op_desc.Input("Y").front());
......
......@@ -29,38 +29,67 @@ class DefaultIOConverter : public EngineIOConverter {
// NOTE out is GPU memory.
virtual void operator()(const LoDTensor& in, void* out,
size_t max_size) override {
PADDLE_ENFORCE(out != nullptr);
PADDLE_ENFORCE(stream_ != nullptr);
PADDLE_ENFORCE_NOT_NULL(out,
platform::errors::InvalidArgument(
"The input param 'out' must not be nullptr."));
PADDLE_ENFORCE_NOT_NULL(stream_,
platform::errors::PreconditionNotMet(
"You should set up stream_ by SetStream() "
"before you call the operator()."));
const auto& place = in.place();
size_t size = in.memory_size();
PADDLE_ENFORCE_LE(size, max_size);
PADDLE_ENFORCE_LE(
size, max_size,
platform::errors::InvalidArgument(
"The input Tensor in's memory_size shoule be less than or equal to "
"the input max_size. But in's memory_size = %u, max_size = %u.",
size, max_size));
if (is_cpu_place(place)) {
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out, in.data<float>(), size,
cudaMemcpyHostToDevice, *stream_));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(
out, in.data<float>(), size, cudaMemcpyHostToDevice, *stream_));
} else if (is_gpu_place(place)) {
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out, in.data<float>(), size,
cudaMemcpyDeviceToDevice, *stream_));
PADDLE_ENFORCE_EQ(
0, cudaMemcpyAsync(out, in.data<float>(), size,
cudaMemcpyDeviceToDevice, *stream_),
platform::errors::External(
"cudaMemcpyAsync(cudaMemcpyDeviceToDevice) error."));
} else {
PADDLE_THROW("Unknown device for converter");
PADDLE_THROW(platform::errors::NotFound("Unknown device for converter"));
}
cudaStreamSynchronize(*stream_);
}
// NOTE in is GPU memory.
virtual void operator()(const void* in, LoDTensor* out,
size_t max_size) override {
PADDLE_ENFORCE(in != nullptr);
PADDLE_ENFORCE(stream_ != nullptr);
PADDLE_ENFORCE_NOT_NULL(in,
platform::errors::InvalidArgument(
"The input param 'in' must not be nullptr."));
PADDLE_ENFORCE_NOT_NULL(stream_,
platform::errors::PreconditionNotMet(
"You should set up stream_ by SetStream() "
"before you call the operator()."));
const auto& place = out->place();
size_t size = out->memory_size();
PADDLE_ENFORCE_LE(size, max_size);
PADDLE_ENFORCE_LE(
size, max_size,
platform::errors::InvalidArgument(
"The input Tensor out's memory_size shoule be less than or equal "
"to the input max_size. "
"But out's memory_size = %u, max_size = %u.",
size, max_size));
if (is_cpu_place(place)) {
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out->data<float>(), in, size,
cudaMemcpyDeviceToHost, *stream_));
cudaMemcpyDeviceToHost, *stream_),
platform::errors::External(
"cudaMemcpyAsync(cudaMemcpyDeviceToHost) error."));
} else if (is_gpu_place(place)) {
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out->data<float>(), in, size,
cudaMemcpyDeviceToDevice, *stream_));
PADDLE_ENFORCE_EQ(
0, cudaMemcpyAsync(out->data<float>(), in, size,
cudaMemcpyDeviceToDevice, *stream_),
platform::errors::External(
"cudaMemcpyAsync(cudaMemcpyDeviceToDevice) error."));
} else {
PADDLE_THROW("Unknown device for converter");
PADDLE_THROW(platform::errors::NotFound("Unknown device for converter"));
}
cudaStreamSynchronize(*stream_);
}
......
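Two checking idioms coexist in the hunk above; side by side as a sketch (buffers and stream hypothetical):

// Checks the returned cudaError_t against cudaSuccess and reports the CUDA
// error string on failure.
PADDLE_ENFORCE_CUDA_SUCCESS(
    cudaMemcpyAsync(dst, src, size, cudaMemcpyHostToDevice, stream));
// Compares the return value against 0 and attaches a typed external error.
PADDLE_ENFORCE_EQ(
    0, cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToDevice, stream),
    platform::errors::External(
        "cudaMemcpyAsync(cudaMemcpyDeviceToDevice) error."));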
......@@ -44,10 +44,14 @@ class EngineIOConverter {
static void ConvertInput(const std::string& op_type, const LoDTensor& in,
void* out, size_t max_size, cudaStream_t* stream) {
PADDLE_ENFORCE(stream != nullptr);
PADDLE_ENFORCE_NOT_NULL(stream,
platform::errors::InvalidArgument(
"The input stream must not be nullptr."));
auto* converter = Registry<EngineIOConverter>::Global().Lookup(
op_type, "default" /* default_type */);
PADDLE_ENFORCE_NOT_NULL(converter);
PADDLE_ENFORCE_NOT_NULL(
converter, platform::errors::Unimplemented(
"The %s in is not supported yet.", op_type.c_str()));
converter->SetStream(stream);
(*converter)(in, out, max_size);
}
......@@ -55,10 +59,14 @@ class EngineIOConverter {
static void ConvertOutput(const std::string& op_type, const void* in,
LoDTensor* out, size_t max_size,
cudaStream_t* stream) {
PADDLE_ENFORCE(stream != nullptr);
PADDLE_ENFORCE_NOT_NULL(stream,
platform::errors::InvalidArgument(
"The input stream must not be nullptr."));
auto* converter = Registry<EngineIOConverter>::Global().Lookup(
op_type, "default" /* default_type */);
PADDLE_ENFORCE_NOT_NULL(converter);
PADDLE_ENFORCE_NOT_NULL(
converter, platform::errors::Unimplemented(
"The %s in not supported yet.", op_type.c_str()));
converter->SetStream(stream);
(*converter)(in, out, max_size);
}
......
......@@ -53,7 +53,12 @@ class OpConverter {
OpConverter* it{nullptr};
if (op_desc.Type() == "mul") {
PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL,
platform::errors::InvalidArgument(
"The input op mul's Input(\"Y\")."
"size() should equal to 1, but reveceid "
"Input(\"Y\").size() = %u.",
op_desc.Input("Y").size()));
std::string Y = op_desc.Input("Y")[0];
if (parameters.count(Y)) {
it = Registry<OpConverter>::Global().Lookup("fc");
......@@ -66,38 +71,51 @@ class OpConverter {
// static std::unordered_set<std::string> add_weight_op_set {"add", "mul",
// "sub", "div"};
static std::unordered_set<std::string> add_weight_op_set{"add", "mul"};
PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL,
platform::errors::InvalidArgument(
"The input op's Input(\"Y\")."
"size() should equal to 1, but reveceid "
"Input(\"Y\").size() = %u.",
op_desc.Input("Y").size()));
int op_type_len = op_desc.Type().size();
std::string op_type = op_desc.Type().substr(op_type_len - 3, op_type_len);
std::string Y = op_desc.Input("Y")[0];
if (parameters.count(Y)) {
PADDLE_ENFORCE(add_weight_op_set.count(op_type) > 0,
"Unsupported elementwise type" + op_type);
PADDLE_ENFORCE_GT(
add_weight_op_set.count(op_type), 0,
platform::errors::Unimplemented("Unsupported elementwise type %s",
op_type.c_str()));
it = Registry<OpConverter>::Global().Lookup("elementwise_" + op_type +
"_weight");
PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
op_desc.Type());
PADDLE_ENFORCE_NOT_NULL(
it, platform::errors::Unimplemented(
"no OpConverter for optype [%s]", op_desc.Type()));
} else {
PADDLE_ENFORCE(add_tensor_op_set.count(op_type) > 0,
"Unsupported elementwise type" + op_type);
PADDLE_ENFORCE_GT(
add_tensor_op_set.count(op_type), 0,
platform::errors::Unimplemented("Unsupported elementwise type %s",
op_type.c_str()));
it = Registry<OpConverter>::Global().Lookup("elementwise_" + op_type +
"_tensor");
}
PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
op_desc.Type());
PADDLE_ENFORCE_NOT_NULL(
it, platform::errors::Unimplemented("no OpConverter for optype [%s]",
op_desc.Type()));
}
if (op_desc.Type() == "depthwise_conv2d") {
it = Registry<OpConverter>::Global().Lookup("conv2d");
PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
op_desc.Type());
PADDLE_ENFORCE_NOT_NULL(
it, platform::errors::Unimplemented("no OpConverter for optype [%s]",
op_desc.Type()));
}
if (!it) {
it = Registry<OpConverter>::Global().Lookup(op_desc.Type());
}
PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
op_desc.Type());
PADDLE_ENFORCE_NOT_NULL(
it, platform::errors::Unimplemented("no OpConverter for optype [%s]",
op_desc.Type()));
it->SetEngine(engine);
(*it)(op, scope, test_mode);
......@@ -149,9 +167,13 @@ class OpConverter {
for (auto& input : inputs) {
if (parameters.count(input)) continue;
auto* var = block_desc->FindVar(input);
PADDLE_ENFORCE(var, "no variable called %s", input);
PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
"TensorRT engine only takes LoDTensor as input");
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("no variable called %s in block.",
input.c_str()));
PADDLE_ENFORCE_EQ(
var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
platform::errors::InvalidArgument("TensorRT engine only takes "
"LoDTensor as input"));
auto var_shape = var->GetShape();
if (engine->with_dynamic_shape()) {
#if IS_TRT_VERSION_GE(6000)
......
......@@ -39,9 +39,22 @@ class PadOpConverter : public OpConverter {
nvinfer1::Dims input_shape = input->getDimensions();
int nbDims = input_shape.nbDims;
int pad_size = static_cast<int>(paddings.size());
PADDLE_ENFORCE_GE(nbDims, 2);
PADDLE_ENFORCE_EQ((nbDims + 1) * 2, pad_size);
PADDLE_ENFORCE(pad_value == 0.0, "The pad layer of TRT only support zero.");
PADDLE_ENFORCE_GE(
nbDims, 2,
platform::errors::InvalidArgument(
"Input X[0]'s dimension should greater than or equal to 2. "
"But received %d.",
nbDims));
PADDLE_ENFORCE_EQ(
(nbDims + 1) * 2, pad_size,
platform::errors::InvalidArgument("Input X[0]'s dimension(nbDims for "
"short) should meet the condition:"
"(nbDims + 1) * 2 == pad_size. But "
"received nbDims:%d, pad_size:%d.",
nbDims, pad_size));
PADDLE_ENFORCE_EQ(pad_value, 0.0,
platform::errors::InvalidArgument(
"The pad layer of TRT only support zero."));
nvinfer1::DimsHW pre_pad(paddings[pad_size - 4], paddings[pad_size - 2]);
nvinfer1::DimsHW post_pad(paddings[pad_size - 3], paddings[pad_size - 1]);
......@@ -50,7 +63,9 @@ class PadOpConverter : public OpConverter {
*const_cast<nvinfer1::ITensor*>(input),
pre_pad, post_pad);
PADDLE_ENFORCE(layer != nullptr);
PADDLE_ENFORCE_NOT_NULL(layer,
platform::errors::External(
"add padding layer to tensorrt engine error"));
auto output_name = op_desc.Output("Out")[0];
RreplenishLayerAndOutput(layer, "pad", {output_name}, test_mode);
}
......
......@@ -23,9 +23,8 @@ class SliceOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
    // This OP is implemented by trt dynamic shape plugin.
    // Dynamic shape plugin requires TRT version greater than 6.0.
#if IS_TRT_VERSION_GE(6000)
    // This OP is implemented by trt dynamic shape plugin.
    // Dynamic shape plugin requires TRT version greater than 6.0.
VLOG(4) << "convert slice op to tensorrt layer";
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
......@@ -38,27 +37,65 @@ class SliceOpConverter : public OpConverter {
std::vector<int> ends =
BOOST_GET_CONST(std::vector<int>, op_desc.GetAttr("ends"));
PADDLE_ENFORCE_EQ(
starts.size(), axes.size(),
platform::errors::InvalidArgument(
"The size of starts must be equal to the size of axes."));
PADDLE_ENFORCE_EQ(
ends.size(), axes.size(),
platform::errors::InvalidArgument(
"The size of ends must be equal to the size of axes."));
auto input_dims = input->getDimensions();
if (!engine_->with_dynamic_shape()) {
// notice that input shape is [CHW] without batch axis when input has
// static shape
for (size_t i = input_dims.nbDims; i > 0; i--) {
input_dims.d[i] = input_dims.d[i - 1];
}
input_dims.d[0] = 1; // fake batchsize, not useful here
for (size_t i = 0; i < axes.size(); i++) {
// split on batch is not supported in TensorRT
PADDLE_ENFORCE_NE(axes[i], 0, platform::errors::InvalidArgument(
"Invalid slice axis. Slice on batch "
"axis is not supported in TensorRT"));
if (starts[i] < 0) {
starts[i] = std::max(starts[i] + input_dims.d[axes[i]], 0);
}
if (ends[i] < 0) {
ends[i] = std::max(ends[i] + input_dims.d[axes[i]], 0);
}
ends[i] = std::min(ends[i], input_dims.d[axes[i]]);
PADDLE_ENFORCE_GT(
ends[i], starts[i],
platform::errors::InvalidArgument(
"Attr(ends) should be greater than attr(starts) in "
"slice op. But received ends = %d, starts = %d.",
ends[i], starts[i]));
}
}
nvinfer1::ILayer* layer = nullptr;
if (engine_->with_dynamic_shape()) {
#if IS_TRT_VERSION_GE(6000)
bool ban_fp16 = engine_->disable_trt_plugin_fp16();
plugin::SlicePluginDynamic* plugin =
new plugin::SlicePluginDynamic(starts, ends, ends, ban_fp16);
new plugin::SlicePluginDynamic(starts, ends, axes, ban_fp16);
layer = engine_->AddPluginV2(&input, 1, plugin);
} else {
PADDLE_THROW(platform::errors::Fatal(
"You are running the Ernie(Bert) model in static"
"shape mode, which is not supported for the time being.\n"
"You can use the config.SetTRTDynamicShapeInfo(...) interface"
" to set the shape information to run the dynamic shape mode."));
}
auto output_name = op_desc.Output("Out")[0];
RreplenishLayerAndOutput(layer, "skip_layernorm", {output_name}, test_mode);
#else
PADDLE_THROW(platform::errors::Fatal(
"You are running the TRT Dynamic Shape mode, need to confirm that "
"your TRT version is no less than 6.0"));
#endif
} else {
bool ban_fp16 = engine_->disable_trt_plugin_fp16();
plugin::SlicePlugin* plugin =
new plugin::SlicePlugin(starts, ends, axes, ban_fp16);
layer = engine_->AddPlugin(&input, 1, plugin);
}
auto output_name = op_desc.Output("Out")[0];
RreplenishLayerAndOutput(layer, "slice", {output_name}, test_mode);
}
};
......
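A worked example of the start/end normalization in the static-shape branch above (shapes hypothetical):

// input_dims.d = {1, 4, 8, 8} after the fake batch axis is prepended;
// axes = {2}, starts = {-3}, ends = {100}:
//   starts[0] = max(-3 + input_dims.d[2], 0) = max(-3 + 8, 0) = 5
//   ends[0]   = min(100, input_dims.d[2])    = min(100, 8)    = 8
// so the slice keeps elements [5, 8) along axis 2, and the
// ends > starts check passes (8 > 5).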
......@@ -28,11 +28,20 @@ class SwishOpConverter : public OpConverter {
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
int input_num = op_desc.Input("X").size();
PADDLE_ENFORCE(input_num == 1);
PADDLE_ENFORCE_EQ(input_num, 1,
platform::errors::InvalidArgument(
"The input X's size must equal to 1 in TRT swish op."
" But received X's size %d.",
input_num));
auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
// Get output
size_t output_num = op_desc.Output("Out").size();
PADDLE_ENFORCE(output_num == 1);
PADDLE_ENFORCE_EQ(
output_num, 1UL,
platform::errors::InvalidArgument(
"The ouput Out's size must equal to 1 in TRT swish op. "
"But received Out's size %u.",
output_num));
// Get attrs
float beta = BOOST_GET_CONST(float, op_desc.GetAttr("beta"));
......
......@@ -49,7 +49,10 @@ void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place,
const platform::DeviceContext& ctx) {
auto dims = tensor->dims();
size_t num_elements = analysis::AccuDims(dims, dims.size());
PADDLE_ENFORCE_GT(num_elements, 0);
PADDLE_ENFORCE_GT(
num_elements, 0UL,
platform::errors::PermissionDenied("RandomizeTensor only can be used for "
"tensor which dims is not zero."));
platform::CPUPlace cpu_place;
framework::LoDTensor temp_tensor;
......@@ -79,7 +82,8 @@ class TRTConvertValidation {
scope_(scope),
if_add_batch_(if_add_batch),
max_batch_size_(max_batch_size) {
PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0,
platform::errors::External("cudaStreamCreate error."));
engine_.reset(new TensorRTEngine(max_batch_size, workspace_size));
engine_->InitNetwork();
}
......@@ -154,7 +158,12 @@ class TRTConvertValidation {
void Execute(int batch_size,
std::unordered_set<std::string> neglected_output = {}) {
// Execute Fluid Op
PADDLE_ENFORCE_LE(batch_size, max_batch_size_);
PADDLE_ENFORCE_LE(batch_size, max_batch_size_,
platform::errors::InvalidArgument(
"Runtime batch_size should be less than or equal to "
"max_batch_size_. "
"But received batch_size:%d, max_batch_size_:%d",
batch_size, max_batch_size_));
platform::CUDADeviceContext ctx(place_);
op_->Run(scope_, place_);
cudaStreamSynchronize(stream_);
......
......@@ -31,6 +31,7 @@ struct SimpleOpTypeSetTeller : public Teller {
teller_set.insert("fused_embedding_eltwise_layernorm");
teller_set.insert("multihead_matmul");
teller_set.insert("skip_layernorm");
teller_set.insert("slice");
#endif
}
......
......@@ -26,8 +26,10 @@ namespace inference {
namespace tensorrt {
namespace plugin {
// Dynamic Plugin below.
#if IS_TRT_VERSION_GE(6000)
SlicePlugin *CreateSlicePluginDeserialize(const void *buffer, size_t length) {
return new SlicePlugin(buffer, length);
}
REGISTER_TRT_PLUGIN("slice_plugin", CreateSlicePluginDeserialize);
template <typename T>
__global__ void SliceKernel(int num, int dims, const T *input,
......@@ -56,11 +58,196 @@ __global__ void SliceKernel(int num, int dims, const T *input,
}
}
SlicePlugin::SlicePlugin(std::vector<int> starts, std::vector<int> ends,
std::vector<int> axes, bool ban_fp16)
: starts_(starts), ends_(ends), axes_(axes), ban_fp16_(ban_fp16) {
cudaEventCreate(&copy_event_);
cudaStreamCreate(&copy_stream_);
}
SlicePlugin::SlicePlugin(void const *serial_data, size_t serial_length) {
deserializeBase(serial_data, serial_length);
DeserializeValue(&serial_data, &serial_length, &starts_);
DeserializeValue(&serial_data, &serial_length, &ends_);
DeserializeValue(&serial_data, &serial_length, &axes_);
DeserializeValue(&serial_data, &serial_length, &ban_fp16_);
cudaEventCreate(&copy_event_);
cudaStreamCreate(&copy_stream_);
}
SlicePlugin::~SlicePlugin() {
cudaStreamDestroy(copy_stream_);
cudaEventDestroy(copy_event_);
cudaFree(offset_temp_data_);
}
SlicePlugin *SlicePlugin::clone() const {
return new SlicePlugin(starts_, ends_, axes_, ban_fp16_);
}
bool SlicePlugin::supportsFormat(nvinfer1::DataType type,
nvinfer1::PluginFormat format) const {
#ifdef SUPPORTS_CUDA_FP16
return ((type == nvinfer1::DataType::kFLOAT ||
type == nvinfer1::DataType::kHALF) &&
(format == nvinfer1::PluginFormat::kNCHW));
#else
return ((type == nvinfer1::DataType::kFLOAT) &&
(format == nvinfer1::PluginFormat::kNCHW));
#endif
}
nvinfer1::Dims SlicePlugin::getOutputDimensions(int index,
const nvinfer1::Dims *inputs,
int nb_input_dims) {
auto in_dims = inputs[0];
nvinfer1::Dims out_dims = in_dims;
for (size_t i = 0; i < axes_.size(); i++) {
int start = starts_[i];
int end = ends_[i];
out_dims.d[axes_[i] - 1] = end - start;
}
return out_dims;
}
int SlicePlugin::enqueue(int batch_size, const void *const *inputs,
void **outputs, void *workspace, cudaStream_t stream) {
auto input_dims = getInputDims(0);
// notice input dims is [C, H, W], add input batch dim here
auto out_dims = getOutputDimensions(0, &input_dims, 1);
input_dims.nbDims += 1;
out_dims.nbDims += 1;
for (auto i = input_dims.nbDims; i > 0; --i) {
input_dims.d[i] = input_dims.d[i - 1];
out_dims.d[i] = out_dims.d[i - 1];
}
input_dims.d[0] = batch_size;
out_dims.d[0] = batch_size;
auto num_dims = input_dims.nbDims;
size_t out_num = ProductDim(out_dims);
std::vector<int> seg_offsets;
std::vector<int> offsets;
std::vector<int> extends;
offsets.resize(num_dims);
extends.resize(num_dims);
seg_offsets.resize(num_dims);
seg_offsets[num_dims - 1] = 1;
for (int i = num_dims - 2; i >= 0; i--) {
seg_offsets[i] = input_dims.d[i + 1] * seg_offsets[i + 1];
}
for (size_t i = 0; i < num_dims; ++i) {
offsets[i] = 0;
extends[i] = out_dims.d[i];
}
for (size_t i = 0; i < axes_.size(); ++i) {
offsets[axes_[i]] = starts_[i];
}
std::vector<int> offset_info;
for (size_t i = 0; i < num_dims; ++i) {
offset_info.push_back(offsets[i]);
offset_info.push_back(extends[i]);
offset_info.push_back(seg_offsets[i]);
}
if (offset_temp_data_ == nullptr) {
cudaMalloc(&offset_temp_data_, 3 * num_dims * sizeof(int));
}
cudaMemcpyAsync(offset_temp_data_, offset_info.data(),
sizeof(int) * 3 * num_dims, cudaMemcpyHostToDevice,
copy_stream_);
cudaEventRecord(copy_event_, copy_stream_);
cudaStreamWaitEvent(stream, copy_event_, 0);
int threads = 256;
int blocks = (out_num + threads - 1) / threads;
auto input_type = getDataType();
if (input_type == nvinfer1::DataType::kFLOAT) {
const float *input1 = static_cast<const float *>(inputs[0]);
float *output = static_cast<float *>(outputs[0]);
SliceKernel<float><<<blocks, threads, 3 * num_dims * sizeof(int), stream>>>(
out_num, num_dims, input1, offset_temp_data_, output);
} else if (input_type == nvinfer1::DataType::kHALF) {
#ifdef SUPPORTS_CUDA_FP16
const half *input1 = static_cast<const half *>(inputs[0]);
half *output = static_cast<half *>(outputs[0]);
SliceKernel<half><<<blocks, threads, 3 * num_dims * sizeof(int), stream>>>(
out_num, num_dims, input1, offset_temp_data_, output);
#else
PADDLE_THROW(platform::errors::Fatal(
"The cuda archs you specific should greater than 600."));
#endif
} else {
PADDLE_THROW(platform::errors::Fatal(
"The Slice TRT Plugin's input type should be float or half."));
}
return cudaGetLastError() != cudaSuccess;
}
size_t SlicePlugin::getSerializationSize() {
return getBaseSerializationSize() + SerializedSize(getPluginType()) +
SerializedSize(starts_) + SerializedSize(ends_) +
SerializedSize(axes_) + SerializedSize(ban_fp16_);
}
void SlicePlugin::serialize(void *buffer) {
SerializeValue(&buffer, getPluginType());
serializeBase(buffer);
SerializeValue(&buffer, starts_);
SerializeValue(&buffer, ends_);
SerializeValue(&buffer, axes_);
SerializeValue(&buffer, ban_fp16_);
}
// Dynamic Plugin below.
#if IS_TRT_VERSION_GE(6000)
SlicePluginDynamic::SlicePluginDynamic(std::vector<int> starts,
std::vector<int> ends,
std::vector<int> axes, bool ban_fp16)
: starts_(starts), ends_(ends), axes_(axes), ban_fp16_(ban_fp16) {
cudaEventCreate(&copy_event_);
cudaStreamCreate(&copy_stream_);
}
SlicePluginDynamic::SlicePluginDynamic(void const *serialData,
size_t serialLength) {
DeserializeValue(&serialData, &serialLength, &starts_);
DeserializeValue(&serialData, &serialLength, &ends_);
DeserializeValue(&serialData, &serialLength, &axes_);
DeserializeValue(&serialData, &serialLength, &ban_fp16_);
cudaEventCreate(&copy_event_);
cudaStreamCreate(&copy_stream_);
}
void SlicePluginDynamic::destroy() {
cudaStreamDestroy(copy_stream_);
cudaEventDestroy(copy_event_);
cudaFree(offset_temp_data_);
delete this;
}
int SlicePluginDynamic::initialize() { return 0; }
size_t SlicePluginDynamic::getSerializationSize() const { return 0; }
void SlicePluginDynamic::serialize(void *buffer) const {}
size_t SlicePluginDynamic::getSerializationSize() const {
  size_t size = SerializedSize(starts_) + SerializedSize(ends_) +
                SerializedSize(axes_) + SerializedSize(ban_fp16_);
  return size;
}
void SlicePluginDynamic::serialize(void *buffer) const {
SerializeValue(&buffer, starts_);
SerializeValue(&buffer, ends_);
SerializeValue(&buffer, axes_);
SerializeValue(&buffer, ban_fp16_);
}
nvinfer1::DimsExprs SlicePluginDynamic::getOutputDimensions(
int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
......@@ -136,9 +323,9 @@ int SlicePluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc,
std::vector<int> offsets;
std::vector<int> extends;
offsets.reserve(num_dims);
extends.reserve(num_dims);
seg_offsets.reserve(num_dims);
offsets.resize(num_dims);
extends.resize(num_dims);
seg_offsets.resize(num_dims);
seg_offsets[num_dims - 1] = 1;
for (int i = num_dims - 2; i >= 0; i--) {
......@@ -160,16 +347,16 @@ int SlicePluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc,
offset_info.push_back(seg_offsets[i]);
}
  framework::Tensor offset_temp_tensor;
  int device_id;
  cudaGetDevice(&device_id);
  offset_temp_tensor.Resize({3 * num_dims});
  auto *offset_temp_data =
      offset_temp_tensor.mutable_data<int>(platform::CUDAPlace(device_id));
  cudaMemcpyAsync(offset_temp_data, offset_info.data(),
                  sizeof(int) * 3 * num_dims, cudaMemcpyHostToDevice, stream);
  if (offset_temp_data_ == nullptr) {
    cudaMalloc(&offset_temp_data_, 3 * num_dims * sizeof(int));
  }
  cudaMemcpyAsync(offset_temp_data_, offset_info.data(),
                  sizeof(int) * 3 * num_dims, cudaMemcpyHostToDevice,
                  copy_stream_);
  cudaEventRecord(copy_event_, copy_stream_);
  cudaStreamWaitEvent(stream, copy_event_, 0);
int threads = 256;
int blocks = (out_num + threads - 1) / threads;
......@@ -178,13 +365,13 @@ int SlicePluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc,
const float *input1 = static_cast<const float *>(inputs[0]);
float *output = static_cast<float *>(outputs[0]);
SliceKernel<float><<<blocks, threads, 3 * num_dims * sizeof(int), stream>>>(
out_num, num_dims, input1, offset_temp_data, output);
out_num, num_dims, input1, offset_temp_data_, output);
} else if (input_type == nvinfer1::DataType::kHALF) {
#ifdef SUPPORTS_CUDA_FP16
const half *input1 = static_cast<const half *>(inputs[0]);
half *output = static_cast<half *>(outputs[0]);
SliceKernel<half><<<blocks, threads, 3 * num_dims * sizeof(int), stream>>>(
out_num, num_dims, input1, offset_temp_data, output);
out_num, num_dims, input1, offset_temp_data_, output);
#else
PADDLE_THROW(platform::errors::Fatal(
"The cuda archs you specific should greater than 600."));
......
......@@ -26,17 +26,56 @@ namespace inference {
namespace tensorrt {
namespace plugin {
class SlicePlugin : public PluginTensorRT {
public:
explicit SlicePlugin(std::vector<int> starts, std::vector<int> ends,
std::vector<int> axes, bool ban_fp16);
// It was used for tensorrt deserialization.
// It should not be called by users.
SlicePlugin(void const* serial_data, size_t serial_length);
~SlicePlugin();
SlicePlugin* clone() const override;
const char* getPluginType() const override { return "slice_plugin"; }
int getNbOutputs() const override { return 1; }
int initialize() override { return 0; }
bool supportsFormat(nvinfer1::DataType type,
nvinfer1::PluginFormat format) const override;
nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs,
int nb_input_dims) override;
int enqueue(int batch_size, const void* const* inputs, void** outputs,
void* workspace, cudaStream_t stream) override;
protected:
size_t getSerializationSize() override;
  // TRT will call this func to serialize the plugin configuration.
// It should not be called by users.
void serialize(void* buffer) override;
private:
std::vector<int> starts_;
std::vector<int> ends_;
std::vector<int> axes_;
bool ban_fp16_{false};
int* offset_temp_data_{nullptr};
cudaEvent_t copy_event_;
cudaStream_t copy_stream_;
};
#if IS_TRT_VERSION_GE(6000)
class SlicePluginDynamic : public DynamicPluginTensorRT {
public:
  explicit SlicePluginDynamic(std::vector<int> starts, std::vector<int> ends,
                              std::vector<int> axes, bool ban_fp16)
      : starts_(starts), ends_(ends), axes_(axes), ban_fp16_(ban_fp16) {}
  SlicePluginDynamic(void const* serialData, size_t serialLength) {}
  explicit SlicePluginDynamic(std::vector<int> starts, std::vector<int> ends,
                              std::vector<int> axes, bool ban_fp16);
  SlicePluginDynamic(void const* serialData, size_t serialLength);
  nvinfer1::IPluginV2DynamicExt* clone() const override {
    return new SlicePluginDynamic(starts_, ends_, axes_, ban_fp16_);
  }
const char* getPluginType() const override { return "slice_plugin"; }
int getNbOutputs() const override { return 1; }
int initialize() override;
......@@ -72,15 +111,54 @@ class SlicePluginDynamic : public DynamicPluginTensorRT {
const nvinfer1::DataType* inputTypes,
int nbInputs) const override;
void destroy() override { delete this; }
void destroy() override;
private:
std::vector<int> starts_;
std::vector<int> ends_;
std::vector<int> axes_;
bool ban_fp16_{false};
int* offset_temp_data_{nullptr};
cudaEvent_t copy_event_;
cudaStream_t copy_stream_;
};
class SlicePluginV2Creator : public nvinfer1::IPluginCreator {
public:
SlicePluginV2Creator() {}
const char* getPluginName() const override { return "slice_plugin"; }
const char* getPluginVersion() const override { return "1"; }
const nvinfer1::PluginFieldCollection* getFieldNames() override {
return &field_collection_;
}
  nvinfer1::IPluginV2* createPlugin(
      const char* name, const nvinfer1::PluginFieldCollection* fc) override {
    // Creation from a field collection is not supported; this creator only
    // restores plugins from serialized engines via deserializePlugin below.
    return nullptr;
  }
nvinfer1::IPluginV2* deserializePlugin(const char* name,
const void* serialData,
size_t serialLength) override {
auto plugin = new SlicePluginDynamic(serialData, serialLength);
return plugin;
}
void setPluginNamespace(const char* libNamespace) override {
namespace_ = libNamespace;
}
const char* getPluginNamespace() const override { return namespace_.c_str(); }
private:
std::string namespace_;
nvinfer1::PluginFieldCollection field_collection_;
};
REGISTER_TRT_PLUGIN_V2(SlicePluginV2Creator);
#endif
} // namespace plugin
......
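With REGISTER_TRT_PLUGIN_V2 in place, TensorRT can rebuild the dynamic plugin when a serialized engine is loaded: the runtime looks the creator up by the name/version pair returned above and hands it the serialized bytes. A rough sketch of that lookup, assuming the standard TensorRT registry entry points and hypothetical serial_data/serial_len variables:

auto *creator = getPluginRegistry()->getPluginCreator("slice_plugin", "1");
if (creator != nullptr) {
  // serial_data / serial_len come from the serialized engine blob.
  nvinfer1::IPluginV2 *plugin =
      creator->deserializePlugin("slice_plugin", serial_data, serial_len);
}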
......@@ -480,10 +480,9 @@ if(WITH_GPU AND TENSORRT_FOUND)
inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz")
endif()
# disable test_trt_dynamic_shape_ernie_ser_deser temporary
#inference_analysis_test(test_trt_dynamic_shape_ernie_ser_deser SRCS trt_dynamic_shape_ernie_deserialize_test.cc
# EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
# ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized)
inference_analysis_test(test_trt_dynamic_shape_ernie_ser_deser SRCS trt_dynamic_shape_ernie_deserialize_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized)
endif()
......
......@@ -245,8 +245,14 @@ TEST(Analyzer_bert, transfer_scope_cache) {
// Since paddle::framework::global_transfer_scope_cache() and
// paddle::framework::global_transfer_data_cache() are thread_local,
// their pointer should be different among different thread id.
PADDLE_ENFORCE(global_transfer_scope_cache.size(), threads_num);
PADDLE_ENFORCE(global_transfer_data_cache.size(), threads_num);
PADDLE_ENFORCE_EQ(
global_transfer_scope_cache.size(), threads_num,
paddle::platform::errors::Fatal(
"The size of scope cache is not equal to thread number."));
PADDLE_ENFORCE_EQ(
global_transfer_data_cache.size(), threads_num,
paddle::platform::errors::Fatal(
"The size of data cache is not equal to thread number."));
}
} // namespace inference
......
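Most of the remaining hunks in this commit apply one recipe: replace bare PADDLE_ENFORCE / CHECK calls with the typed comparison macros plus a platform::errors payload, so a failure reports what was compared and why. A generic sketch of the pattern, with illustrative names:

PADDLE_ENFORCE_EQ(
    vec.size(), expected,
    paddle::platform::errors::InvalidArgument(
        "The size of vec should be %d, but received %d.", expected,
        vec.size()));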
......@@ -69,11 +69,13 @@ void PD_run() {
PD_DeletePaddleTensor(input);
int size;
const int* out_shape = PD_GetPaddleTensorShape(out_data, &size);
CHECK(size == 2) << "The Output shape's size is NOT match.";
PADDLE_ENFORCE_EQ(size, 2, paddle::platform::errors::InvalidArgument(
"The Output shape's size is NOT match."));
std::vector<int> ref_outshape_size({9, 6});
for (int i = 0; i < 2; ++i) {
CHECK(out_shape[i] == ref_outshape_size[i])
<< "The Output's shape is NOT match.";
    PADDLE_ENFORCE_EQ(out_shape[i], ref_outshape_size[i],
                      paddle::platform::errors::InvalidArgument(
                          "The Output's shape does NOT match."));
}
PD_DeletePaddleBuf(buf);
}
......
......@@ -36,9 +36,9 @@ void zero_copy_run() {
PD_SwitchIrDebug(config, true);
PD_SetModel(config, prog_file.c_str(), params_file.c_str());
bool use_feed_fetch = PD_UseFeedFetchOpsEnabled(config);
CHECK(!use_feed_fetch) << "NO";
EXPECT_FALSE(use_feed_fetch);
bool specify_input_names = PD_SpecifyInputName(config);
CHECK(specify_input_names) << "NO";
EXPECT_TRUE(specify_input_names);
const int batch_size = 1;
const int channels = 3;
......@@ -85,13 +85,13 @@ TEST(PD_AnalysisConfig, profile_mkldnn) {
PD_SwitchIrDebug(config, true);
PD_EnableMKLDNN(config);
bool mkldnn_enable = PD_MkldnnEnabled(config);
CHECK(mkldnn_enable) << "NO";
EXPECT_TRUE(mkldnn_enable);
PD_EnableMkldnnQuantizer(config);
bool quantizer_enable = PD_MkldnnQuantizerEnabled(config);
CHECK(quantizer_enable) << "NO";
EXPECT_TRUE(quantizer_enable);
PD_EnableMkldnnBfloat16(config);
bool bfloat16_enable = PD_MkldnnBfloat16Enabled(config);
CHECK(bfloat16_enable) << "NO";
EXPECT_TRUE(bfloat16_enable);
PD_SetMkldnnCacheCapacity(config, 0);
PD_SetModel(config, prog_file.c_str(), params_file.c_str());
PD_DeleteAnalysisConfig(config);
......
......@@ -126,7 +126,9 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
std::string turn_mask_pre = "turn_mask_";
auto one_batch = data->NextBatch();
PADDLE_ENFORCE(!one_batch.response.empty());
PADDLE_ENFORCE(
!one_batch.response.empty(),
paddle::platform::errors::Fatal("The response of one batch is empty."));
int size = one_batch.response[0].size();
CHECK_EQ(size, kMaxTurnLen);
// turn tensor assignment
......@@ -214,11 +216,17 @@ void profile(bool use_mkldnn = false) {
input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
PADDLE_ENFORCE_GT(outputs.size(), 0);
PADDLE_ENFORCE_GT(outputs.size(), 0,
paddle::platform::errors::Fatal(
"The size of outputs should be greater than 0."));
auto output = outputs.back();
PADDLE_ENFORCE_GT(output.size(), 0);
PADDLE_ENFORCE_GT(output.size(), 0,
paddle::platform::errors::Fatal(
"The size of output should be greater than 0."));
size_t size = GetSize(output[0]);
PADDLE_ENFORCE_GT(size, 0);
PADDLE_ENFORCE_GT(size, 0,
paddle::platform::errors::Fatal(
"The size of output should be greater than 0."));
float *result = static_cast<float *>(output[0].data.data());
for (size_t i = 0; i < size; i++) {
EXPECT_NEAR(result[i], result_data[i], 1e-3);
......
......@@ -146,8 +146,9 @@ std::shared_ptr<std::vector<PaddleTensor>> GetWarmupData(
auto iterations = test_data.size();
PADDLE_ENFORCE_LE(
static_cast<size_t>(num_images), iterations * test_data_batch_size,
paddle::platform::errors::Fatal(
"The requested quantization warmup data size " +
std::to_string(num_images) + " is bigger than all test data size.");
std::to_string(num_images) + " is bigger than all test data size."));
PaddleTensor images;
images.name = "image";
......@@ -237,8 +238,9 @@ std::shared_ptr<std::vector<PaddleTensor>> GetWarmupData(
}
PADDLE_ENFORCE_EQ(
static_cast<size_t>(num_objects), static_cast<size_t>(objects_accum),
"The requested num of objects " + std::to_string(num_objects) +
" is the same as objects_accum.");
paddle::platform::errors::Fatal("The requested num of objects " +
std::to_string(num_objects) +
" is the same as objects_accum."));
auto warmup_data = std::make_shared<std::vector<PaddleTensor>>(4);
(*warmup_data)[0] = std::move(images);
......
......@@ -98,7 +98,9 @@ void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
input_tensor.name = "word";
input_tensor.dtype = PaddleDType::INT64;
TensorAssignData<int64_t>(&input_tensor, {one_batch.data}, one_batch.lod);
PADDLE_ENFORCE_EQ(batch_size, static_cast<int>(one_batch.lod.size() - 1));
PADDLE_ENFORCE_EQ(
batch_size, static_cast<int>(one_batch.lod.size() - 1),
paddle::platform::errors::Fatal("The lod size of one batch is invaild."));
input_slots->assign({input_tensor});
}
......@@ -137,12 +139,17 @@ TEST(Analyzer_LAC, profile) {
24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25, 25, 25, 25, 25,
44, 24, 25, 25, 25, 36, 42, 43, 44, 14, 15, 44, 14, 15, 44, 14,
15, 44, 38, 39, 14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23};
PADDLE_ENFORCE_GT(outputs.size(), 0);
PADDLE_ENFORCE_GT(outputs.size(), 0,
paddle::platform::errors::Fatal(
"The size of output should be greater than 0."));
auto output = outputs.back();
PADDLE_ENFORCE_EQ(output.size(), 1UL);
PADDLE_ENFORCE_EQ(output.size(), 1UL,
paddle::platform::errors::Fatal(
"The size of output should be equal to 1."));
size_t size = GetSize(output[0]);
size_t batch1_size = sizeof(lac_ref_data) / sizeof(int64_t);
PADDLE_ENFORCE_GE(size, batch1_size);
PADDLE_ENFORCE_GE(size, batch1_size, paddle::platform::errors::Fatal(
"The size of batch is invaild."));
int64_t *pdata = static_cast<int64_t *>(output[0].data.data());
for (size_t i = 0; i < batch1_size; ++i) {
EXPECT_EQ(pdata[i], lac_ref_data[i]);
......
......@@ -117,11 +117,17 @@ void profile(bool memory_load = false) {
// the first inference result
const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
48, 39, 38, 16, 25};
PADDLE_ENFORCE_GT(outputs.size(), 0);
PADDLE_ENFORCE_GT(outputs.size(), 0,
paddle::platform::errors::Fatal(
"The size of output should be greater than 0."));
auto output = outputs.back();
PADDLE_ENFORCE_EQ(output.size(), 1UL);
PADDLE_ENFORCE_EQ(output.size(), 1UL,
paddle::platform::errors::Fatal(
"The size of output should be equal to 1."));
size_t size = GetSize(output[0]);
PADDLE_ENFORCE_GT(size, 0);
PADDLE_ENFORCE_GT(size, 0,
paddle::platform::errors::Fatal(
"The size of output should be greater than 0."));
int64_t *result = static_cast<int64_t *>(output[0].data.data());
for (size_t i = 0; i < std::min<size_t>(11, size); i++) {
EXPECT_EQ(result[i], chinese_ner_result_data[i]);
......
......@@ -136,11 +136,17 @@ TEST(Analyzer_Pyramid_DNN, profile) {
input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data && !FLAGS_zero_copy) {
PADDLE_ENFORCE_GT(outputs.size(), 0);
PADDLE_ENFORCE_GT(outputs.size(), 0,
paddle::platform::errors::Fatal(
"The size of output should be greater than 0."));
auto output = outputs.back();
PADDLE_ENFORCE_EQ(output.size(), 1UL);
PADDLE_ENFORCE_EQ(output.size(), 1UL,
paddle::platform::errors::Fatal(
"The size of output should be equal to 1."));
size_t size = GetSize(output[0]);
PADDLE_ENFORCE_GT(size, 0);
PADDLE_ENFORCE_GT(size, 0,
paddle::platform::errors::Fatal(
"The size of output should be greater than 0."));
float *result = static_cast<float *>(output[0].data.data());
// output is probability, which is in (0, 1).
for (size_t i = 0; i < size; i++) {
......
......@@ -135,11 +135,17 @@ TEST(Analyzer_rnn2, profile) {
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
// the first inference result
PADDLE_ENFORCE_GT(outputs.size(), 0);
PADDLE_ENFORCE_GT(outputs.size(), 0,
paddle::platform::errors::Fatal(
"The size of output should be greater than 0."));
auto output = outputs.back();
PADDLE_ENFORCE_GT(output.size(), 0);
PADDLE_ENFORCE_GT(output.size(), 0,
paddle::platform::errors::Fatal(
"The size of output should be greater than 0."));
size_t size = GetSize(output[0]);
PADDLE_ENFORCE_GT(size, 0);
PADDLE_ENFORCE_GT(size, 0,
paddle::platform::errors::Fatal(
"The size of output should be greater than 0."));
float *result = static_cast<float *>(output[0].data.data());
for (size_t i = 0; i < size; i++) {
EXPECT_NEAR(result[i], result_data[i], 1e-3);
......
......@@ -47,7 +47,8 @@ struct DataRecord {
num_lines++;
std::vector<std::string> data;
split(line, '\t', &data);
PADDLE_ENFORCE(data.size() >= 4);
      PADDLE_ENFORCE_GE(data.size(), 4, paddle::platform::errors::Fatal(
                                            "The size of data is invalid."));
// load title1 data
std::vector<int64_t> title1_data;
split_to_int64(data[0], ' ', &title1_data);
......@@ -120,11 +121,17 @@ TEST(Analyzer_seq_conv1, profile) {
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
// the first inference result
PADDLE_ENFORCE_GT(outputs.size(), 0);
PADDLE_ENFORCE_GT(outputs.size(), 0,
paddle::platform::errors::Fatal(
"The size of output should be greater than 0."));
auto output = outputs.back();
PADDLE_ENFORCE_EQ(output.size(), 1UL);
PADDLE_ENFORCE_EQ(output.size(), 1UL,
paddle::platform::errors::Fatal(
"The size of output should be equal to 0."));
size_t size = GetSize(output[0]);
PADDLE_ENFORCE_GT(size, 0);
PADDLE_ENFORCE_GT(size, 0,
paddle::platform::errors::Fatal(
"The size of output should be greater than 0."));
float *result = static_cast<float *>(output[0].data.data());
// output is probability, which is in (0, 1).
for (size_t i = 0; i < size; i++) {
......
......@@ -56,20 +56,26 @@ struct DataRecord {
std::vector<float> slot_data;
split_to_float(data[1], ' ', &slot_data);
std::string name = data[0];
PADDLE_ENFORCE_EQ(slot_data.size() % 11, 0UL,
"line %d, %s should be divisible", num_lines, name);
PADDLE_ENFORCE_EQ(
slot_data.size() % 11, 0UL,
paddle::platform::errors::Fatal("line %d, %s should be divisible",
num_lines, name));
datasets[name].emplace_back(std::move(slot_data));
}
num_samples = num_lines / num_slots;
PADDLE_ENFORCE_EQ(num_samples * num_slots, static_cast<size_t>(num_lines),
"num samples should be divisible");
PADDLE_ENFORCE_GT(num_samples, 0UL);
PADDLE_ENFORCE_EQ(
num_samples * num_slots, static_cast<size_t>(num_lines),
paddle::platform::errors::Fatal("num samples should be divisible"));
PADDLE_ENFORCE_GT(num_samples, 0UL,
paddle::platform::errors::Fatal(
"The num of samples should be greater than 0."));
}
void Prepare(int bs) {
for (auto it = datasets.begin(); it != datasets.end(); ++it) {
PADDLE_ENFORCE_EQ(it->second.size(), num_samples,
"size of each slot should be equal");
PADDLE_ENFORCE_EQ(
it->second.size(), num_samples,
paddle::platform::errors::Fatal("size of each slot should be equal"));
}
size_t num_batches = num_samples / bs;
EXPECT_GT(num_batches, 0UL);
......@@ -90,8 +96,10 @@ struct DataRecord {
std::copy(datas[id].begin(), datas[id].end(),
std::back_inserter(slot.data[k]));
size_t len = datas[id].size() / 11;
PADDLE_ENFORCE_EQ(len * 11, datas[id].size(),
"%s %d size should be divisible", slot.name, id);
PADDLE_ENFORCE_EQ(
len * 11, datas[id].size(),
paddle::platform::errors::Fatal("%s %d size should be divisible",
slot.name, id));
lod[k + 1] = lod[k] + len;
}
slot.shape.assign({static_cast<int>(lod[bs]), 11});
......
......@@ -22,7 +22,9 @@ struct DataReader {
: file(new std::ifstream(path)) {}
bool NextBatch(std::vector<PaddleTensor> *input, int batch_size) {
PADDLE_ENFORCE_EQ(batch_size, 1);
PADDLE_ENFORCE_EQ(batch_size, 1,
paddle::platform::errors::Fatal(
"The size of batch should be equal to 1."));
std::string line;
PaddleTensor tensor;
tensor.dtype = PaddleDType::INT64;
......@@ -81,7 +83,9 @@ TEST(Analyzer_Text_Classification, profile) {
if (FLAGS_num_threads == 1) {
// Get output
PADDLE_ENFORCE_GT(outputs.size(), 0);
PADDLE_ENFORCE_GT(outputs.size(), 0,
paddle::platform::errors::Fatal(
"The size of output should be greater than 0."));
LOG(INFO) << "get outputs " << outputs.back().size();
for (auto &output : outputs.back()) {
LOG(INFO) << "output.shape: " << to_string(output.shape);
......
......@@ -59,7 +59,9 @@ void SetConfig(AnalysisConfig *cfg) {
}
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data.");
PADDLE_ENFORCE_EQ(
FLAGS_test_all_data, 0,
paddle::platform::errors::Fatal("Only have single batch of data."));
std::string line;
std::ifstream file(FLAGS_infer_data);
std::getline(file, line);
......@@ -99,7 +101,9 @@ void profile(bool use_mkldnn = false) {
auto refer = ProcessALine(line);
file.close();
PADDLE_ENFORCE_GT(outputs.size(), 0);
PADDLE_ENFORCE_GT(outputs.size(), 0,
paddle::platform::errors::Fatal(
"The size of output should be greater than 0."));
auto &output = outputs.back().front();
size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
CHECK_EQ(numel, refer.data.size());
......
......@@ -12,15 +12,33 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <dirent.h>
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <unistd.h>
#include "paddle/fluid/inference/tests/api/trt_test_helper.h"
namespace paddle {
namespace inference {
int DeleteCache(std::string path) {
DIR* dir = opendir(path.c_str());
if (dir == NULL) return 0;
struct dirent* ptr;
while ((ptr = readdir(dir)) != NULL) {
if (std::strcmp(ptr->d_name, ".") == 0 ||
std::strcmp(ptr->d_name, "..") == 0) {
continue;
    } else if (ptr->d_type == 8) {  // DT_REG: a regular file
      std::string file_rm = path + "/" + ptr->d_name;
      remove(file_rm.c_str());
    }
  }
  closedir(dir);
  return 0;
}
void run(const AnalysisConfig& config, std::vector<float>* out_data) {
auto predictor = CreatePaddlePredictor(config);
auto input_names = predictor->GetInputNames();
......@@ -86,6 +104,11 @@ void run(const AnalysisConfig& config, std::vector<float>* out_data) {
void trt_ernie(bool with_fp16, std::vector<float> result) {
AnalysisConfig config;
std::string model_dir = FLAGS_infer_model;
// Delete serialization cache to perform serialization first rather than
// deserialization.
std::string opt_cache_dir = FLAGS_infer_model + "/_opt_cache";
DeleteCache(opt_cache_dir);
SetConfig(&config, model_dir, true /* use_gpu */);
config.SwitchUseFeedFetchOps(false);
......
......@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/errors.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -162,7 +163,8 @@ void TestInference(const std::string& dirname,
// int device_id = place.GetDeviceId();
paddle::platform::SetDeviceId(0);
#else
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
PADDLE_THROW(paddle::platform::errors::Unavailable(
"'CUDAPlace' is not supported in CPU only device."));
#endif
}
......
......@@ -16,6 +16,7 @@
#include <random>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
......@@ -41,12 +42,14 @@ TEST(BestFitAllocator, concurrent_cuda) {
LockedAllocator concurrent_allocator(
std::unique_ptr<Allocator>(new BestFitAllocator(cuda_allocation.get())));
platform::CUDAPlace gpu(0);
platform::CUDADeviceContext dev_ctx(gpu);
auto th_main = [&](std::random_device::result_type seed) {
std::default_random_engine engine(seed);
std::uniform_int_distribution<size_t> dist(1U, 1024U);
platform::CUDAPlace gpu(0);
platform::CUDADeviceContext dev_ctx(gpu);
std::array<size_t, 1024> buf;
for (size_t i = 0; i < 128; ++i) {
size_t allocate_size = dist(engine);
......
......@@ -110,10 +110,12 @@ struct VisitDataArgMinMaxFunctor {
CALL_ARG_MINMAX_FUNCTOR(6);
break;
default:
PADDLE_THROW(
PADDLE_ENFORCE_LE(
x_dims.size(), 6,
platform::errors::InvalidArgument(
"%s operator doesn't supports tensors whose ranks are greater "
"than 6.",
(EnumArgMinMaxValue == kArgMin ? "argmin" : "argmax"));
(EnumArgMinMaxValue == kArgMin ? "argmin" : "argmax")));
break;
#undef CALL_ARG_MINMAX_FUNCTOR
}
......@@ -164,7 +166,8 @@ class ArgMinMaxOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_LT(
axis, x_dims.size(),
platform::errors::InvalidArgument(
"'axis'(%d) must be less than Rank(X)(%d).", axis, x_dims.size()));
"'axis'(%d) must be less than Rank(X)(%d) of Input(X).", axis,
x_dims.size()));
const int& dtype = ctx->Attrs().Get<int>("dtype");
PADDLE_ENFORCE_EQ(
......@@ -192,10 +195,11 @@ class ArgMinMaxOp : public framework::OperatorWithKernel {
}
PADDLE_ENFORCE_LE(
all_element_num, INT_MAX,
platform::errors::InvalidArgument(
"The element num of the argmin/argmax input at axis is "
"%d, is larger than int32 maximum value:%d, you must "
"set the dtype of argmin/argmax to 'int64'.",
all_element_num, INT_MAX);
all_element_num, INT_MAX));
}
}
std::vector<int64_t> vec;
......
......@@ -52,7 +52,10 @@ class AssignFunctor {
template <typename T>
void operator()(const T &v) const {
PADDLE_THROW("Not support type for assign op %s", typeid(T).name());
PADDLE_ENFORCE_EQ(
true, false,
platform::errors::PermissionDenied(
"Not support type for assign op with type %s", typeid(T).name()));
}
private:
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/dynload/cudnn.h"
namespace paddle {
namespace operators {
class ScopedRNNBase {
public:
ScopedRNNBase(int seq_length, int batch_size, int input_size, int hidden_size,
int num_layers, float dropout_prob, int seed, int weight_numel,
bool initialized, bool is_bidirec)
: seq_length_(seq_length),
batch_size_(batch_size),
input_size_(input_size),
hidden_size_(hidden_size),
num_layers_(num_layers),
dropout_prob_(dropout_prob),
seed_(seed),
weight_numel_(weight_numel),
initialized_(initialized),
is_bidirec_(is_bidirec) {}
template <typename T>
void Create(const cudnnHandle_t& handle, const platform::Place& place,
const std::vector<int>& sequence_length, size_t* workspace_size,
size_t* reserve_size, framework::Tensor* dropout_state) {
int numDirections = is_bidirec_ ? 2 : 1;
cudnnDataType_t cudnn_type = platform::CudnnDataType<T>::type;
// ------------------- cudnn x, y descriptors ---------------------
std::vector<int> dims_x = {batch_size_, input_size_, 1};
std::vector<int> strides_x = {input_size_, 1, 1};
std::vector<int> dims_y = {batch_size_, hidden_size_ * numDirections, 1};
std::vector<int> strides_y = {hidden_size_ * numDirections, 1, 1};
for (int i = 0; i < seq_length_; ++i) {
x_descs_.emplace_back(x_desc_.descriptor<T>(dims_x, strides_x));
y_descs_.emplace_back(y_desc_.descriptor<T>(dims_y, strides_y));
}
if (!sequence_length.empty()) {
x_seq_desc_.descriptor<T>(seq_length_, batch_size_, input_size_, true,
sequence_length);
y_seq_desc_.descriptor<T>(seq_length_, batch_size_,
hidden_size_ * numDirections, true,
sequence_length);
}
// ------------------- cudnn hx, hy, cx, cy descriptors----------
std::vector<int> dims_hx = {num_layers_ * numDirections, batch_size_,
hidden_size_};
std::vector<int> strides_hx = {hidden_size_ * batch_size_, hidden_size_, 1};
init_h_desc_.descriptor<T>(dims_hx, strides_hx);
init_c_desc_.descriptor<T>(dims_hx, strides_hx);
last_h_desc_.descriptor<T>(dims_hx, strides_hx);
last_c_desc_.descriptor<T>(dims_hx, strides_hx);
// ------------------- cudnn dropout descriptors ---------------------
size_t state_size;
if (!initialized_) {
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size));
dropout_state->mutable_data<uint8_t>({static_cast<int64_t>(state_size)},
place);
}
dropout_desc_.descriptor(handle, place, initialized_, dropout_prob_,
dropout_state, seed_, state_size);
// ------------------- cudnn rnn descriptors ---------------------
#if CUDNN_VERSION >= 6000
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6(
handle, rnn_desc_.desc(), hidden_size_, num_layers_,
dropout_desc_.desc(), CUDNN_LINEAR_INPUT,
is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
CUDNN_RNN_ALGO_STANDARD, cudnn_type));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor(
rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(),
CUDNN_LINEAR_INPUT,
is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
cudnn_type));
#endif
if (!sequence_length.empty()) {
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNPaddingMode(
rnn_desc_.desc(), CUDNN_RNN_PADDED_IO_ENABLED));
}
// ------------------- cudnn weights_size ---------------------
size_t weights_size_;
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNParamsSize(
handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, cudnn_type));
PADDLE_ENFORCE_EQ(
weights_size_, sizeof(T) * weight_numel_,
platform::errors::InvalidArgument(
"The cudnn lstm and setting weight size should be same."));
// ------------------- cudnn weight descriptors ---------------------
platform::DataLayout layout = platform::DataLayout::kNCHW;
int dim_tmp = weights_size_ / sizeof(T);
std::vector<int> dim_w = {dim_tmp, 1, 1};
weight_desc_.descriptor<T>(layout, dim_w);
// ------------------- cudnn workspace, reserve size ---------------------
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize(
handle, rnn_desc_.desc(), seq_length_, x_descs_.data(),
workspace_size));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnGetRNNTrainingReserveSize(
handle, rnn_desc_.desc(), seq_length_, x_descs_.data(),
reserve_size));
}
cudnnTensorDescriptor_t* x_descs() { return x_descs_.data(); }
cudnnTensorDescriptor_t* y_descs() { return y_descs_.data(); }
cudnnRNNDataDescriptor_t x_seq_desc() { return x_seq_desc_.desc(); }
cudnnRNNDataDescriptor_t y_seq_desc() { return y_seq_desc_.desc(); }
cudnnTensorDescriptor_t init_h_desc() { return init_h_desc_.desc(); }
cudnnTensorDescriptor_t init_c_desc() { return init_c_desc_.desc(); }
cudnnTensorDescriptor_t last_h_desc() { return last_h_desc_.desc(); }
cudnnTensorDescriptor_t last_c_desc() { return last_c_desc_.desc(); }
cudnnRNNDescriptor_t rnn_desc() { return rnn_desc_.desc(); }
cudnnDropoutDescriptor_t dropout_desc() { return dropout_desc_.desc(); }
cudnnFilterDescriptor_t weight_desc() { return weight_desc_.desc(); }
private:
int seq_length_;
int batch_size_;
int input_size_;
int hidden_size_;
int num_layers_;
float dropout_prob_;
int seed_;
int weight_numel_;
bool initialized_;
bool is_bidirec_;
std::vector<cudnnTensorDescriptor_t> x_descs_;
std::vector<cudnnTensorDescriptor_t> y_descs_;
platform::ScopedTensorDescriptor x_desc_;
platform::ScopedTensorDescriptor y_desc_;
platform::ScopedRNNTensorDescriptor x_seq_desc_;
platform::ScopedRNNTensorDescriptor y_seq_desc_;
platform::ScopedTensorDescriptor init_h_desc_;
platform::ScopedTensorDescriptor init_c_desc_;
platform::ScopedTensorDescriptor last_h_desc_;
platform::ScopedTensorDescriptor last_c_desc_;
platform::ScopedDropoutDescriptor dropout_desc_;
platform::ScopedFilterDescriptor weight_desc_;
platform::ScopedRNNDescriptor rnn_desc_;
};
} // namespace operators
} // namespace paddle
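The kernels later in this commit drive ScopedRNNBase in two steps: construct it with the shapes and hyper-parameters, then call Create<T> once to build every descriptor and query the workspace/reserve sizes. A condensed sketch of that sequence, mirroring the kernel code below with placeholder values (handle, place, and dropout_state are assumed to exist):

size_t workspace_size = 0;
size_t reserve_size = 0;
ScopedRNNBase rnn(seq_length, batch_size, input_size, hidden_size, num_layers,
                  dropout_prob, seed, weight_numel, /*initialized=*/false,
                  is_bidirec);
rnn.Create<float>(handle, place, /*sequence_length=*/{}, &workspace_size,
                  &reserve_size, &dropout_state);
// workspace_size / reserve_size are now sized for the cudnnRNN* calls.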
......@@ -51,6 +51,16 @@ class CudnnLSTMOp : public framework::OperatorWithKernel {
"received InitH's rank is %d.",
init_h_dims.size()));
if (ctx->HasInput("SequenceLength")) {
auto seq_dims = ctx->GetInputDim("SequenceLength");
PADDLE_ENFORCE_EQ(
in_dims[1], seq_dims[0],
platform::errors::InvalidArgument(
"The size of SequenceLength has to equal the batch_size. But "
"received batch_size is %d and the size of SequenceLength is %d.",
in_dims[1], seq_dims[0]));
}
PADDLE_ENFORCE_EQ(
in_dims[1], init_h_dims[1],
platform::errors::InvalidArgument(
......@@ -113,6 +123,12 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
"(Tensor) the learnable hidden-hidden weights."
" The shape is (N), where N is total weight size of the LSTM. "
" cudnn concatenate all the weight to one Tensor");
AddInput("SequenceLength",
"(Tensor) When the input data is padding, "
"set this parameter. This parameter represents "
"the variable sequence lengths in a batch. "
"The size of the vector has to equal the batch_size.")
.AsDispensable();
AddOutput("Reserve",
"(Tensor, a temporary output Tensor to store the reserve_data "
"of cudnn kernel.")
......@@ -155,13 +171,6 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
.SetDefault(1);
AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
AddAttr<int>("seed", "seed to used if fix_seed is True").SetDefault(0);
AddAttr<std::vector<int>>("sequence_length",
"(vector<int>) When the input data is padding, "
"set this parameter. This parameter represents "
"the variable sequence"
"lengths in a batch. The size of the vector has "
"to equal the batch_size.")
.SetDefault({});
AddComment(R"DOC(
CUDNN LSTM implementation
......@@ -243,6 +252,9 @@ class CudnnLSTMGradOpMaker : public framework::SingleGradOpMaker<T> {
op->SetInput("InitH", this->Input("InitH"));
op->SetInput("InitC", this->Input("InitC"));
op->SetInput("W", this->Input("W"));
if (this->HasInput("SequenceLength")) {
op->SetInput("SequenceLength", this->Input("SequenceLength"));
}
op->SetInput("Reserve", this->Output("Reserve"));
op->SetInput("StateOut", this->Output("StateOut"));
op->SetInput("Out", this->Output("Out"));
......
......@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/cudnn_rnn_cache.h"
#include "paddle/fluid/operators/cudnn_lstm_cache.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/fluid/platform/cudnn_desc.h"
#include "paddle/fluid/platform/cudnn_helper.h"
......@@ -24,6 +25,43 @@ namespace operators {
using LoDTensor = framework::LoDTensor;
using Tensor = framework::Tensor;
template <typename T>
void LSTMInferece(const bool &has_seq_length, const cudnnHandle_t &handle,
const int &seq_length, ScopedRNNBase *rnn, const T *x_data,
const T *init_h_data, const T *init_c_data, const T *w_data,
T *out_data, T *last_h_data, T *last_c_data,
framework::Tensor *workspace_data,
const size_t &workspace_size) {
if (!has_seq_length) {
// for inference
// This interface is used when the input/output is unpadded.
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInference(
handle, rnn->rnn_desc(), seq_length, rnn->x_descs(), x_data,
rnn->init_h_desc(), init_h_data, rnn->init_c_desc(), init_c_data,
rnn->weight_desc(), w_data, rnn->y_descs(), out_data,
rnn->last_h_desc(), last_h_data, rnn->last_c_desc(), last_c_data,
workspace_data->data<uint8_t>(), workspace_size));
} else {
#if CUDNN_VERSION >= 7201
// for inference
// This interface is used when the input/output is padded.
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInferenceEx(
handle, rnn->rnn_desc(), rnn->x_seq_desc(), x_data, rnn->init_h_desc(),
init_h_data, rnn->init_c_desc(), init_c_data, rnn->weight_desc(),
w_data, rnn->y_seq_desc(), out_data, rnn->last_h_desc(), last_h_data,
rnn->last_c_desc(), last_c_data, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, workspace_data->data<uint8_t>(),
workspace_size));
#else
    // CUDNN version has to be >= 7.2.1
PADDLE_THROW(platform::errors::Unavailable(
"The padded input is supported by "
"cudnnRNNForwardInferenceEx, but it only works when "
"the version of cudnn is larger than 7.2.1"));
#endif
}
}
template <typename T>
class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
public:
......@@ -56,7 +94,13 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
int num_layers = ctx.Attr<int>("num_layers");
bool is_test = ctx.Attr<bool>("is_test");
int seed = ctx.Attr<int>("seed");
auto sequence_length = ctx.Attr<std::vector<int>>("sequence_length");
bool has_seq_length = ctx.HasInput("SequenceLength");
std::vector<int> SequenceLength;
if (has_seq_length) {
auto *sequence_length = ctx.Input<Tensor>("SequenceLength");
SequenceLength = operators::GetDataFromTensor<int>(sequence_length);
}
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
......@@ -70,58 +114,32 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
size_t workspace_size;
size_t reserve_size;
platform::ScopedRNNBase rnn(seq_length, batch_size, input_size, hidden_size,
ScopedRNNBase rnn(seq_length, batch_size, input_size, hidden_size,
num_layers, dropout_prob, seed, weight_numel,
state_initialized, is_bidirec);
rnn.Create<T>(handle, ctx.GetPlace(), sequence_length, &workspace_size,
rnn.Create<T>(handle, ctx.GetPlace(), SequenceLength, &workspace_size,
&reserve_size, state_out);
framework::Tensor workspace_data_;
workspace_data_.Resize({static_cast<int64_t>(workspace_size)});
workspace_data_.mutable_data<uint8_t>(ctx.GetPlace());
workspace_data_.mutable_data<uint8_t>(
{static_cast<int64_t>(workspace_size)}, ctx.GetPlace());
auto *reserve_data = reserve->mutable_data<uint8_t>(
{static_cast<int64_t>(reserve_size)}, ctx.GetPlace());
if (is_test) {
if (sequence_length.empty()) {
// for inference
// This interface is used when the input/output is unpadded.
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInference(
handle, rnn.rnn_desc(), seq_length, rnn.x_desc(), x_data,
rnn.hx_desc(), init_h_data, rnn.cx_desc(), init_c_data,
rnn.w_desc(), w_data, rnn.y_desc(), out_data, rnn.hy_desc(),
last_h_data, rnn.cy_desc(), last_c_data,
workspace_data_.data<uint8_t>(), workspace_size));
LSTMInferece<T>(has_seq_length, handle, seq_length, &rnn, x_data,
init_h_data, init_c_data, w_data, out_data, last_h_data,
last_c_data, &workspace_data_, workspace_size);
} else {
#if CUDNN_VERSION >= 7201
// for inference
// This interface is used when the input/output is padded.
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnRNNForwardInferenceEx(
handle, rnn.rnn_desc(), rnn.x_seq_desc(), x_data, rnn.hx_desc(),
init_h_data, rnn.cx_desc(), init_c_data, rnn.w_desc(), w_data,
rnn.y_seq_desc(), out_data, rnn.hy_desc(), last_h_data,
rnn.cy_desc(), last_c_data, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr,
workspace_data_.data<uint8_t>(), workspace_size));
#else
PADDLE_ENFORCE_NOT_NULL(
nullptr, platform::errors::Unavailable(
"The padded input is supported by "
"cudnnRNNForwardInferenceEx, but it only works when "
"the version of cudnn is larger than 7.2.1"));
#endif
}
} else {
if (sequence_length.empty()) {
if (!has_seq_length) {
// for train
// This interface is used when the input/output is unpadded.
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardTraining(
handle, rnn.rnn_desc(), seq_length, rnn.x_desc(), x_data,
rnn.hx_desc(), init_h_data, rnn.cx_desc(), init_c_data,
rnn.w_desc(), w_data, rnn.y_desc(), out_data, rnn.hy_desc(),
last_h_data, rnn.cy_desc(), last_c_data,
handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), x_data,
rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data,
rnn.weight_desc(), w_data, rnn.y_descs(), out_data,
rnn.last_h_desc(), last_h_data, rnn.last_c_desc(), last_c_data,
workspace_data_.data<uint8_t>(), workspace_size, reserve_data,
reserve_size));
} else {
......@@ -130,16 +148,15 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
// This interface is used when the input/output is padded.
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnRNNForwardTrainingEx(
handle, rnn.rnn_desc(), rnn.x_seq_desc(), x_data, rnn.hx_desc(),
init_h_data, rnn.cx_desc(), init_c_data, rnn.w_desc(), w_data,
rnn.y_seq_desc(), out_data, rnn.hy_desc(), last_h_data,
rnn.cy_desc(), last_c_data, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr,
workspace_data_.data<uint8_t>(), workspace_size, reserve_data,
reserve_size));
handle, rnn.rnn_desc(), rnn.x_seq_desc(), x_data,
rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data,
rnn.weight_desc(), w_data, rnn.y_seq_desc(), out_data,
rnn.last_h_desc(), last_h_data, rnn.last_c_desc(), last_c_data,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, workspace_data_.data<uint8_t>(), workspace_size,
reserve_data, reserve_size));
#else
PADDLE_ENFORCE_NOT_NULL(
nullptr, platform::errors::Unavailable(
PADDLE_THROW(platform::errors::Unavailable(
"The padded input is supported by "
"cudnnRNNForwardTrainingEx, but it only works when "
"the version of cudnn is larger than 7.2.1"));
......@@ -203,7 +220,13 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
int hidden_size = ctx.Attr<int>("hidden_size");
int num_layers = ctx.Attr<int>("num_layers");
int seed = ctx.Attr<int>("seed");
auto sequence_length = ctx.Attr<std::vector<int>>("sequence_length");
bool has_seq_length = ctx.HasInput("SequenceLength");
std::vector<int> SequenceLength;
if (has_seq_length) {
auto *sequence_length = ctx.Input<Tensor>("SequenceLength");
SequenceLength = operators::GetDataFromTensor<int>(sequence_length);
}
int seq_length = input_dims[0];
int batch_size = input->dims()[1];
......@@ -213,33 +236,33 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
size_t workspace_size;
size_t reserve_size;
platform::ScopedRNNBase rnn(seq_length, batch_size, input_size, hidden_size,
num_layers, dropout_prob, seed, weight_numel,
true, is_bidirec);
ScopedRNNBase rnn(seq_length, batch_size, input_size, hidden_size,
num_layers, dropout_prob, seed, weight_numel, true,
is_bidirec);
rnn.Create<T>(handle, ctx.GetPlace(), sequence_length, &workspace_size,
rnn.Create<T>(handle, ctx.GetPlace(), SequenceLength, &workspace_size,
&reserve_size, const_cast<Tensor *>(state_out));
framework::Tensor workspace_data_;
workspace_data_.Resize({static_cast<int64_t>(workspace_size)});
workspace_data_.mutable_data<uint8_t>(ctx.GetPlace());
workspace_data_.mutable_data<uint8_t>(
{static_cast<int64_t>(workspace_size)}, ctx.GetPlace());
const uint8_t *reserve_data = reserve->data<uint8_t>();
if (sequence_length.empty()) {
if (!has_seq_length) {
// This interface is used when the input/output is unpadded.
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardData(
handle, rnn.rnn_desc(), seq_length, rnn.y_desc(), out_data,
rnn.y_desc(), out_grad_data, rnn.hy_desc(), last_h_grad_data,
rnn.cy_desc(), last_c_grad_data, rnn.w_desc(), weight_data,
rnn.hx_desc(), init_h_data, rnn.cx_desc(), init_c_data, rnn.x_desc(),
in_grad_data, rnn.hx_desc(), init_h_grad_data, rnn.cx_desc(),
init_c_grad_data, workspace_data_.data<uint8_t>(), workspace_size,
const_cast<uint8_t *>(reserve_data), reserve_size));
handle, rnn.rnn_desc(), seq_length, rnn.y_descs(), out_data,
rnn.y_descs(), out_grad_data, rnn.last_h_desc(), last_h_grad_data,
rnn.last_c_desc(), last_c_grad_data, rnn.weight_desc(), weight_data,
rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data,
rnn.x_descs(), in_grad_data, rnn.init_h_desc(), init_h_grad_data,
rnn.init_c_desc(), init_c_grad_data, workspace_data_.data<uint8_t>(),
workspace_size, const_cast<uint8_t *>(reserve_data), reserve_size));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeights(
handle, rnn.rnn_desc(), seq_length, rnn.x_desc(), input->data<T>(),
rnn.hx_desc(), init_h->data<T>(), rnn.y_desc(), out->data<T>(),
workspace_data_.data<uint8_t>(), workspace_size, rnn.w_desc(),
handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), input->data<T>(),
rnn.init_h_desc(), init_h->data<T>(), rnn.y_descs(), out->data<T>(),
workspace_data_.data<uint8_t>(), workspace_size, rnn.weight_desc(),
weight_grad->data<T>(), const_cast<uint8_t *>(reserve_data),
reserve_size));
} else {
......@@ -248,24 +271,22 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
// This interface is used when the input/output is padded.
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardDataEx(
handle, rnn.rnn_desc(), rnn.y_seq_desc(), out_data, rnn.y_seq_desc(),
out_grad_data, nullptr, nullptr, rnn.hy_desc(), last_h_grad_data,
rnn.cy_desc(), last_c_grad_data, rnn.w_desc(), weight_data,
rnn.hx_desc(), init_h_data, rnn.cx_desc(), init_c_data,
rnn.x_seq_desc(), in_grad_data, rnn.hx_desc(), init_h_grad_data,
rnn.cx_desc(), init_c_grad_data, nullptr, nullptr,
out_grad_data, nullptr, nullptr, rnn.last_h_desc(), last_h_grad_data,
rnn.last_c_desc(), last_c_grad_data, rnn.weight_desc(), weight_data,
rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data,
rnn.x_seq_desc(), in_grad_data, rnn.init_h_desc(), init_h_grad_data,
rnn.init_c_desc(), init_c_grad_data, nullptr, nullptr,
workspace_data_.data<uint8_t>(), workspace_size,
const_cast<uint8_t *>(reserve_data), reserve_size));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeightsEx(
handle, rnn.rnn_desc(), rnn.x_seq_desc(), input->data<T>(),
rnn.hx_desc(), init_h->data<T>(), rnn.y_seq_desc(), out->data<T>(),
workspace_data_.data<uint8_t>(), workspace_size, rnn.w_desc(),
weight_grad->data<T>(), const_cast<uint8_t *>(reserve_data),
reserve_size));
rnn.init_h_desc(), init_h->data<T>(), rnn.y_seq_desc(),
out->data<T>(), workspace_data_.data<uint8_t>(), workspace_size,
rnn.weight_desc(), weight_grad->data<T>(),
const_cast<uint8_t *>(reserve_data), reserve_size));
#else
PADDLE_ENFORCE_NOT_NULL(
nullptr,
platform::errors::Unavailable(
PADDLE_THROW(platform::errors::Unavailable(
"The padded input of rnn is supported by cudnnRNNBackwardDataEx, "
"cudnnRNNBackwardWeightsEx, but it only works when the version "
"of cudnn is larger than 7.2.1"));
......
......@@ -76,7 +76,8 @@ class AllReduceOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
}
#else
PADDLE_THROW("PaddlePaddle should compile with GPU.");
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with GPU."));
#endif
}
};
......
......@@ -58,7 +58,8 @@ template <typename T>
class BroadcastOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_THROW("Broadcast op can run on gpu place only for now.");
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Broadcast op can run on gpu place only for now."));
}
};
......
......@@ -68,10 +68,11 @@ class NCCLBroadcastOpKernel : public framework::OpKernel<T> {
<< " From " << root_dev_id << " to " << dev_id;
if (ctx.Attr<bool>("sync_mode")) {
PADDLE_ENFORCE(cudaStreamSynchronize(stream));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
}
#else
PADDLE_THROW("PaddlePaddle should compile with GPU.");
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with GPU."));
#endif
}
};
......
......@@ -33,9 +33,12 @@ namespace operators {
static void Memcpy(void *dst, const void *src, size_t n, bool copy_to_gpu) {
if (copy_to_gpu) {
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE(cudaMemcpy(dst, src, n, cudaMemcpyHostToDevice));
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpy(dst, src, n, cudaMemcpyHostToDevice));
#else
PADDLE_THROW("Not compiled with cuda");
PADDLE_THROW(
platform::errors::InvalidArgument("Check your paddle version, current "
"version is not compiled with cuda"));
#endif
} else {
std::memcpy(dst, src, n);
......@@ -88,11 +91,22 @@ bool TestMain(const platform::Place &place, const framework::DDim &dims,
framework::LoDTensor cpu_out;
auto &out_tensor = scope.FindVar(out_name)->Get<framework::LoDTensor>();
PADDLE_ENFORCE(scope.kids().empty());
PADDLE_ENFORCE_EQ(scope.kids().empty(), true,
platform::errors::InvalidArgument(
"The scope can not have the child scopes,"
"please check your code."));
if (inplace) {
PADDLE_ENFORCE_EQ(&out_tensor, x);
PADDLE_ENFORCE_EQ(
&out_tensor, x,
platform::errors::InvalidArgument(
"The output tensor should be same as input x in inplace mode,"
" but now is not same."));
} else {
PADDLE_ENFORCE_EQ(&out_tensor, z);
PADDLE_ENFORCE_EQ(
&out_tensor, z,
platform::errors::InvalidArgument(
"The output tensor should be same as output z in normal mode,"
" but now is not same."));
}
if (is_gpu_place) {
......
......@@ -92,7 +92,9 @@ class TestElementwiseOpGradGrad {
auto dst_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
memory::Copy(dst_place, dst, src_place, src, bytes, nullptr);
#else
PADDLE_THROW("Not compiled with cuda");
PADDLE_THROW(platform::errors::InvalidArgument(
"Check your paddle version, current version is not compiled with "
"cuda"));
#endif
}
}
......@@ -107,7 +109,10 @@ class TestElementwiseOpGradGrad {
op->Run(scope_, place_);
platform::DeviceContextPool::Instance().Get(place_)->Wait();
framework::LoDTensor cpu_out;
PADDLE_ENFORCE_EQ(scope_.kids().empty(), true, "scope has child scopes");
PADDLE_ENFORCE_EQ(scope_.kids().empty(), true,
platform::errors::InvalidArgument(
"The scope can not have the child scopes,"
"please check your code."));
// get outputs from scope and compare them with expected_outs
bool all_equal = true;
......
......@@ -37,8 +37,21 @@ class GatherOp : public framework::OperatorWithKernel {
"Output(Out) of GatherOp should not be null."));
auto index_dims = ctx->GetInputDim("Index");
PADDLE_ENFORCE(index_dims.size() == 1 ||
(index_dims.size() == 2 && index_dims[1] == 1));
if (index_dims.size() == 2) {
PADDLE_ENFORCE_EQ(
index_dims[1], 1,
platform::errors::InvalidArgument(
"The last dim of index should be 1 when it is 2D, but we get %d",
index_dims[1]));
} else {
PADDLE_ENFORCE_EQ(
index_dims.size(), 1,
platform::errors::InvalidArgument(
"The index should be 1D, when it is not 2D, but we get %d",
index_dims.size()));
}
int batch_size = ctx->GetInputDim("Index")[0];
framework::DDim output_dims(ctx->GetInputDim("X"));
output_dims[0] = batch_size;
......
......@@ -43,7 +43,11 @@ class OverflowOp : public framework::OperatorWithKernel {
} else if (x_var->IsType<framework::SelectedRows>()) {
dtype = x_var->Get<framework::SelectedRows>().value().type();
} else {
PADDLE_THROW("Cannot find the input data type by all input data");
PADDLE_ENFORCE_EQ(
true, false,
platform::errors::InvalidArgument(
"The input type mismatch, the type of Input(X) must be Tensor or "
"SelectedRows, please check your input."));
}
return framework::OpKernelType(framework::proto::VarType::Type(dtype),
ctx.GetPlace());
......
......@@ -57,7 +57,11 @@ class OverflowKernel : public framework::OpKernel<T> {
auto& in = ctx.Input<framework::SelectedRows>("X")->value();
functor(in, out);
} else {
PADDLE_THROW("Unsupported input type.");
PADDLE_ENFORCE_EQ(
true, false,
platform::errors::InvalidArgument(
"The input type mismatch, the type of Input(X) must be Tensor or "
"SelectedRows, please check your input."));
}
}
};
......
......@@ -22,8 +22,6 @@ class LinspaceOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Start"),
"Input(Start) of LinspaceOp should not be null.");
OP_INOUT_CHECK(ctx->HasInput("Start"), "Input", "Start", "linspace");
OP_INOUT_CHECK(ctx->HasInput("Stop"), "Input", "Stop", "linspace");
OP_INOUT_CHECK(ctx->HasInput("Num"), "Input", "Num", "linspace");
......
......@@ -63,7 +63,10 @@ class CUDALinspaceKernel : public framework::OpKernel<T> {
framework::TensorCopy(*num_t, platform::CPUPlace(), &n);
int32_t num = n.data<int32_t>()[0];
PADDLE_ENFORCE(num > 0, "The num of linspace op should be larger than 0.");
PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument(
"The num of linspace op should be larger "
"than 0, but received num is %d",
num));
out->Resize(framework::make_ddim({num}));
T* out_data = out->mutable_data<T>(context.GetPlace());
......
......@@ -46,7 +46,10 @@ class CPULinspaceKernel : public framework::OpKernel<T> {
T start = start_t.data<T>()[0];
T stop = stop_t.data<T>()[0];
PADDLE_ENFORCE(num > 0, "The num of linspace op should be larger than 0.");
PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument(
"The num of linspace op should be larger "
"than 0, but received num is %d",
num));
out->Resize(framework::make_ddim({num}));
......
......@@ -48,6 +48,7 @@ class QuantOpKernel : public framework::OpKernel<T> {
const T* input_data = input->data<T>();
bool is_negative = ctx.Attr<bool>("is_negative_input");
bool bfloat16 = ctx.Attr<bool>("bfloat16");
std::string key =
platform::CreateKey(platform::ThreadIDasStr(), src_tz, scale_data,
is_negative, ctx.OutputName("Output"));
......@@ -74,7 +75,10 @@ class QuantOpKernel : public framework::OpKernel<T> {
src_md, engine, to_void_cast<T>(input_data));
std::shared_ptr<mkldnn::memory::desc> dst_md;
if (is_negative) {
if (bfloat16) {
platform::SetDstMemoryQuantized<paddle::platform::bfloat16>(
ctx, output, dst_tz, engine, dst_md, dst_memory, out_format);
} else if (is_negative) {
platform::SetDstMemoryQuantized<int8_t>(ctx, output, dst_tz, engine,
dst_md, dst_memory, out_format);
} else {
......@@ -96,7 +100,11 @@ class QuantOpKernel : public framework::OpKernel<T> {
dst_memory = std::static_pointer_cast<mkldnn::memory>(
dev_ctx.GetBlob(key_dst_mem));
auto place = ctx.GetPlace();
if (is_negative) {
if (bfloat16) {
dst_memory->set_data_handle(
output->mutable_data<paddle::platform::bfloat16>(place));
} else if (is_negative) {
dst_memory->set_data_handle(output->mutable_data<int8_t>(place));
} else {
dst_memory->set_data_handle(output->mutable_data<uint8_t>(place));
......
......@@ -40,6 +40,8 @@ void QuantOpMaker::Make() {
AddAttr<std::string>("output_format",
"Convert format to NHWC or NCHW during quantization.")
.SetDefault("NHWC");
AddAttr<bool>("bfloat16", "(bool, default false) Convert to bfloat16")
.SetDefault(false);
AddComment(R"DOC(This op will quantize data from FP32 to INT8)DOC");
}
......
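The new bfloat16 attribute defaults to false, so existing INT8 paths are untouched; a pass that wants bfloat16 output from a quantize op would set the flag on the op's proto. A hypothetical OpDesc-based sketch, not taken from this commit:

framework::OpDesc quant_op;
quant_op.SetType("quantize");
quant_op.SetAttr("bfloat16", true);
quant_op.SetAttr("is_negative_input", false);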
......@@ -60,7 +60,10 @@ class ScaleKernel : public framework::OpKernel<T> {
out->mutable_data<T>(in->place());
PADDLE_ENFORCE_EQ(in->dims(), out->dims(),
"in and out should have the same dim");
paddle::platform::errors::InvalidArgument(
"the input and output should have the same dim"
"but input dim is %s, output dim is %s",
in->dims(), out->dims()));
auto eigen_out = framework::EigenVector<T>::Flatten(*out);
auto eigen_in = framework::EigenVector<T>::Flatten(*in);
......
......@@ -186,10 +186,17 @@ class SumOp : public framework::OperatorWithKernel {
}
}
}
PADDLE_THROW("Cannot find the input data type by all input data");
  PADDLE_THROW(platform::errors::InvalidArgument(
      "Expected each tensor in Input(x) of the sum op to be initialized, "
      "but some tensor in Input(x) is not initialized, please check your "
      "code."));
}
PADDLE_THROW("Unexpected branch. Input type is %s",
framework::ToTypeName(x_vars[0]->Type()));
PADDLE_THROW(platform::errors::InvalidArgument(
"Expected type of Input(X) must be Tensor, SelectedRows or "
"LodTensorArray. But got "
"unsupport type: %s.",
framework::ToTypeName(x_vars[0]->Type())));
}
};
......
......@@ -169,8 +169,18 @@ void SumToLoDTensor(const framework::ExecutionContext &context) {
auto row_numel = sr_value.numel() / sr_rows.size();
auto out_dims = out->dims();
PADDLE_ENFORCE_EQ(sr.height(), out_dims[0]);
PADDLE_ENFORCE_EQ(row_numel, out->numel() / sr.height());
PADDLE_ENFORCE_EQ(sr.height(), out_dims[0],
platform::errors::InvalidArgument(
"The table height of input must be same as output, "
"but received input height is %d"
", output height is %d",
sr.height(), out_dims[0]));
PADDLE_ENFORCE_EQ(row_numel, out->numel() / sr.height(),
platform::errors::InvalidArgument(
"The table width of input must be same as output, "
"but received input width is %d"
", output width is %d",
row_numel, out->numel() / sr.height()));
auto *sr_data = sr_value.data<T>();
auto *sr_out_data = out->data<T>();
......@@ -231,8 +241,11 @@ class SumKernel<platform::CUDADeviceContext, T>
} else if (out_var->IsType<framework::LoDTensorArray>()) {
LodTensorArrayCompute<platform::CUDADeviceContext, T>(context);
} else {
PADDLE_THROW("Unexpected branch, output variable type is %s",
framework::ToTypeName(out_var->Type()));
PADDLE_THROW(platform::errors::InvalidArgument(
"Expected type of Ouput(out) must be Tensor, SelectedRows or "
"LodTensorArray. But got "
"unsupport type: %s.",
framework::ToTypeName(out_var->Type())));
}
}
};
......
......@@ -182,7 +182,11 @@ class SumKernel : public framework::OpKernel<T> {
auto &in_t = in_vars[i]->Get<framework::SelectedRows>();
functor(context.template device_context<DeviceContext>(), in_t, out);
} else {
PADDLE_THROW("Variable type must be LoDTensor/SelectedRows.");
PADDLE_THROW(platform::errors::InvalidArgument(
"Expected type of Input(X) of %d-th must be Tensor, "
"SelectedRows. But got "
"unsupport type: %s.",
framework::ToTypeName(in_vars[i]->Type())));
}
}
} else if (out_var->IsType<framework::SelectedRows>()) {
......@@ -190,8 +194,11 @@ class SumKernel : public framework::OpKernel<T> {
} else if (out_var->IsType<framework::LoDTensorArray>()) {
LodTensorArrayCompute<DeviceContext, T>(context);
} else {
PADDLE_THROW("Unexpected branch, output variable type is %s",
framework::ToTypeName(out_var->Type()));
PADDLE_THROW(platform::errors::InvalidArgument(
"Expected type of Output(out) must be Tensor, SelectedRows, "
"LoDTensorArray. But got "
"unsupport type: %s.",
framework::ToTypeName(out_var->Type())));
}
}
};
......
......@@ -54,9 +54,11 @@ class CPUUniformRandomKernel : public framework::OpKernel<T> {
tensor = out_var->GetMutable<framework::LoDTensor>();
if (!new_shape.empty()) tensor->Resize(framework::make_ddim(new_shape));
} else {
PADDLE_THROW(
"uniform_random_op's output only"
"supports SelectedRows and LoDTensor");
PADDLE_THROW(platform::errors::InvalidArgument(
"Expected type of Output(out) in uniform_random_op must be Tensor, "
"SelectedRows. But got "
"unsupport type: %s.",
framework::ToTypeName(out_var->Type())));
}
T *data = tensor->mutable_data<T>(ctx.GetPlace());
......
(Diffs for 14 more files are collapsed and not shown.)