diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator.cc b/paddle/fluid/framework/ir/fusion_group/code_generator.cc
index b2a6aec0ad282d50c504f79067e4ceb4372d5a2a..c41a2ed83591459289647ddad59db1636aabd3f7 100644
--- a/paddle/fluid/framework/ir/fusion_group/code_generator.cc
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator.cc
@@ -36,6 +36,16 @@ std::string CodeGenerator::Generate(SubGraph* subgraph) {
   return Generate(subgraph->func_name, expressions);
 }
 
+static bool HasInput(Node* n, std::string name) {
+  PADDLE_ENFORCE_EQ(n && n->IsOp() && n->Op(), true,
+                    platform::errors::InvalidArgument(
+                        "Expected node %p to be an operator node.", n));
+  std::vector<std::string> input_names = n->Op()->InputNames();
+  std::unordered_set<std::string> input_names_set(input_names.begin(),
+                                                  input_names.end());
+  return input_names_set.find(name) != input_names_set.end();
+}
+
 std::vector<OperationExpression> CodeGenerator::ConvertToExpressions(
     SubGraph* subgraph) {
   std::unordered_map<std::string, int> var_ids = EncodeVarNodes(subgraph);
@@ -45,19 +55,20 @@ std::vector<OperationExpression> CodeGenerator::ConvertToExpressions(
       auto* op = node->Op();
 
       // Input ids should be set in fixed order, like:
-      //  - x, y in forward operations
-      //  - x, y, out, out@GRAD in backward operations
+      //  - X, Y in forward operations
+      //  - X, Y, Out, out@GRAD in backward operations
       std::vector<int> input_ids;
       std::vector<std::string> input_names =
          OperationMap::Instance().Get(op->Type()).input_names;
      for (auto& name : input_names) {
-        // TODO(liuyiqun): support duplicated input.
-        if (op->Input(name).size() >= 1U) {
-          // Some input vars are not used in grad ops, such as
-          // "elementwise_add_grad", where "X", "Y" and "Out" are not used.
-          PADDLE_ENFORCE_NE(var_ids.find(op->Input(name)[0]), var_ids.end(),
-                            "Input(%s) of operation %s should be set.", name,
-                            op->Type());
+        // Some input vars are not used in grad ops, such as
+        // "elementwise_add_grad", where "X", "Y" and "Out" are not used.
+        if (HasInput(node, name) && op->Input(name).size() >= 1U) {
+          // TODO(liuyiqun): support duplicated input.
+          PADDLE_ENFORCE_NE(
+              var_ids.find(op->Input(name)[0]), var_ids.end(),
+              platform::errors::InvalidArgument(
+                  "Input(%s) of operation %s is not set.", name, op->Type()));
           input_ids.push_back(var_ids[op->Input(name)[0]]);
         } else {
           input_ids.push_back(-1);
@@ -69,12 +80,14 @@ std::vector<OperationExpression> CodeGenerator::ConvertToExpressions(
       std::vector<std::string> output_names =
           OperationMap::Instance().Get(op->Type()).output_names;
       for (auto& name : output_names) {
-        PADDLE_ENFORCE_EQ(op->Output(name).size(), 1U,
-                          "Output(%s) of operation %s should be set.", name,
-                          op->Type());
-        PADDLE_ENFORCE_NE(var_ids.find(op->Output(name)[0]), var_ids.end(),
-                          "Output(%s) of operation %s should be set.", name,
-                          op->Type());
+        PADDLE_ENFORCE_EQ(
+            op->Output(name).size(), 1U,
+            platform::errors::InvalidArgument(
+                "Output(%s) of operation %s is not set.", name, op->Type()));
+        PADDLE_ENFORCE_NE(
+            var_ids.find(op->Output(name)[0]), var_ids.end(),
+            platform::errors::InvalidArgument(
+                "Output(%s) of operation %s is not set.", name, op->Type()));
         output_ids.push_back(var_ids[op->Output(name)[0]]);
       }
       expressions.push_back(
@@ -218,8 +231,9 @@ std::unordered_map<std::string, int> CodeGenerator::EncodeVarNodes(
       }
       PADDLE_ENFORCE_EQ(
           is_found, true,
-          "Subgraph with internal var nodes (%s) is not supported yet.",
-          node->Name());
+          platform::errors::Unimplemented(
+              "Subgraph with internal var nodes (%s) is not supported yet.",
+              node->Name()));
     }
   }
   // Encoding output vars.
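[Review note] The new HasInput() guard matters because grad ops such as elementwise_add_grad may not declare "X", "Y" or "Out" as inputs at all, so calling op->Input(name) for those slots would fail before the size check ever runs. A minimal sketch of the resulting lookup pattern (illustrative only, not Paddle code; all names here are invented):

    #include <string>
    #include <unordered_map>
    #include <unordered_set>
    #include <vector>

    // Every expected input slot gets an id; slots the op does not declare are
    // filled with -1 so later code generation can skip them. (The real code
    // additionally enforces that declared inputs are present in var_ids.)
    std::vector<int> CollectInputIds(
        const std::vector<std::string>& expected_names,
        const std::unordered_set<std::string>& declared_names,
        const std::unordered_map<std::string, int>& var_ids) {
      std::vector<int> input_ids;
      for (const auto& name : expected_names) {
        auto iter =
            declared_names.count(name) ? var_ids.find(name) : var_ids.end();
        input_ids.push_back(iter == var_ids.end() ? -1 : iter->second);
      }
      return input_ids;
    }

    int main() {
      std::unordered_map<std::string, int> var_ids = {{"X", 0}, {"Out@GRAD", 3}};
      std::unordered_set<std::string> declared = {"X", "Out@GRAD"};
      // Expected order X, Y, Out, Out@GRAD -> ids {0, -1, -1, 3}
      std::vector<int> ids =
          CollectInputIds({"X", "Y", "Out", "Out@GRAD"}, declared, var_ids);
      return ids[1] == -1 ? 0 : 1;
    }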
diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc
index 38d5ebde87e3208ae3111dd591db9d5a76de4d84..be06a620f78ae31e6fd457f52e7141b50c1570d3 100644
--- a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc
@@ -45,11 +45,16 @@ std::string OperationExpression::GetRHS(std::unordered_set<int>* used,
     }
     std::string index_str = rhs.substr(pos + 2, length);
     int index = StringTo<int>(index_str);
-    PADDLE_ENFORCE_LT(index, input_ids_.size(),
-                      "Only %d inputs are provided, but need %d.",
-                      input_ids_.size(), index + 1);
-    PADDLE_ENFORCE_GE(input_ids_[index], 0,
-                      "Input id should be no less than 0.");
+    PADDLE_ENFORCE_LT(
+        index, input_ids_.size(),
+        platform::errors::InvalidArgument(
+            "Only %d inputs are provided, but need %d for operation < %s >.",
+            input_ids_.size(), index + 1, op_type_));
+    PADDLE_ENFORCE_GE(
+        input_ids_[index], 0,
+        platform::errors::InvalidArgument(
+            "Expected %d-th input id > 0 for operation < %s >. Received %d.",
+            index, op_type_, input_ids_[index]));
     rhs.replace(pos, length + 3, TmpName(input_ids_[index]));
     used->insert(input_ids_[index]);
   }
diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h
index ed4ce665d2b6ba43d17538c60e3517064e7430e4..140e0d3a06bb302f305409f2d2031a605a289c8f 100644
--- a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h
@@ -113,7 +113,8 @@ class CodeTemplate {
 
     for (auto iter : template_var.Get()) {
       PADDLE_ENFORCE_NE(found.find(iter.first), found.end(),
-                        "Keyword %s in template is not set.", iter.first);
+                        platform::errors::PreconditionNotMet(
+                            "Keyword %s in template is not set.", iter.first));
     }
 
     return EmitIndents(ret);
diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
index bc9f2408b67cd1b1d47470795a5cc82e025c46c5..9515237f964409adc083f83a03acde85983b32a4 100644
--- a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
@@ -35,7 +35,7 @@ namespace fusion_group {
 inline float relu(float x) { return x > 0 ? x : 0.; }
 
 inline float relu_grad_dx(float x, float out, float dout) {
-  return x > 0 ? dout : 0;
+  return out > 0 ? dout : 0;
 }
 
 // sigmoid
@@ -117,7 +117,7 @@ void CheckOutput(const std::vector<OperationExpression>& expressions,
           elementwise_mul(var[input_ids[0]], var[input_ids[1]]);
     } else if (op_type == "relu_grad") {
       var[output_ids[0]] =
-          relu_grad_dx(var[input_ids[0]], 0, var[input_ids[2]]);
+          relu_grad_dx(0, var[input_ids[1]], var[input_ids[2]]);
     } else if (op_type == "sigmoid_grad") {
       var[output_ids[0]] =
           sigmoid_grad_dx(0, var[input_ids[1]], var[input_ids[2]]);
@@ -138,8 +138,7 @@ void CheckOutput(const std::vector<OperationExpression>& expressions,
   for (auto id : output_ids_of_subgraph) {
     float actual = cpu_tensors[id].data<float>()[i];
     float expect = var[id];
-    PADDLE_ENFORCE_LT(fabs(actual - expect), 1.E-05,
-                      "Get %f vs %f (actual vs expect).", actual, expect);
+    EXPECT_LT(fabs(actual - expect), 1.E-05);
   }
 }
 
@@ -150,8 +149,7 @@ void SetupRandomCPUTensor(LoDTensor* tensor) {
   std::uniform_real_distribution<double> uniform_dist(0, 1);
 
   T* ptr = tensor->data<T>();
-  PADDLE_ENFORCE_NOT_NULL(
-      ptr, "Call mutable_data to alloc memory for Tensor first.");
+  EXPECT_NE(ptr, nullptr);
   for (int64_t i = 0; i < tensor->numel(); ++i) {
     ptr[i] = static_cast<T>(uniform_dist(rng)) - static_cast<T>(0.5);
   }
@@ -283,7 +281,7 @@ TEST(code_generator, elementwise_grad) {
   // t3 = relu(t2)
   // t2' = relu_grad(t2, t3, t3')
   // t0', t1' = elementwise_mul_grad(t0, t1, t2, t2')
-  fusion_group::OperationExpression exp1("relu_grad", {2, -1, 7}, {6});
+  fusion_group::OperationExpression exp1("relu_grad", {-1, 3, 7}, {6});
   fusion_group::OperationExpression exp2("elementwise_mul_grad", {0, 1, 2, 6},
                                          {4, 5});
   std::vector<fusion_group::OperationExpression> expressions = {exp1, exp2};
@@ -300,7 +298,7 @@ TEST(code_generator, elementwise_grad) {
   // Op(relu_grad), inputs:{2,3,7}, outputs:{6}
   // Op(elementwise_mul_grad), inputs:{0,1,2,6}, outputs:{4,5}
   int n = cpu_tensors[0].numel();
-  std::vector<int> input_ids = {0, 1, 2, -1, 7};
+  std::vector<int> input_ids = {0, 1, 2, 3, 7};
   std::vector<int> output_ids = {4, 5, 6};
   TestMain("elementwise_grad_kernel_0", expressions, cpu_tensors, n, input_ids,
            output_ids);
@@ -332,22 +330,25 @@ std::unique_ptr<Graph> BuildGraph(
   //                     tmp_2@GRAD(13), x2@GRAD(14), x0@GRAD(15),
   //                     x3@GRAD(16), x1@GRAD(17)
   paddle::framework::ir::Layers layers;
-  auto* x0 = layers.data("x0", {16, 32});
+  std::vector<int64_t> shape = {16, 32};
+  auto* x0 = layers.data("x0", shape);
   auto* tmp_0 = layers.sigmoid(x0);
-  tmp_0->SetShape({16, 32});
-  auto* x1 = layers.data("x1", {16, 32});
+  auto* x1 = layers.data("x1", shape);
   auto* tmp_1 = layers.elementwise_mul(tmp_0, x1);
-  tmp_1->SetShape({16, 32});
-  auto* x2 = layers.data("x2", {16, 32});
+  auto* x2 = layers.data("x2", shape);
   auto* tmp_2 = layers.tanh(x2);
-  tmp_2->SetShape({16, 32});
-  auto* x3 = layers.data("x3", {16, 32});
+  auto* x3 = layers.data("x3", shape);
   auto* tmp_3 = layers.elementwise_mul(x3, tmp_2);
-  tmp_3->SetShape({16, 32});
-  layers.elementwise_add(tmp_1, tmp_3);
+  auto* tmp_4 = layers.elementwise_add(tmp_1, tmp_3);
+
+  std::vector<VarDesc*> elementwise_vars = {
+      tmp_0, tmp_1, tmp_2, tmp_3, tmp_4};
+  for (auto* var : elementwise_vars) {
+    var->SetShape(shape);
+  }
 
   if (backward) {
-    layers.backward();
+    layers.backward({tmp_4});
   }
 
   std::unique_ptr<Graph> graph(
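[Review note] The corrected reference function computes the relu gradient from the forward output rather than the input: for out = max(x, 0), dx = dout when out > 0 and 0 otherwise, so x itself is not needed. A minimal standalone check of that rule (illustrative only, not part of the patch):

    #include <algorithm>
    #include <cassert>

    inline float relu(float x) { return std::max(x, 0.0f); }

    // Gradient expressed with the forward output only:
    // dx = dout * (out > 0 ? 1 : 0).
    inline float relu_grad_dx(float out, float dout) {
      return out > 0 ? dout : 0.0f;
    }

    int main() {
      assert(relu_grad_dx(relu(-2.0f), 3.0f) == 0.0f);  // negative input blocks the gradient
      assert(relu_grad_dx(relu(2.0f), 3.0f) == 3.0f);   // positive input passes dout through
      return 0;
    }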
diff --git a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass_tester.cc b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass_tester.cc
index a023897d121ec3df019c8f796fa98b56f5d69b43..172ec0c0ee84d06440626caad4771199e5771aae 100644
--- a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass_tester.cc
+++ b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass_tester.cc
@@ -22,17 +22,15 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
-void VisualizeGraph(std::unique_ptr<Graph> graph, std::string graph_viz_path) {
+void VisualizeGraph(std::unique_ptr<Graph>* graph, std::string graph_viz_path) {
   // Insert a graph_viz_pass to transform the graph to a .dot file.
   // It can be used for debug.
   auto graph_viz_pass = PassRegistry::Instance().Get("graph_viz_pass");
   graph_viz_pass->Set("graph_viz_path", new std::string(graph_viz_path));
-  graph.reset(graph_viz_pass->Apply(graph.release()));
+  graph->reset(graph_viz_pass->Apply(graph->release()));
 }
 
-TEST(FusionGroupPass, elementwise_list) {
-  fusion_group::OperationMap::Init();
-
+std::unique_ptr<Graph> BuildElementwiseListGraph(bool backward = false) {
   //        inputs            operator            output
   // --------------------------------------------------------
   // (x, y)                   mul              -> tmp_0
@@ -42,34 +40,33 @@
   //
   // Expression: tmp_3 = relu(mul(x, y) + z) + w
   Layers layers;
+  std::vector<int64_t> shape = {16, 32};
   auto* x = layers.data("x", {16, 16});
   auto* y = layers.data("y", {16, 32});
   auto* tmp_0 = layers.mul(x, y);
-  tmp_0->SetShape({16, 32});
-  auto* z = layers.data("z", {16, 32});
+  auto* z = layers.data("z", shape);
   auto* tmp_1 = layers.elementwise_add(tmp_0, z);
   auto* tmp_2 = layers.relu(tmp_1);
-  tmp_2->SetShape({16, 32});
-  auto* w = layers.data("w", {16, 32});
-  layers.elementwise_add(tmp_2, w);
-
-  std::unique_ptr<Graph> graph(new Graph(layers.main_program()));
-  // VisualizeGraph(graph, "00_elementwise_list.dot");
-
-  auto fusion_group_pass = PassRegistry::Instance().Get("fusion_group_pass");
-  VLOG(3) << DebugString(graph);
+  auto* w = layers.data("w", shape);
+  auto* tmp_3 = layers.elementwise_add(tmp_2, w);
+  std::vector<VarDesc*> elementwise_vars = {tmp_0, z, tmp_1, tmp_2, w, tmp_3};
+  for (auto* var : elementwise_vars) {
+    var->SetShape(shape);
+  }
 
-  graph.reset(fusion_group_pass->Apply(graph.release()));
-  // VisualizeGraph(graph, "01_elementwise_list.fusion_group.dot");
-  int num_fusion_group_ops = GetNumOpNodes(graph, "fusion_group");
-  VLOG(3) << DebugString(graph);
+  if (backward) {
+    layers.backward({tmp_3});
+  }
 
-  PADDLE_ENFORCE_EQ(num_fusion_group_ops, 1);
+  std::unique_ptr<Graph> graph(new Graph(layers.main_program()));
+#ifdef __clang__
+  return graph;
+#else
+  return std::move(graph);
+#endif
 }
 
-TEST(FusionGroupPass, elementwise_tree) {
-  fusion_group::OperationMap::Init();
-
+std::unique_ptr<Graph> BuildElementwiseTreeGraph(bool backward = false) {
   //        inputs            operator            output
   // --------------------------------------------------------
   // (x0, y0)                 mul              -> tmp_0
@@ -88,53 +85,72 @@
   // tmp_9 = tanh(x4) * sigmoid(x5)
   // tmp_10 = mul(tmp_6, tmp_9)
   Layers layers;
+  std::vector<int64_t> shape = {16, 32};
   auto* x0 = layers.data("x0", {16, 16});
   auto* y0 = layers.data("y0", {16, 32});
   auto* tmp_0 = layers.mul(x0, y0);
-  tmp_0->SetShape({16, 32});
-
-  auto* x1 = layers.data("x1", {16, 32});
+  auto* x1 = layers.data("x1", shape);
   auto* tmp_1 = layers.sigmoid(x1);
-  tmp_1->SetShape({16, 32});
-  auto* tmp_2 = layers.elementwise_mul(tmp_0, tmp_1);
-  tmp_2->SetShape({16, 32});
-
-  auto* x2 = layers.data("x2", {16, 32});
+  auto* tmp_2 = layers.elementwise_mul(tmp_0, tmp_1);
+  auto* x2 = layers.data("x2", shape);
   auto* tmp_3 = layers.sigmoid(x2);
-  tmp_3->SetShape({16, 32});
-  auto* x3 = layers.data("x3", {16, 32});
+  auto* x3 = layers.data("x3", shape);
   auto* tmp_4 = layers.tanh(x3);
-  tmp_4->SetShape({16, 32});
   auto* tmp_5 = layers.elementwise_mul(tmp_3, tmp_4);
-  tmp_5->SetShape({16, 32});
-
   auto* tmp_6 = layers.elementwise_add(tmp_2, tmp_5);
-  tmp_6->SetShape({16, 32});
-
-  auto* x4 = layers.data("x4", {16, 32});
+  auto* x4 = layers.data("x4", shape);
   auto* tmp_7 = layers.tanh(x4);
-  tmp_7->SetShape({16, 32});
-  auto* x5 = layers.data("x5", {16, 32});
+  auto* x5 = layers.data("x5", shape);
   auto* tmp_8 = layers.sigmoid(x5);
-  tmp_8->SetShape({16, 32});
-
   auto* tmp_9 = layers.elementwise_mul(tmp_7, tmp_8);
-  tmp_9->SetShape({16, 32});
-  layers.mul(tmp_6, tmp_9);
+  auto* tmp_10 = layers.mul(tmp_6, tmp_9);
+
+  std::vector<VarDesc*> elementwise_vars = {tmp_0, tmp_1, tmp_2, tmp_3, tmp_4,
+                                            tmp_5, tmp_6, tmp_7, tmp_8, tmp_9};
+  for (auto* var : elementwise_vars) {
+    var->SetShape(shape);
+  }
+
+  if (backward) {
+    layers.backward({tmp_10});
+  }
 
   std::unique_ptr<Graph> graph(new Graph(layers.main_program()));
-  // VisualizeGraph(graph, "00_elementwise_tree.dot");
+#ifdef __clang__
+  return graph;
+#else
+  return std::move(graph);
+#endif
+}
 
-  auto fusion_group_pass = PassRegistry::Instance().Get("fusion_group_pass");
+int TestMain(std::unique_ptr<Graph> graph, std::string prefix) {
+  // VisualizeGraph(&graph, prefix + ".dot");
+  auto pass = PassRegistry::Instance().Get("fusion_group_pass");
+  pass->Set("use_gpu", new bool(true));
   VLOG(3) << DebugString(graph);
-  graph.reset(fusion_group_pass->Apply(graph.release()));
-  // VisualizeGraph(graph, "01_elementwise_tree.fusion_group.dot");
+  graph.reset(pass->Apply(graph.release()));
+  // VisualizeGraph(&graph, prefix + ".fusion_group.dot");
   int num_fusion_group_ops = GetNumOpNodes(graph, "fusion_group");
   VLOG(3) << DebugString(graph);
 
-  PADDLE_ENFORCE_EQ(num_fusion_group_ops, 2);
+  return num_fusion_group_ops;
+}
+
+TEST(FusionGroupPass, elementwise_list) {
+  fusion_group::OperationMap::Init();
+
+  std::unique_ptr<Graph> graph = BuildElementwiseListGraph(false);
+  int num_fusion_group_ops = TestMain(std::move(graph), "elementwise_list");
+  EXPECT_EQ(num_fusion_group_ops, 1);
+}
+
+TEST(FusionGroupPass, elementwise_tree) {
+  fusion_group::OperationMap::Init();
+
+  std::unique_ptr<Graph> graph = BuildElementwiseTreeGraph(false);
+  int num_fusion_group_ops = TestMain(std::move(graph), "elementwise_tree");
+  EXPECT_EQ(num_fusion_group_ops, 2);
 }
 
 }  // namespace ir
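[Review note] The builder functions above return std::unique_ptr<Graph> behind an #ifdef __clang__ switch. A sketch of the same pattern in isolation, under the assumption that the explicit std::move is there for older compilers that do not move the returned local implicitly, while clang would warn that the move is redundant:

    #include <memory>

    struct Graph {
      int num_nodes = 0;
    };

    std::unique_ptr<Graph> BuildGraph() {
      std::unique_ptr<Graph> graph(new Graph());
      graph->num_nodes = 3;
    #ifdef __clang__
      return graph;             // implicit move; std::move here would trigger a warning
    #else
      return std::move(graph);  // explicit move for toolchains that need it
    #endif
    }

    int main() { return BuildGraph()->num_nodes == 3 ? 0 : 1; }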
diff --git a/paddle/fluid/framework/ir/fusion_group/operation.cc b/paddle/fluid/framework/ir/fusion_group/operation.cc
index ad89cc13cb2a1c65b2005f530454c0aab2bfa467..912044611f6f6de00a9ba0c7fbe2d36eb5cd4ceb 100644
--- a/paddle/fluid/framework/ir/fusion_group/operation.cc
+++ b/paddle/fluid/framework/ir/fusion_group/operation.cc
@@ -43,7 +43,11 @@ void OperationMap::Insert(int type, int num_operands, std::string op_type,
                           std::vector<std::string> input_names,
                           std::vector<std::string> output_names) {
   Operation op(type, num_operands, op_type, {expr}, input_names, output_names);
-  PADDLE_ENFORCE_EQ(op.IsValid(), true, "Operation %s is invalid.", op_type);
+  PADDLE_ENFORCE_EQ(op.IsValid(), true,
+                    platform::errors::InvalidArgument(
+                        "Operation %s is invalid. Please set correct "
+                        "expression for forward calculation.",
+                        op_type));
   operations_[op_type] = op;
 
   if (grad_exprs.size() > 0U) {
@@ -63,8 +67,11 @@ void OperationMap::Insert(int type, int num_operands, std::string op_type,
     }
     Operation grad_op(type, num_operands, grad_op_type, grad_exprs,
                       grad_input_names, grad_output_names);
-    PADDLE_ENFORCE_EQ(grad_op.IsValid(), true, "Operation %s is invalid.",
-                      grad_op_type);
+    PADDLE_ENFORCE_EQ(grad_op.IsValid(), true,
+                      platform::errors::InvalidArgument(
+                          "Operation %s is invalid. Please set correct "
+                          "expression for backward calculation.",
+                          grad_op_type));
     operations_[grad_op_type] = grad_op;
   }
 }
@@ -83,8 +90,8 @@ void OperationMap::InsertUnaryElementwiseOperations() {
 
   // relu:
   //  out = f(x) = x > 0 ? x : 0
-  //  dx = dout * (out > 0 ? 1 : 0) = dout * (x > 0 ? 1 : 0)
-  insert_handler("relu", "real_max(${0}, 0)", {"${0} > 0 ? ${2} : 0"});
+  //  dx = dout * (out > 0 ? 1 : 0)
+  insert_handler("relu", "real_max(${0}, 0)", {"${1} > 0 ? ${2} : 0"});
   // sigmoid:
   //  out = f(x) = 1.0 / (1.0 + exp(-x))
   //  dx = dout * out * (1 - out)
diff --git a/paddle/fluid/framework/ir/fusion_group/operation.h b/paddle/fluid/framework/ir/fusion_group/operation.h
index 3145e3052efb6b5bbd0fc25e75f77e9590b9b146..d23bea8a437090d9effb8259df0ab120e1b66e08 100644
--- a/paddle/fluid/framework/ir/fusion_group/operation.h
+++ b/paddle/fluid/framework/ir/fusion_group/operation.h
@@ -70,7 +70,10 @@ class OperationMap {
   OperationMap();
 
   static OperationMap& Instance() {
-    PADDLE_ENFORCE_NOT_NULL(map, "Need to initialize OperationMap first!");
+    PADDLE_ENFORCE_NOT_NULL(
+        map, platform::errors::PreconditionNotMet(
+                 "Please initialize OperationMap first, by calling "
+                 "framework::fusion_group::OperationMap::Init()!"));
     return *map;
   }
diff --git a/paddle/fluid/framework/ir/pass_tester_helper.h b/paddle/fluid/framework/ir/pass_tester_helper.h
index a89f0c3b661db7d9b9794d6edb792654846d09f5..3c74612552597ff469a3e32ccb0f59891bdf437d 100644
--- a/paddle/fluid/framework/ir/pass_tester_helper.h
+++ b/paddle/fluid/framework/ir/pass_tester_helper.h
@@ -270,9 +270,19 @@ struct Layers {
     return outs;
   }
 
-  void backward() {
+  void backward(std::vector<VarDesc*> targets) {
+    // This function is designed to simulate the structure of a training
+    // program, but it is constructed differently from the actual program.
     BlockDesc* block = program_.MutableBlock(0);
     std::vector<OpDesc*> forward_ops = block->AllOps();
+    for (auto* var : targets) {
+      OpDesc* none_op = block->AppendOp();
+      none_op->SetType("none");
+      none_op->SetInput("X", {var->Name()});
+      VarDesc* grad_var =
+          lod_tensor(GradVarName(var->Name()), var->GetShape(), false);
+      none_op->SetOutput("Out", {grad_var->Name()});
+    }
     for (int i = forward_ops.size() - 1; i >= 0; --i) {
       OpDesc* op = forward_ops[i];
       OpDesc* grad_op = block->AppendOp();
@@ -428,8 +438,21 @@ static std::string DebugString(Node* node) {
       is_first = false;
     }
     os << "}.";
-  } else if (node->IsVar() && node->Var()) {
-    os << "Node(" << node->Name() << "), inputs:{";
+  } else {
+    os << "Node(" << node->Name();
+    if (node->IsVar() && node->Var()) {
+      os << "{";
+      bool is_first = true;
+      for (auto dim : node->Var()->GetShape()) {
+        if (!is_first) {
+          os << "x";
+        }
+        os << dim;
+        is_first = false;
+      }
+      os << "}";
+    }
+    os << "), inputs:{";
     bool is_first = true;
     for (auto* in : node->inputs) {
       if (!is_first) {
@@ -477,12 +500,16 @@ static std::string DebugString(const std::unordered_set<Node*>& nodes) {
   return DebugString(vec);
 }
 
-static std::string DebugString(const std::unique_ptr<Graph>& graph) {
+static std::string DebugString(Graph* graph) {
   std::ostringstream os;
   os << "Graph: {\n" << DebugString(graph->Nodes()) << "}\n";
   return os.str();
 }
 
+static std::string DebugString(const std::unique_ptr<Graph>& graph) {
+  return DebugString(graph.get());
+}
+
 static int GetNumOpNodes(const std::unique_ptr<Graph>& graph,
                          std::string op_type) {
   int num_nodes = 0;
diff --git a/paddle/fluid/platform/device_code_test.cc b/paddle/fluid/platform/device_code_test.cc
index aa6bce6f1e54f86098b6df729ff00373ff0a638f..9331532058095df61b0a07d78c9ebb14e8be8ce8 100644
--- a/paddle/fluid/platform/device_code_test.cc
+++ b/paddle/fluid/platform/device_code_test.cc
@@ -85,7 +85,8 @@ TEST(DeviceCode, cuda) {
 }
 
 TEST(DeviceCodePool, cuda) {
-  if (!paddle::platform::dynload::HasNVRTC()) {
+  if (!paddle::platform::dynload::HasNVRTC() ||
+      !paddle::platform::dynload::HasCUDADriver()) {
     return;
   }
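[Review note] The backward({targets}) helper added to pass_tester_helper.h above relies on Paddle's gradient-naming convention: the gradient of a variable "x" is a separate variable named "x@GRAD" (the names x0@GRAD, tmp_2@GRAD, etc. in the test comments follow the same rule). A minimal sketch of that naming step alone (hypothetical standalone helper, not the Paddle API):

    #include <iostream>
    #include <string>
    #include <vector>

    // Gradient-variable name: forward name plus the "@GRAD" suffix.
    std::string GradVarName(const std::string& name) { return name + "@GRAD"; }

    int main() {
      std::vector<std::string> targets = {"tmp_4", "tmp_3"};
      for (const auto& t : targets) {
        // The test helper appends a "none" op that consumes t and produces
        // t@GRAD, then emits *_grad ops for the forward ops in reverse order.
        std::cout << "none(X=" << t << ") -> " << GradVarName(t) << "\n";
      }
      return 0;
    }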