diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc
index 896918a607106380158ca9b9d40cdf5e5ad990f5..7bcc3d6c608c947f71ae030cfb17d4a89495939e 100644
--- a/paddle/fluid/imperative/basic_engine.cc
+++ b/paddle/fluid/imperative/basic_engine.cc
@@ -471,12 +471,20 @@ void BasicEngine::Execute() {
     {
       VLOG(3) << "Start to execute grad op " << cur_op.Type();
-      if (tmp_ins_ptr == nullptr) {
-        OpBase::Run(cur_op.InnerOp(), bwd_ins, tmp_outs, cur_op.Attrs(),
-                    cur_op.place());
-      } else {
-        OpBase::Run(cur_op.InnerOp(), *tmp_ins_ptr, tmp_outs, cur_op.Attrs(),
-                    cur_op.place());
+      try {
+        if (tmp_ins_ptr == nullptr) {
+          OpBase::Run(cur_op.InnerOp(), bwd_ins, tmp_outs, cur_op.Attrs(),
+                      cur_op.place());
+        } else {
+          OpBase::Run(cur_op.InnerOp(), *tmp_ins_ptr, tmp_outs,
+                      cur_op.Attrs(), cur_op.place());
+        }
+      } catch (platform::EnforceNotMet& exception) {
+        Clear();
+        throw std::move(exception);
+      } catch (std::exception& ex) {
+        Clear();
+        PADDLE_THROW(platform::errors::External("%s", ex.what()));
       }
     }
diff --git a/paddle/fluid/imperative/py_layer_fwd.h b/paddle/fluid/imperative/py_layer_fwd.h
index bd132f2576fec14511523958d4ce64077b99b1f1..ccfd5b0e2dbfcd28692e899f17566cb4b6cf9344 100644
--- a/paddle/fluid/imperative/py_layer_fwd.h
+++ b/paddle/fluid/imperative/py_layer_fwd.h
@@ -115,12 +115,12 @@ py::object PyLayerApply(const platform::Place& place, const py::object& cls,
                 tuple_result[i].cast<std::shared_ptr<imperative::VarBase>>();
             output_vars.push_back(temp_out);
           } catch (py::cast_error&) {
-            PADDLE_THROW(platform::errors::Unimplemented(
-                "The output of `PyLayer.forward` should be `Tensor`."));
+            // Only collect `Tensor` outputs of `PyLayer.forward` and pass
+            // them to backward; ignore other output types for now.
           }
         } else {
-          PADDLE_THROW(platform::errors::Unimplemented(
-              "The output of `PyLayer.forward` can not be `None`."));
+          // Only collect `Tensor` outputs of `PyLayer.forward` and pass
+          // them to backward; ignore other output types for now.
         }
       }
     } else {
@@ -130,14 +130,18 @@ py::object PyLayerApply(const platform::Place& place, const py::object& cls,
               result_forward.cast<std::shared_ptr<imperative::VarBase>>();
           output_vars.push_back(temp_out);
         } catch (py::cast_error&) {
-          PADDLE_THROW(platform::errors::Unimplemented(
-              "The output of `PyLayer.forward` should be `Tensor`."));
+          // Only collect `Tensor` outputs of `PyLayer.forward` and pass
+          // them to backward; ignore other output types for now.
        }
      } else {
-        PADDLE_THROW(platform::errors::Unimplemented(
-            "The output of `PyLayer.forward` can not be `None`."));
+        // Only collect `Tensor` outputs of `PyLayer.forward` and pass
+        // them to backward; ignore other output types for now.
      }
    }
+    if (output_vars.size() == 0) {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "At least one output of `PyLayer.forward` should be a `Tensor`."));
+    }

    NameVarBaseMap outs = {{"Out", output_vars}};
diff --git a/paddle/fluid/operators/py_layer_op.cc b/paddle/fluid/operators/py_layer_op.cc
index 65e10181dcc3df06395ae5cae65efb251021857e..0090747d1161a4c02fc67407d92b99aff5faec30 100644
--- a/paddle/fluid/operators/py_layer_op.cc
+++ b/paddle/fluid/operators/py_layer_op.cc
@@ -86,6 +86,12 @@ void RunPyObject(py::object *py_object,
       }
     }
   } else {
+    if (1 != outs->size()) {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "The number of outputs of `PyLayer.backward` should be %d, but "
+          "received 1.",
+          outs->size()));
+    }
     if ((*outs)[0] != nullptr) {
       if (Py_None != py_result.ptr()) {
         try {
diff --git a/python/paddle/fluid/tests/unittests/test_pylayer_op.py b/python/paddle/fluid/tests/unittests/test_pylayer_op.py
index d329bf570a5845c7c261e53e9bd0c064a908ae09..e058115d691993781d7f6d0fb9aa20b633ab60d9 100644
--- a/python/paddle/fluid/tests/unittests/test_pylayer_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pylayer_op.py
@@ -30,7 +30,7 @@ class TestPyLayer(unittest.TestCase):
                 y1 = func1(x1)
                 y2 = func1(x2)
                 ctx.save_for_backward(y1, y2)
-                return y1, y2
+                return y1, 1, y2, None

             @staticmethod
             def backward(ctx, dy1, dy2):
@@ -44,7 +44,7 @@ class TestPyLayer(unittest.TestCase):
         input1.stop_gradient = False
         input2.stop_gradient = False
         z = tanh.apply(input1, input1, paddle.tanh, paddle.square)
-        z = z[0] + z[1]
+        z = z[0] + z[2]
         z.mean().backward()

         z2 = paddle.tanh(input2) + paddle.tanh(input2)
@@ -61,7 +61,7 @@ class TestPyLayer(unittest.TestCase):
                 y1 = func1(x1)
                 y2 = func1(x2)
                 ctx.save_for_backward(y1, y2)
-                return y1, y2
+                return 1, None, y1, y2, ''

             @staticmethod
             def backward(ctx, dy1, dy2):
@@ -79,7 +79,7 @@ class TestPyLayer(unittest.TestCase):
         input3.stop_gradient = True
         input4.stop_gradient = True
         z = tanh.apply(input1, input3, paddle.tanh, paddle.square)
-        z = z[0] + z[1]
+        z = z[2] + z[3]
         z.mean().backward()

         z2 = paddle.tanh(input2) + paddle.tanh(input4)
@@ -115,6 +115,27 @@ class TestPyLayer(unittest.TestCase):
         self.assertTrue(
             np.max(np.abs((input1.grad.numpy() - input2.grad.numpy()))) <
             1e-10)

+    def test_pylayer_num_output_match(self):
+        class tanh(PyLayer):
+            @staticmethod
+            def forward(
+                    ctx,
+                    x1,
+                    x2, ):
+                return x1 + x2
+
+            @staticmethod
+            def backward(ctx, dy1):
+                return dy1 + 1
+
+        input1 = paddle.randn([2, 3]).astype("float64")
+        input2 = input1.detach().clone()
+        input1.stop_gradient = False
+        input2.stop_gradient = False
+        z = tanh.apply(input1, input2)
+        with self.assertRaises(ValueError):
+            z.mean().backward()
+
     def test_pylayer_dtype(self):
         class tanh(PyLayer):
             @staticmethod
@@ -150,21 +171,21 @@ class TestPyLayer(unittest.TestCase):
                 return args

         input1 = paddle.randn([2, 3]).astype("float64")
-        with self.assertRaises(NotImplementedError):
+        with self.assertRaises(ValueError):
             z = Layer_None1.apply(input1)

         class Layer_None2(PyLayer):
             @staticmethod
             def forward(ctx, *args):
-                return [None, None]
+                return [None, args[0]]

             @staticmethod
             def backward(ctx, *args):
                 return args

         input1 = paddle.randn([2, 3]).astype("float64")
-        with self.assertRaises(NotImplementedError):
-            z = Layer_None2.apply(input1)
+        # `None` outputs are now ignored, so this no longer raises.
+        z = Layer_None2.apply(input1)

         class Layer_one1(PyLayer):
             @staticmethod
             def forward(ctx, *args):
                 return 1

             @staticmethod
             def backward(ctx, *args):
                 return args

         input1 = paddle.randn([2, 3]).astype("float64")
-        with self.assertRaises(NotImplementedError):
+        # At least one output of `PyLayer.forward` must be a `Tensor`.
+        with self.assertRaises(ValueError):
             z = Layer_one1.apply(input1)

         class Layer_one2(PyLayer):
             @staticmethod
             def forward(ctx, *args):
-                return [1, 2]
+                return [1, 2, args[0]]

             @staticmethod
             def backward(ctx, *args):
                 return args

         input1 = paddle.randn([2, 3]).astype("float64")
-        with self.assertRaises(NotImplementedError):
-            z = Layer_one2.apply(input1)
+        # `int` outputs are now ignored, so this no longer raises.
+        z = Layer_one2.apply(input1)

         class Layer_no_fw(PyLayer):
             @staticmethod
@@ -234,8 +256,7 @@ class TestPyLayer(unittest.TestCase):
         z = Layer_bk_none1.apply(input2)

         with self.assertRaises(ValueError):
-            with paddle.fluid.dygraph.guard():
-                z.sum().backward()
+            z.sum().backward()

         class Layer_bk_none2(PyLayer):
             @staticmethod
@@ -249,9 +270,9 @@ class TestPyLayer(unittest.TestCase):
         input1 = paddle.randn([2, 3]).astype("float64")
         input1.stop_gradient = False
         z = Layer_bk_none2.apply(input1, input1)
+
         with self.assertRaises(ValueError):
-            with paddle.fluid.dygraph.guard():
-                z.mean().backward()
+            z.mean().backward()

         class Layer_bk_one1(PyLayer):
             @staticmethod
@@ -265,9 +286,9 @@ class TestPyLayer(unittest.TestCase):
         input1 = paddle.randn([2, 3]).astype("float64")
         input1.stop_gradient = False
         z = Layer_bk_one1.apply(input1)
+
         with self.assertRaises(ValueError):
-            with paddle.fluid.dygraph.guard():
-                z.mean().backward()
+            z.mean().backward()

         class Layer_bk_one2(PyLayer):
             @staticmethod
@@ -280,11 +301,11 @@ class TestPyLayer(unittest.TestCase):
         input1 = paddle.randn([2, 3]).astype("float64")
         input1.stop_gradient = False
+
         y = Layer_bk_one2.apply(input1, input1)
         z = y[0] + y[1]
         with self.assertRaises(ValueError):
-            with paddle.fluid.dygraph.guard():
-                z.mean().backward()
+            z.mean().backward()

         class Layer_no_bk(PyLayer):
             @staticmethod
@@ -295,10 +316,9 @@ class TestPyLayer(unittest.TestCase):
         input1.stop_gradient = False
         z = Layer_no_bk.apply(input1)

-        with self.assertRaises(NotImplementedError):
-            with paddle.fluid.dygraph.guard():
-                z = z[0] + z[1]
-                z.mean().backward()
+        with self.assertRaises(OSError):
+            z = z[0] + z[1]
+            z.mean().backward()

         class Layer_bk_match(PyLayer):
             @staticmethod
@@ -313,9 +333,8 @@ class TestPyLayer(unittest.TestCase):
         input1.stop_gradient = False
         z = Layer_bk_match.apply(input1)
         with self.assertRaises(ValueError):
-            with paddle.fluid.dygraph.guard():
-                z = z[0] + z[1]
-                z.mean().backward()
+            z = z[0] + z[1]
+            z.mean().backward()

     def test_pylayer_bk_return_none(self):
         class Layer_bk_none1(PyLayer):
@@ -334,8 +353,7 @@ class TestPyLayer(unittest.TestCase):
         z = Layer_bk_none1.apply(input1, input2)

         with self.assertRaises(ValueError):
-            with paddle.fluid.dygraph.guard():
-                z.mean().backward()
+            z.mean().backward()

         class Layer_bk_none2(PyLayer):
             @staticmethod
@@ -353,8 +371,7 @@ class TestPyLayer(unittest.TestCase):
         z = Layer_bk_none2.apply(input1, input2)
         z = z[0] + z[1]
         with self.assertRaises(ValueError):
-            with paddle.fluid.dygraph.guard():
-                z.mean().backward()
+            z.mean().backward()

     def test_pylayer_inplace(self):
         class cus_tanh(PyLayer):
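
Reviewer note: below is a minimal sketch of the behavior this patch enables, adapted from the new test cases above. The layer name, the import path, and the tensor shape are illustrative assumptions, not part of the patch; `apply()` returns the forward outputs as-is, while gradients flow only through the `Tensor` entries.

    import paddle
    from paddle.autograd import PyLayer  # import path assumed from the test file's usage

    class MixedOutputs(PyLayer):  # hypothetical layer mirroring the updated tests
        @staticmethod
        def forward(ctx, x):
            y = paddle.tanh(x)
            ctx.save_for_backward(y)
            # Non-Tensor outputs (int, None, str) are now tolerated,
            # as long as at least one output is a Tensor.
            return 1, None, y

        @staticmethod
        def backward(ctx, dy):
            # One grad per Tensor output of forward; returning the wrong
            # number of grads now raises ValueError at backward time.
            y, = ctx.saved_tensor()
            return dy * (1 - paddle.square(y))

    x = paddle.randn([2, 3])
    x.stop_gradient = False
    out = MixedOutputs.apply(x)  # full tuple: out[0] == 1, out[1] is None
    out[2].mean().backward()     # grads flow only through the Tensor output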