diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc
index 5d1aa7d7e651610b1a268482f59f3bb2e3d49209..1bee3d9351bd59330794ca59b592349e7bee60b7 100644
--- a/paddle/fluid/operators/py_func_op.cc
+++ b/paddle/fluid/operators/py_func_op.cc
@@ -35,6 +35,9 @@ size_t AppendPythonCallableObjectAndReturnId(const py::object &py_obj) {
   return g_py_callables.size() - 1;
 }
 
+// Return py::object* instead of py::object.
+// Returning py::object would increase the reference count of the object,
+// and modifying a Python reference count without holding the GIL is not safe.
 static py::object *GetPythonCallableObject(size_t i) {
   PADDLE_ENFORCE_LT(i, g_py_callables.size(), "Invalid python callable id");
   return &g_py_callables[i];
@@ -47,7 +50,7 @@ static std::string PythonObjectToString(const py::object &py_callable) {
 
 static void CallPythonFunc(py::object *callable,
                            const std::vector<framework::LoDTensor> &ins,
-                           std::vector<framework::LoDTensor *> *out) {
+                           std::vector<framework::LoDTensor *> *outs) {
   py::gil_scoped_acquire guard;
   py::tuple in_args(ins.size());
   for (size_t i = 0; i < ins.size(); ++i) {
@@ -57,8 +60,8 @@ static void CallPythonFunc(py::object *callable,
   auto ret = (*callable)(*in_args);
   auto ret_tuple = py::cast<py::tuple>(ret);
   size_t ret_num = py::len(ret_tuple);
-  size_t out_num = out->size();
-  if (ret_num != out_num) {
+  size_t out_num = outs->size();
+  if (UNLIKELY(ret_num != out_num)) {
     // Python function has no return values or returns None
     // In this case, ret_num = 1 && ret[0] == None && out_num should be 0
     // Otherwise, ret_num must be equal to out_num
@@ -69,17 +72,18 @@
   }
 
   for (size_t i = 0; i < out_num; ++i) {
-    if ((*out)[i] == nullptr) {
+    auto *out = (*outs)[i];
+    if (out == nullptr) {
       continue;
     }
     try {
-      auto *out_tensor = py::cast<framework::LoDTensor *>(ret_tuple[i]);
-      PADDLE_ENFORCE_NOT_NULL(out_tensor,
+      auto *py_out_tensor = py::cast<framework::LoDTensor *>(ret_tuple[i]);
+      PADDLE_ENFORCE_NOT_NULL(py_out_tensor,
                               "Output tensor %d should not be nullptr", i);
-      (*out)[i]->set_lod(out_tensor->lod());
-      (*out)[i]->ShareDataWith(*out_tensor);
+      out->set_lod(py_out_tensor->lod());
+      out->ShareDataWith(*py_out_tensor);
     } catch (py::cast_error &) {
-      PADDLE_THROW("Output %d is not LoDTensor", i);
+      PADDLE_THROW("The %d-th output must be LoDTensor", i);
     }
   }
 }
@@ -94,6 +98,10 @@ class PyFuncOpShapeInference : public framework::InferShapeBase {
     PADDLE_ENFORCE_GE(ctx->Attrs().Get<int>(kForwardPythonCallableId), 0,
                       "Function id cannot be less than 0");
 
+    // Traverse all outputs.
+    // If the name of any output ends with @GRAD,
+    // set its shape, dtype, lod_level, type to be the same as
+    // the corresponding forward variable
     auto *op = boost::get<framework::OpDesc *>(ctx->GetOp());
     auto *block = op->Block();
     const std::string kGradVarSuffix = framework::kGradVarSuffix;
@@ -115,7 +123,7 @@ class PyFuncOpShapeInference : public framework::InferShapeBase {
       auto *in_var_desc = block->FindVarRecursive(fwd_var_name);
       PADDLE_ENFORCE_NOT_NULL(in_var_desc, "Forward variable %s not found",
                               fwd_var_name);
-      VLOG(10) << "Infer shape of Out(" << out_name << ") as Input("
+      VLOG(10) << "Infer shape of Output(" << out_name << ") as Input("
                << in_var_desc->Name() << ")";
       out_var_desc->SetShape(in_var_desc->GetShape());
       out_var_desc->SetDataType(in_var_desc->GetDataType());
@@ -135,7 +143,7 @@ class PyFuncOpMaker : public framework::OpProtoAndCheckerMaker {
             "Index of registered forward Python function.")
         .SetDefault(0);
     AddAttr<int>(kBackwardPythonCallableId,
-                 "Index of registered backward Python function")
+                 "Index of registered backward Python function.")
         .SetDefault(-1);
     AddAttr<std::vector<std::string>>(kPyFuncBackwardSkipVars,
                                       "Unused forward in/out in backward op")
@@ -170,8 +178,7 @@ class PyFuncOpGradDescMaker : public framework::GradOpDescMakerBase {
     auto fwd_outs = Output("Out");
 
     // For memory reused, some inputs/output in forward part may be not needed
-    // in backward part
-    // Just skip these vars
+    // in backward part. Skipping these vars helps to save memory
     auto &backward_skip_var_list = boost::get<std::vector<std::string>>(
         fwd_attrs.at(kPyFuncBackwardSkipVars));
     std::unordered_set<std::string> backward_skip_var_set(
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 348a0739152969dbb66fea453bbec597de7efcfc..208efbea4a57f302d9085448b67a4753eccec5d9 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -104,7 +104,7 @@ PYBIND11_MODULE(core, m) {
   BindException(&m);
 
   m.def(
-      "append_python_callable_object_and_return_id",
+      "_append_python_callable_object_and_return_id",
       [](py::object py_obj) -> size_t {
         return paddle::operators::AppendPythonCallableObjectAndReturnId(py_obj);
       });
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index db7ec9d021f79f55ce4632ffef6d17204ca14433..3cd0a2887e548b4718b0a5a16f4884a014ef43bf 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -9137,8 +9137,13 @@ class PyFuncRegistry(object):
 
         self._func = func
         # find named args using reflection
-        self._named_args = inspect.getargspec(self._func)[0]
-        self._id = core.append_python_callable_object_and_return_id(self)
+        args = inspect.getargspec(self._func)
+        if len(args[0]) == 0 and args[1] is None and args[2] is None:
+            # Function with no inputs
+            self._named_args = None
+        else:
+            self._named_args = args[0]
+        self._id = core._append_python_callable_object_and_return_id(self)
         '''
         Why record self here?
 
@@ -9168,13 +9173,16 @@
         return self._id
 
     def __call__(self, *args):
-        kwargs = dict()
-        idx = 0
-        for arg in self._named_args:
-            kwargs[arg] = args[idx]
-            idx += 1
+        if self._named_args is None:
+            func_ret = self._func()
+        else:
+            kwargs = dict()
+            idx = 0
+            for arg in self._named_args:
+                kwargs[arg] = args[idx]
+                idx += 1
+            func_ret = self._func(*args[idx:], **kwargs)
 
-        func_ret = self._func(*args[idx:], **kwargs)
         if not isinstance(func_ret, (list, tuple)):
             func_ret = (func_ret, )
 
@@ -9207,14 +9215,18 @@ def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None):
 
     User should set the right data type and shape of :code:`out` before
     calling this function. However, data types and shapes of gradients of
-    :code:`out` and :code:`x` would be infered automatically.
+    :code:`out` and :code:`x` would be inferred automatically.
 
-    The orders of inputs of :code:`backward_func` would be: forward input
-    :code:`x`, forward output :code:`out` and backward input gradient of
+    The input order of :code:`backward_func` would be: forward inputs
+    :code:`x`, forward outputs :code:`out`, and backward input gradients of
     :code:`out`. If some variables of :code:`out` have no gradient, the
     input tensor would be None in Python side. If some variables of
    :code:`in` have no gradient, users should return None.
 
+    This function can also be used to debug the running network. Users can
+    add a :code:`py_func` operator without outputs and print the input
+    :code:`x` inside :code:`func`.
+
     Args:
         func (callable): forward Python function.
         x (Variable|list(Variable)|tuple(Variable)): inputs of :code:`func`.
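
For context, a minimal usage sketch of the py_func API documented above. The names, shapes, and the tanh/tanh_grad callables are illustrative (they mirror the helpers in the updated unit test); out must be created with the right dtype and shape beforehand, while the gradients of out and x are inferred automatically:

    import numpy as np
    import paddle.fluid as fluid

    def tanh(x):
        # forward callable: receives the op's input tensors
        return np.tanh(np.array(x))

    def tanh_grad(y, dy):
        # backward callable: receives forward outputs and output gradients
        return np.array(dy) * (1 - np.square(np.array(y)))

    x = fluid.layers.data(name='x', shape=[32], dtype='float32')
    # out is pre-created because py_func cannot infer the forward output
    hidden = fluid.default_main_program().current_block().create_var(
        name='hidden', dtype='float32', shape=[-1, 32])
    hidden = fluid.layers.py_func(
        func=tanh, x=x, out=hidden, backward_func=tanh_grad)
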
diff --git a/python/paddle/fluid/tests/unittests/test_py_func_op.py b/python/paddle/fluid/tests/unittests/test_py_func_op.py
index 491bbc219024d921da483ddbee9e75b5add11afd..943ad3ed22480193dc51375cdcca5ed36ce35158 100644
--- a/python/paddle/fluid/tests/unittests/test_py_func_op.py
+++ b/python/paddle/fluid/tests/unittests/test_py_func_op.py
@@ -25,6 +25,14 @@ if fluid.core.is_compiled_with_cuda():
     os.environ['CPU_NUM'] = str(dev_cnt)
 
 
+def dummy_func_with_no_input():
+    return float(1.0)
+
+
+def dummy_func_with_no_output(x):
+    pass
+
+
 def tanh(x):
     return np.tanh(x)
 
@@ -86,13 +94,20 @@ def simple_fc_net(img, label, use_py_func_op):
     else:
         loss = fluid.default_main_program().current_block().create_var(
             name='loss', dtype='float32', shape=[-1, 1])
-        fluid.layers.py_func(
+        loss = fluid.layers.py_func(
             func=cross_entropy,
             x=[prediction, label],
             out=loss,
             backward_func=cross_entropy_grad,
             skip_vars_in_backward_input=loss)
+        dummy_var = fluid.default_main_program().current_block().create_var(
+            name='test_tmp_var', dtype='float32', shape=[1])
+        fluid.layers.py_func(
+            func=dummy_func_with_no_input, x=None, out=dummy_var)
+
+        fluid.layers.py_func(func=dummy_func_with_no_output, x=loss, out=None)
+
     loss = fluid.layers.mean(loss)
     return loss
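
For context, a sketch of the debugging pattern that the new docstring paragraph and the dummy_func_with_no_output test case describe: a py_func call with out=None builds a side-effect-only operator, which can print intermediate tensors while the network runs. The variable name x below is illustrative:

    import numpy as np
    import paddle.fluid as fluid

    def debug_print(x):
        # no return value: the op has no outputs, only the print side effect
        print(np.array(x))

    x = fluid.layers.data(name='x', shape=[4], dtype='float32')
    fluid.layers.py_func(func=debug_print, x=x, out=None)
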