PaddlePaddle / PaddleDetection · Commit 8b9d33fa

Authored Dec 12, 2018 by sneaxiy
Parent: e240ba29

add unittest and fix bug

add API.spec
test=develop
Showing 4 changed files with 374 additions and 137 deletions (+374 −137)
paddle/fluid/API.spec                                     +1   −0
paddle/fluid/operators/py_func_op.cc                      +81  −42
python/paddle/fluid/layers/nn.py                          +147 −95
python/paddle/fluid/tests/unittests/test_py_func_op.py    +145 −0
paddle/fluid/API.spec
@@ -197,6 +197,7 @@ paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act
 paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
+paddle.fluid.layers.py_func ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
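
Each API.spec line pins the inspect-derived signature of a public API; the added py_func entry records the new layer's signature. A minimal sketch of how such an entry can be reproduced, assuming a Python version where inspect.getargspec (which this era of the codebase itself uses) is still available; the stub below is illustrative, not Paddle's actual spec generator:

import inspect

# Illustrative stub with the same signature as the new paddle.fluid.layers.py_func.
def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None):
    pass

# ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'],
#         varargs=None, keywords=None, defaults=(None, None))
print(inspect.getargspec(py_func))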
paddle/fluid/operators/py_func_op.cc
@@ -26,26 +26,35 @@ namespace py = pybind11;
 static std::vector<py::object> g_py_callables;
 
+const char kForwardPythonCallableId[] = "forward_callable_id";
+const char kBackwardPythonCallableId[] = "backward_callable_id";
+const char kPyFuncBackwardSkipVars[] = "backward_skip_vars";
+
 size_t AppendPythonCallableObjectAndReturnId(py::object py_obj) {
   g_py_callables.emplace_back(py_obj);
   return g_py_callables.size() - 1;
 }
 
 static py::object *GetPythonCallableObject(size_t i) {
-  PADDLE_ENFORCE_LT(i, g_py_callables.size());
+  PADDLE_ENFORCE_LT(i, g_py_callables.size(), "Invalid python callable id");
   return &g_py_callables[i];
 }
 
-void CallPythonFunc(py::object *callable, const std::string &func_token,
-                    const std::vector<framework::LoDTensor> &ins,
-                    std::vector<framework::LoDTensor *> *out) {
-  py::gil_scoped_acquire guard{};
+std::string PythonObjectToString(const py::object &py_callable) {
+  py::gil_scoped_acquire guard;
+  return py::str(*py_callable);
+}
+
+void CallPythonFunc(py::object *callable,
+                    const std::vector<framework::LoDTensor> &ins,
+                    std::vector<framework::LoDTensor *> *out) {
+  py::gil_scoped_acquire guard;
   py::tuple in_args(ins.size());
   for (size_t i = 0; i < ins.size(); ++i) {
     in_args[i] = ins[i].IsInitialized() ? py::cast(ins[i]) : py::cast(nullptr);
   }
 
-  auto ret = (*callable)(func_token, *in_args);
+  auto ret = (*callable)(*in_args);
   auto ret_tuple = py::cast<py::tuple>(ret);
   PADDLE_ENFORCE_EQ(py::len(ret_tuple), out->size(), "Output number not match");
   for (size_t i = 0; i < out->size(); ++i) {
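
This hunk replaces the string-token dispatch with a plain integer index into the global g_py_callables vector, and drops the func_token argument from CallPythonFunc accordingly. A minimal Python sketch of the same id-based registry pattern (function names here are mine, not Paddle's):

# Sketch of the registry behind AppendPythonCallableObjectAndReturnId /
# GetPythonCallableObject; names and error handling are illustrative.
_callables = []

def append_callable_and_return_id(obj):
    _callables.append(obj)
    return len(_callables) - 1

def get_callable(i):
    if i >= len(_callables):
        raise IndexError('Invalid python callable id')
    return _callables[i]

fid = append_callable_and_return_id(lambda *xs: xs)
assert get_callable(fid)(1, 2) == (1, 2)

An integer id avoids string hashing on every forward call and lets the op carry nothing but the forward_callable_id attribute.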
@@ -55,7 +64,7 @@ void CallPythonFunc(py::object *callable, const std::string &func_token,
     try {
       auto *out_tensor = py::cast<framework::LoDTensor *>(ret_tuple[i]);
       PADDLE_ENFORCE_NOT_NULL(out_tensor,
-                              "Output tensor should not be nullptr");
+                              "Output tensor %d should not be nullptr", i);
       (*out)[i]->set_lod(out_tensor->lod());
       (*out)[i]->ShareDataWith(*out_tensor);
     } catch (py::cast_error &) {
@@ -69,26 +78,23 @@ class PyFuncOpShapeInference : public framework::InferShapeBase {
   void operator()(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(!ctx->IsRuntime(),
                    "Infer shape cannot be called in runtime.");
-    PADDLE_ENFORCE(ctx->HasInputs("X"), "Input(X) must exist");
-    PADDLE_ENFORCE(ctx->HasOutputs("Out"), "Output(Out) must exist");
+    PADDLE_ENFORCE(ctx->HasInputs("X") || ctx->HasOutputs("Out"),
+                   "Input(X) or Output(Out) must exist");
+    PADDLE_ENFORCE_GE(ctx->Attrs().Get<int>(kForwardPythonCallableId), 0,
+                      "Function id cannot be less than 0");
 
     auto *op = boost::get<const framework::OpDesc *>(ctx->GetOp());
     auto *block = op->Block();
     // No need to infer shape in forward part
     if (block->ForwardBlockID() < 0) {
       return;
     }
 
-    PADDLE_ENFORCE(!ctx->Attrs().Get<std::string>("token").empty(),
-                   "Function token cannot be empty");
-
     const std::string kGradVarSuffix = framework::kGradVarSuffix;
     auto out_vars = ctx->GetOutputVarPtrs("Out");
     for (auto &out_var : out_vars) {
       auto *out_var_desc = boost::get<framework::VarDesc *>(out_var);
+      if (out_var_desc == nullptr) {
+        continue;
+      }
       auto out_name = out_var_desc->Name();
       if (out_name == framework::kEmptyVarName ||
-          out_name.size() < kGradVarSuffix.size()) {
+          out_name.size() <= kGradVarSuffix.size()) {
         continue;
       }
@@ -98,6 +104,8 @@ class PyFuncOpShapeInference : public framework::InferShapeBase {
       auto *in_var_desc = block->FindVarRecursive(fwd_var_name);
       PADDLE_ENFORCE_NOT_NULL(in_var_desc, "Forward variable %s not found",
                               fwd_var_name);
+      VLOG(10) << "Infer shape of Out(" << out_name << ") as Input("
+               << in_var_desc->Name() << ")";
       out_var_desc->SetShape(in_var_desc->GetShape());
       out_var_desc->SetDataType(in_var_desc->GetDataType());
       out_var_desc->SetLoDLevel(in_var_desc->GetLoDLevel());
@@ -112,13 +120,15 @@ class PyFuncOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddInput("X", "Inputs of py_func op.").AsDuplicable();
     AddOutput("Out", "Outputs of py_func op").AsDuplicable();
-    AddAttr<int>("handle_idx", "Index of the registered py_func handle")
+    AddAttr<int>(kForwardPythonCallableId,
+                 "Index of registered forward Python function.")
         .SetDefault(0);
-    AddAttr<std::string>("token", "Token of function token to be called")
-        .SetDefault("");
-    AddAttr<std::string>("backward_token",
-                         "Token of backward function to be called")
-        .SetDefault("");
+    AddAttr<int>(kBackwardPythonCallableId,
+                 "Index of registered backward Python function")
+        .SetDefault(-1);
+    AddAttr<std::vector<std::string>>(kPyFuncBackwardSkipVars,
+                                      "Unused forward in/out in backward op")
+        .SetDefault(std::vector<std::string>());
     AddComment(R"DOC("PyFunc Op")DOC");
   }
 };
@@ -129,7 +139,8 @@ class PyFuncOpGradDescMaker : public framework::GradOpDescMakerBase {
   std::vector<std::unique_ptr<framework::OpDesc>> operator()() const override {
     auto &fwd_attrs = Attrs();
-    if (fwd_attrs.at("backward_token").empty()) {
+    // no backward op when backward_id is less than 0
+    if (boost::get<int>(fwd_attrs.at(kBackwardPythonCallableId)) < 0) {
       return {};
     }
@@ -137,36 +148,65 @@ class PyFuncOpGradDescMaker : public framework::GradOpDescMakerBase {
     grad_op->SetType("py_func");
 
     framework::AttributeMap bwd_attrs;
-    bwd_attrs["token"] = fwd_attrs.at("backward_token");
-    bwd_attrs["backward_token"] = std::string("");
+    bwd_attrs[kForwardPythonCallableId] =
+        fwd_attrs.at(kBackwardPythonCallableId);
+    bwd_attrs[kBackwardPythonCallableId] = -1;
     grad_op->SetAttrMap(bwd_attrs);
 
-    auto bwd_in = Input("X");
-    auto fwd_out = Output("Out");
-    auto fwd_out_grad = OutputGrad("Out");
-    bwd_in.insert(bwd_in.end(), fwd_out.begin(), fwd_out.end());
-    bwd_in.insert(bwd_in.end(), fwd_out_grad.begin(), fwd_out_grad.end());
+    // All forward inputs
+    auto fwd_ins = Input("X");
+    // All forward outputs
+    auto fwd_outs = Output("Out");
+
+    // For memory reused, some inputs/output in forward part may be not needed
+    // in backward part
+    // Just skip these vars
+    auto &backward_skip_var_list = boost::get<std::vector<std::string>>(
+        fwd_attrs.at(kPyFuncBackwardSkipVars));
+    std::unordered_set<std::string> backward_skip_var_set(
+        backward_skip_var_list.begin(), backward_skip_var_list.end());
+    std::vector<std::string> bwd_ins;
+    bwd_ins.reserve(fwd_ins.size() + fwd_outs.size());
+    for (auto &fwd_in : fwd_ins) {
+      if (backward_skip_var_set.count(fwd_in) == 0) {
+        bwd_ins.emplace_back(fwd_in);
+      }
+    }
+    for (auto &fwd_out : fwd_outs) {
+      if (backward_skip_var_set.count(fwd_out) == 0) {
+        bwd_ins.emplace_back(fwd_out);
+      }
+    }
+
+    // Backward OG cannot be skipped
+    // But in Python side, if OG is kEmptyVarName, input tensor would be None
+    auto fwd_out_grads = OutputGrad("Out");
+    bwd_ins.reserve(bwd_ins.size() + fwd_out_grads.size());
+    bwd_ins.insert(bwd_ins.end(), fwd_out_grads.begin(), fwd_out_grads.end());
 
-    auto bwd_out = InputGrad("X", false);
+    // Backward IG cannot be skipped
+    // But in Python side, if IG is not needed, users can just return None
+    auto bwd_outs = InputGrad("X", false);
 
     if (VLOG_IS_ON(10)) {
       std::string in_str = "PyFunc Grad Input: ";
-      for (auto &in : bwd_in) {
+      for (auto &in : bwd_ins) {
         in_str += in;
         in_str += " ";
       }
       VLOG(10) << in_str;
 
       std::string out_str = "PyFunc Grad Output: ";
-      for (auto &out : bwd_out) {
+      for (auto &out : bwd_outs) {
         out_str += out;
-        out += " ";
+        out_str += " ";
       }
       VLOG(10) << out_str;
     }
 
-    grad_op->SetInput("X", bwd_in);
-    grad_op->SetOutput("Out", InputGrad("X", false));
+    grad_op->SetInput("X", bwd_ins);
+    grad_op->SetOutput("Out", bwd_outs);
 
     std::vector<std::unique_ptr<framework::OpDesc>> ret(1);
     ret[0] = std::move(grad_op);
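
The rewritten grad-op desc maker assembles the backward op's inputs as: forward inputs, then forward outputs (each filtered against backward_skip_vars, which enables memory reuse), then the output gradients, which are never skipped. A compact sketch of that assembly, with plain strings standing in for variable names (the helper name is mine):

def build_bwd_ins(fwd_ins, fwd_outs, fwd_out_grads, backward_skip_vars):
    skip = set(backward_skip_vars)
    # forward inputs/outputs listed in backward_skip_vars are dropped
    bwd_ins = [v for v in fwd_ins if v not in skip]
    bwd_ins += [v for v in fwd_outs if v not in skip]
    # output gradients (OG) can never be skipped
    bwd_ins += fwd_out_grads
    return bwd_ins

assert build_bwd_ins(['x'], ['y'], ['y@GRAD'], ['x']) == ['y', 'y@GRAD']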
@@ -210,12 +250,11 @@ class PyFuncOp : public framework::OperatorBase {
       outputs[i] = out_tensor;
     }
 
-    auto &token = Attr<std::string>("token");
-    auto handle_idx = static_cast<size_t>(Attr<int>("handle_idx"));
-    auto *py_callable = GetPythonCallableObject(handle_idx);
-    VLOG(10) << "Call py_func_op with token " << token << ", and handle_idx "
-             << handle_idx;
-    CallPythonFunc(py_callable, token, inputs, &outputs);
+    auto callable_id = static_cast<size_t>(Attr<int>(kForwardPythonCallableId));
+    auto *py_callable = GetPythonCallableObject(callable_id);
+    VLOG(10) << "Call py_func_op with id " << callable_id << ": "
+             << PythonObjectToString(*py_callable);
+    CallPythonFunc(py_callable, inputs, &outputs);
   }
 };
python/paddle/fluid/layers/nn.py
@@ -9087,104 +9087,140 @@ def get_tensor_from_selected_rows(x, name=None):
     return out
 
 
-@templatedoc()
-def py_func(func, x, out, backward_func=None):
-    """
-    """
-
-    class PyFuncRegister(object):
-        _main_program_to_register = dict()
-
-        @classmethod
-        def get_instance(cls, prog):
-            if not isinstance(prog, Program):
-                raise TypeError("prog must be type of Program")
-
-            ret = cls._main_program_to_register.get(prog, None)
-            if ret is None:
-                ret = PyFuncRegister()
-                ret._idx = core.append_python_callable_object_and_return_id(ret)
-                ret._token_func_dict = dict()
-                ret._func_token_dict = dict()
-                cls._main_program_to_register[prog] = ret
-
-            return ret
-
-        @property
-        def handle_idx(self):
-            return self._idx
-
-        def unique_token(self, func):
-            return self._register_func(func)
-
-        def _register_func(self, func):
-            if func is None:
-                raise ValueError("func cannot be None")
-
-            token = self._func_token_dict.get(func, None)
-            if token is not None:
-                return token
-
-            token = unique_name.generate('py_func_op_token')
-            self._token_func_dict[token] = func
-            self._func_token_dict[func] = token
-            return token
-
-        def __call__(self, token, *args):
-            func = self._token_func_dict.get(token, None)
-            if func is None:
-                raise ValueError("func has not been registered")
-
-            arg_list = inspect.getargspec(func)
-            kwargs = dict()
-            idx = 0
-            for arg in arg_list[0]:
-                kwargs[arg] = args[idx]
-                idx += 1
-
-            args = args[idx:]
-            ret0 = func(*args, **kwargs)
-            if ret0 is None:
-                return None
-
-            if not isinstance(ret0, (list, tuple)):
-                ret0 = (ret0, )
-
-            ret = []
-            for i in six.moves.range(len(ret0)):
-                if ret0[i] is None:
-                    ret.append(None)
-                    continue
-
-                if isinstance(ret0[i], core.LoDTensor):
-                    ret.append(ret0[i])
-                    continue
+class PyFuncWrapper(object):
+    _register_funcs = []
+
+    def __init__(self, func):
+        if func is None or not hasattr(func, '__call__'):
+            raise TypeError('func must be a Python function')
+
+        self._func = func
+        # find named args using reflection
+        self._named_args = inspect.getargspec(self._func)[0]
+        self._id = core.append_python_callable_object_and_return_id(self)
+        '''
+        Why record self here?
+
+        1. For debug usage. Users can call
+           :code:`py_func.registered_func(idx)` method
+           to find the registered function coresponding
+           to :code:`idx`.
+
+        2. For increasing reference count of self.
+           It seems that to release Python object
+           whose reference count is 1 would cause
+           segmentation fault error in C++ side.
+           May be lack of Python GC in C++ side?
+        '''
+        PyFuncWrapper._register_funcs.append(self)
+
+    @classmethod
+    def registered_func(cls, idx):
+        return cls._register_funcs[idx]._func
+
+    @classmethod
+    def registered_func_num(cls):
+        return len(cls._register_funcs)
+
+    @property
+    def id(self):
+        return self._id
+
+    def __call__(self, *args):
+        kwargs = dict()
+        idx = 0
+        for arg in self._named_args:
+            kwargs[arg] = args[idx]
+            idx += 1
+
+        ret0 = self._func(*args[idx:], **kwargs)
+        if ret0 is None:
+            return None
+
+        if not isinstance(ret0, (list, tuple)):
+            ret0 = (ret0, )
+
+        ret = []
+        for i in six.moves.range(len(ret0)):
+            if ret0[i] is None:
+                ret.append(None)
+                continue
+
+            if isinstance(ret0[i], core.LoDTensor):
+                ret.append(ret0[i])
+                continue
 
-                if isinstance(ret0[i], np.ndarray):
-                    r = ret0[i]
-                else:
-                    r = np.array(ret0[i])
+            if isinstance(ret0[i], np.ndarray):
+                r = ret0[i]
+            else:
+                r = np.array(ret0[i])
 
-                t = core.LoDTensor()
-                t.set(r, core.CPUPlace())
-                ret.append(t)
+            t = core.LoDTensor()
+            t.set(r, core.CPUPlace())
+            ret.append(t)
 
-            return tuple(ret)
+        return tuple(ret)
 
+
+@templatedoc()
+def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None):
+    """
+    PyFunc Operator.
+
+    User can use :code:`py_func` to register operators in Python side.
+    The inputs of :code:`func` is :code:`LoDTensor` and outputs can be
+    numpy array or :code:`LoDTensor`. Paddle would call the registered
+    :code:`func` in forward part, and call :code:`backward_func` in
+    backward part (if :code:`backward_func` is not None).
+
+    User should set the right data type and shape of :code:`out` before
+    calling this function. However, data types and shapes of gradients of
+    :code:`out` and :code:`x` would be infered automatically.
+
+    The orders of inputs of :code:`backward_func` would be: forward input
+    :code:`x`, forward output :code:`out` and backward input gradient of
+    :code:`out`. If some variables of :code:`out` have no gradient, the input
+    tensor would be None in Python side. If some variables of :code:`in` have
+    no gradient, users should return None.
+
+    Args:
+        func (callable): forward Python function.
+        x (Variable|list(Variable)|tuple(Variable)): inputs of :code:`func`.
+        out (Variable|list(Variable)|tuple(Variable)): outputs of :code:`func`.
+            Paddle cannot infer shapes and data types of :code:`out`. Users
+            should create :code:`out` beforehand.
+        backward_func (callable|None): backward Python function.
+                                       None means no backward. Default None.
+        skip_vars_in_backward_input (Variable|list(Variable)|tuple(Variable)):
+            Variables that are not needed in :code:`backward_func` inputs.
+            These variables must be any of :code:`x` and :code:`out`.
+            If set, these vars would not be inputs of :code:`backward_func`,
+            Only useful when :code:`backward_func` is not None. Default None.
+
+    Returns:
+        out (Variable|list(Variable)|tuple(Variable)): input :code:`out`
+    """
     helper = LayerHelper('py_func', **locals())
-    if isinstance(x, Variable):
+    if x is None:
+        x = []
+    elif isinstance(x, Variable):
         x = [x]
     elif not isinstance(x, (list, tuple)):
         raise TypeError('Input must be Variable/list(Variable)/tuple(Variable)')
 
-    if isinstance(out, Variable):
+    if out is None:
+        out_list = []
+    elif isinstance(out, Variable):
         out_list = [out]
-    else:
+    elif isinstance(out, (list, tuple)):
         out_list = out
+    else:
+        raise TypeError(
+            'Output must be Variable/list(Variable)/tuple(Variable)')
+
+    if func is None or not hasattr(func, '__call__'):
+        raise TypeError('Input func must be a function')
+
+    if backward_func is not None and not hasattr(backward_func, '__call__'):
+        raise TypeError('Input backward_func must be a function')
+
+    fwd_func_id = PyFuncWrapper(func).id
+    bwd_func_id = PyFuncWrapper(
+        backward_func).id if backward_func is not None else -1
 
     for each_out in out_list:
         if len(each_out.shape) == 0:
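
PyFuncWrapper.__call__ dispatches the positional LoDTensor arguments onto the wrapped function's named parameters, discovered once via reflection in __init__; anything beyond the named parameters is passed through as *args. A standalone sketch of that dispatch pattern (the demo function and names are mine):

import inspect

def demo(a, b):
    return a + b

named = inspect.getargspec(demo)[0]          # ['a', 'b'], found via reflection
args = (1, 2)
kwargs = {name: args[i] for i, name in enumerate(named)}
print(demo(*args[len(named):], **kwargs))    # prints 3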
@@ -9192,18 +9228,34 @@ def py_func(func, x, out, backward_func=None):
                 'Output shapes of py_func op should be provided by users manually')
 
-    py_func_reg = PyFuncRegister.get_instance(helper.main_program)
-    forward_token = py_func_reg.unique_token(func)
-    backward_token = py_func_reg.unique_token(
-        backward_func) if backward_func is not None else ''
+    backward_skip_vars = set()
+    if backward_func is not None and skip_vars_in_backward_input is not None:
+        if isinstance(skip_vars_in_backward_input, Variable):
+            skip_vars_in_backward_input = [skip_vars_in_backward_input]
+
+        fwd_in_out = [v.name for v in x]
+        fwd_in_out.extend([v.name for v in out_list])
+        fwd_in_out = set(fwd_in_out)
+        backward_skip_vars = set()
+        for v in skip_vars_in_backward_input:
+            if not v.name in fwd_in_out:
+                raise ValueError(
+                    'Variable {} is not found in forward inputs and outputs'
+                    .format(v.name))
+            backward_skip_vars.add(v.name)
 
     helper.append_op(
         type='py_func',
         inputs={'X': x},
         outputs={'Out': out_list},
         attrs={
-            'handle_idx': py_func_reg.handle_idx,
-            'token': forward_token,
-            'backward_token': backward_token
+            'forward_callable_id': fwd_func_id,
+            'backward_callable_id': bwd_func_id,
+            'backward_skip_vars': list(backward_skip_vars)
         })
     return out
+
+
+# For debug usage
+py_func.registered_func = PyFuncWrapper.registered_func
+py_func.registered_func_num = PyFuncWrapper.registered_func_num
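
For reference, a minimal end-to-end usage sketch of the new interface, assuming the 2018-era fluid API shown in this diff; the function and variable names are mine, and program/executor setup is elided:

import numpy as np
import paddle.fluid as fluid

def my_tanh(x):
    # x arrives as a LoDTensor; a numpy result is converted back automatically
    return np.tanh(np.array(x))

def my_tanh_grad(y, dy):
    return np.array(dy) * (1 - np.square(np.array(y)))

x = fluid.layers.data(name='x', shape=[32], dtype='float32')
# `out` must be created beforehand with the right dtype and shape.
out = fluid.default_main_program().current_block().create_var(
    name='my_tanh_out', dtype='float32', shape=x.shape)
fluid.layers.py_func(func=my_tanh, x=x, out=out, backward_func=my_tanh_grad)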
python/paddle/fluid/tests/unittests/test_py_func_op.py (new file, mode 100644)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle.fluid as fluid
import paddle
import unittest
import six
import numpy as np


def tanh(x):
    return np.tanh(x)


def tanh_grad(y, dy):
    return np.array(dy) * (1 - np.square(np.array(y)))


def cross_entropy(logits, labels):
    logits = np.array(logits)
    labels = np.array(labels)
    M = logits.shape[0]
    N = logits.shape[1]
    ret = np.ndarray([M, 1]).astype(logits.dtype)
    for idx in six.moves.range(M):
        ret[idx][0] = -np.log(logits[idx][labels[idx][0]])
    return ret


def cross_entropy_grad(logits, labels, bwd_dout):
    logits = np.array(logits)
    labels = np.array(labels)
    bwd_dout = np.array(bwd_dout)
    M = logits.shape[0]
    N = logits.shape[1]
    dlogits = np.zeros([M, N]).astype(logits.dtype)
    for idx in six.moves.range(M):
        dlogits[idx][labels[idx][0]] = -bwd_dout[idx] / logits[idx][labels[idx][0]]
    return dlogits, None


def simple_fc_net(img, label, use_py_func_op):
    hidden = img
    for idx in range(4):
        hidden = fluid.layers.fc(
            hidden,
            size=200,
            bias_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Constant(value=1.0)))
        if not use_py_func_op:
            hidden = fluid.layers.tanh(hidden)
        else:
            new_hidden = fluid.default_main_program().current_block(
            ).create_var(
                name='hidden_{}'.format(idx),
                dtype='float32',
                shape=hidden.shape)
            hidden = fluid.layers.py_func(
                func=tanh,
                x=hidden,
                out=new_hidden,
                backward_func=tanh_grad,
                skip_vars_in_backward_input=hidden)

    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
    if not use_py_func_op:
        loss = fluid.layers.cross_entropy(input=prediction, label=label)
    else:
        loss = fluid.default_main_program().current_block().create_var(
            name='loss', dtype='float32', shape=[-1, 1])
        fluid.layers.py_func(
            func=cross_entropy,
            x=[prediction, label],
            out=loss,
            backward_func=cross_entropy_grad,
            skip_vars_in_backward_input=loss)
    loss = fluid.layers.mean(loss)
    return loss


def reader():
    for _ in six.moves.range(100):
        yield np.random.random([784]), np.random.random_integers(
            size=[1], low=0, high=9)


def test_main(use_cuda, use_py_func_op):
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return None

    with fluid.program_guard(fluid.Program(), fluid.Program()):
        with fluid.scope_guard(fluid.core.Scope()):
            fluid.default_main_program().random_seed = 1
            fluid.default_startup_program().random_seed = 1
            np.random.seed(1)

            img = fluid.layers.data(name='image', shape=[784], dtype='float32')
            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
            loss = simple_fc_net(img, label, use_py_func_op)
            optimizer = fluid.optimizer.SGD(learning_rate=1e-3)
            optimizer.minimize(loss)

            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
            feeder = fluid.DataFeeder(feed_list=[img, label], place=place)
            r = paddle.batch(reader, batch_size=10)

            exe = fluid.Executor(place)
            exe.run(fluid.default_startup_program())

            ret = []
            for epoch_id in six.moves.range(2):
                for d in r():
                    L, = exe.run(feed=feeder.feed(d), fetch_list=[loss])
                    ret.append(L[0])

    return np.array(ret)


class TestPyFuncOp(unittest.TestCase):
    def test_loss_diff(self):
        losses = []
        for use_cuda in [True, False]:
            for use_py_func_op in [True, False]:
                L = test_main(use_cuda, use_py_func_op)
                if L is not None:
                    losses.append(L)

        for idx in six.moves.range(len(losses) - 1):
            max_diff = np.max(np.abs(losses[idx] - losses[0]))
            self.assertAlmostEqual(max_diff, 0, delta=1e-3)


if __name__ == '__main__':
    unittest.main()
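
As a sanity check outside the commit itself: the analytic formula in tanh_grad, dy * (1 - y**2), can be verified against a central finite difference:

import numpy as np

x, dy, eps = 0.5, 1.0, 1e-6
y = np.tanh(x)
analytic = dy * (1 - np.square(y))   # same formula as tanh_grad above
numeric = (np.tanh(x + eps) - np.tanh(x - eps)) / (2 * eps)
assert abs(analytic - numeric) < 1e-8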