Make while_op runnable on GPU place and add while_loop unit tests (#21672)

1. Make while_op accept a condition tensor that resides in GPU memory. 2. Add more complex test cases for the while_loop API.

Make while_op runnable on GPU place and add while_loop unit tests (#21672)
1. Make while_op accept a condition tensor that resides in GPU memory. 2. Add more complex test cases for the while_loop API.
8b7c50f4 · guofei · Huihuang Zheng · 17299b8d · 8b7c50f4 · 8b7c50f4
4 changed files
--- a/paddle/fluid/operators/controlflow/while_op.cc
+++ b/paddle/fluid/operators/controlflow/while_op.cc
@@ -74,25 +74,26 @@ class WhileOp : public framework::OperatorBase {
    }

    PADDLE_ENFORCE_EQ(step_scopes->size(), 0, "The StepScope should be empty.");
-    PADDLE_ENFORCE(platform::is_cpu_place(cond.place()),
-                   "Condition of while op must in CPU memory.");

+    bool cond_data = GetCondData(cond);
    bool is_test = Attr<bool>("is_test");
    auto &skip_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars);
    VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars);

    auto ctx = executor.Prepare(*program, block->ID(), skip_vars);
    if (!is_test) {
-      while (cond.data<bool>()[0]) {
+      while (cond_data) {
        auto &current_scope = scope.NewScope();
        step_scopes->push_back(&current_scope);
        executor.RunPreparedContext(ctx.get(), &current_scope, false, true,
                                    true);
+        cond_data =
+            GetCondData(scope.FindVar(Input(kCondition))->Get<LoDTensor>());
      }
    } else {
      auto &current_scope = scope.NewScope();
      executor.CreateVariables(*program, &current_scope, block->ID());
-      while (cond.data<bool>()[0]) {
+      while (cond_data) {
        for (auto &name : current_scope.LocalVarNames()) {
          auto *var = current_scope.Var(name);
          if (var->IsType<framework::LoDTensor>()) {
@@ -108,6 +109,8 @@ class WhileOp : public framework::OperatorBase {
        }
        executor.RunPreparedContext(ctx.get(), &current_scope, false, false,
                                    false);
+        cond_data =
+            GetCondData(scope.FindVar(Input(kCondition))->Get<LoDTensor>());
      }
      scope.DeleteScope(&current_scope);
    }

--- a/paddle/fluid/operators/controlflow/while_op_helper.cc
+++ b/paddle/fluid/operators/controlflow/while_op_helper.cc
@@ -17,9 +17,10 @@
 #include <string>
 #include <unordered_set>
 #include <utility>
-
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/controlflow/op_variant.h"
+#include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/string/string_helper.h"

 namespace paddle {
@@ -196,5 +197,22 @@ void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
                                                      &bwd_ops);
 }

+// Returns cond[0] as a host bool; a non-CPU cond is first copied to the CPU.
+bool GetCondData(const framework::LoDTensor &cond) {
+  if (platform::is_cpu_place(cond.place())) {
+    return cond.data<bool>()[0];  // already host-readable, no copy needed
+  }
+  // cond is not on a CPU place (expected: GPU); stage it in a host tensor.
+  framework::LoDTensor cpu_cond;  // stack temporary, no heap allocation needed
+#ifdef PADDLE_WITH_CUDA
+  framework::TensorCopySync(cond, platform::CPUPlace(), &cpu_cond);
+#else
+  PADDLE_THROW(platform::errors::PreconditionNotMet(
+      "This version of PaddlePaddle does NOT support GPU but got GPU tensor "
+      "Cond in WhileOp. Please compile WITH_GPU option"));
+#endif
+  return cpu_cond.data<bool>()[0];  // only reached when built with CUDA
+}
+
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/controlflow/while_op_helper.h
+++ b/paddle/fluid/operators/controlflow/while_op_helper.h
@@ -40,5 +40,7 @@ void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
    const std::vector<framework::OperatorBase *> &while_ops,
    const std::vector<framework::OperatorBase *> &while_grad_ops);

+bool GetCondData(const framework::LoDTensor &cond);
+
 }  // namespace operators
 }  // namespace paddle
--- a/python/paddle/fluid/tests/unittests/test_while_loop_op.py
+++ b/python/paddle/fluid/tests/unittests/test_while_loop_op.py
@@ -22,6 +22,7 @@ import paddle.fluid.layers as layers
 import paddle.fluid.framework as framework
 from paddle.fluid.executor import Executor
 from paddle.fluid.framework import Program, program_guard
+from paddle.fluid.backward import append_backward


 class TestApiWhileLoop(unittest.TestCase):
@@ -40,7 +41,8 @@ class TestApiWhileLoop(unittest.TestCase):
            ten = layers.fill_constant(shape=[1], dtype='int64', value=10)
            out = layers.while_loop(cond, body, (i, ))

-        place = fluid.CPUPlace()
+        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
        exe = fluid.Executor(place)
        res = exe.run(main_program, fetch_list=out)
        self.assertTrue(
@@ -60,14 +62,19 @@ class TestApiWhileLoop(unittest.TestCase):
        with program_guard(main_program, startup_program):
            i = layers.zeros(shape=[1], dtype='int64')
            ten = layers.fill_constant(shape=[1], dtype='int64', value=10)
-            mem = layers.data(name="mem", shape=[10], dtype='float32')
+            mem = layers.data(
+                name='mem',
+                shape=[10],
+                dtype='float32',
+                append_batch_size=False)
            one = layers.fill_constant(shape=[10], dtype='float32', value=1)
            out = layers.while_loop(cond, body, [i, mem])

            data = np.random.rand(10).astype('float32')
            data_one = np.ones(10).astype('float32')

-        place = fluid.CPUPlace()
+        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
        exe = fluid.Executor(place)
        res = exe.run(main_program, feed={'mem': data}, fetch_list=out)
        for i in range(10):
@@ -104,30 +111,183 @@ class TestApiWhileLoop_Nested(unittest.TestCase):
        with program_guard(main_program, startup_program):
            i = layers.zeros(shape=[1], dtype='int64')
            j = layers.zeros(shape=[1], dtype='int64')
-            init = layers.data(name="init", shape=[3, 3], dtype='float32')
-            sums = layers.data(name="sums", shape=[3, 3], dtype='float32')
+            init = layers.data(
+                name='init',
+                shape=[3, 3],
+                dtype='float32',
+                append_batch_size=False)
+            sums = layers.data(
+                name='sums',
+                shape=[3, 3],
+                dtype='float32',
+                append_batch_size=False)
            loop_len1 = layers.fill_constant(shape=[1], dtype='int64', value=2)
            loop_len2 = layers.fill_constant(shape=[1], dtype='int64', value=3)
            ones = layers.fill_constant(shape=[3, 3], dtype='float32', value=1)

-            res = layers.while_loop(external_cond, external_body,
+            out = layers.while_loop(external_cond, external_body,
                                    [i, j, init, sums])

            data = np.random.rand(3, 3).astype('float32')
            data_sums = np.zeros([3, 3]).astype('float32')

-        place = fluid.CPUPlace()
+        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
        exe = fluid.Executor(place)
-        ret = exe.run(main_program,
+        res = exe.run(main_program,
                      feed={'init': data,
                            'sums': data_sums},
-                      fetch_list=res)
+                      fetch_list=out)
        for i in range(3):
            data = np.add(data, 1)
            data_sums = np.add(data, data_sums)
        for j in range(2):
            data_sums = np.add(data, data_sums)
-        self.assertTrue(np.allclose(np.asarray(ret[3]), data_sums))
+        self.assertTrue(np.allclose(np.asarray(res[3]), data_sums))
+
+
+class TestApiWhileLoop_Backward(unittest.TestCase):  # while_loop followed by append_backward
+    def test_while_loop_backward(self):
+        def cond(i, x):
+            return layers.less_than(i, eleven)  # keep looping while i < eleven
+
+        def body(i, x):
+            x = layers.elementwise_mul(x=i, y=i)  # incoming x is overwritten with i * i
+            i = layers.increment(i)
+            return [i, x]
+
+        main_program = Program()
+        startup_program = Program()
+        with fluid.program_guard(main_program, startup_program):
+            i = layers.data(
+                name='i', shape=[1], dtype='float32', append_batch_size=False)
+            i.stop_gradient = False  # the gradient of i is fetched below
+            eleven = layers.fill_constant(shape=[1], dtype='float32', value=11)
+            one = layers.fill_constant(shape=[1], dtype='float32', value=1)  # NOTE(review): `one` is never referenced in this test
+            x = layers.data(
+                name='x', shape=[1], dtype='float32', append_batch_size=False)
+            x.stop_gradient = False
+
+            out = layers.while_loop(cond, body, [i, x])
+            mean = layers.mean(out[1])  # out[1] is the loop-carried x
+            append_backward(mean)
+
+        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()  # CUDA device 0 when compiled with CUDA, otherwise CPU
+        exe = fluid.Executor(place)
+
+        feed_i = np.ones(1).astype('float32')
+        feed_x = np.ones(1).astype('float32')
+        data = np.asarray([100]).astype('float32')  # expected value of `mean`
+        i_grad = np.asarray([110]).astype('float32')  # expected value fetched via i.grad_name
+
+        res = exe.run(main_program,
+                      feed={'i': feed_i,
+                            'x': feed_x},
+                      fetch_list=[mean.name, i.grad_name])
+        self.assertTrue(np.allclose(np.asarray(res[0]), data))
+        self.assertTrue(np.allclose(np.asarray(res[1]), i_grad))
+
+
+class TestApiWhileLoop_NestedWithBackward(unittest.TestCase):  # nested while_loop followed by append_backward
+    def test_nested_net_with_backward(self):
+        def external_cond(i, x, y):
+            return layers.less_than(i, ten)  # outer loop runs while i < ten
+
+        def external_body(i, x, y):
+            def internal_cond(i, x, y):
+                return layers.less_than(i, five)  # inner loop runs while i < five
+
+            def internal_body(i, x, y):
+                x = layers.elementwise_add(x=i, y=i)  # incoming x is overwritten with i + i
+                i = layers.increment(i)
+                return [i, x, y]  # y is passed through unchanged
+
+            temp = layers.while_loop(internal_cond, internal_body, [i, x, y])
+            y = layers.elementwise_add(x=temp[1], y=i)  # temp[1] is the inner loop's x
+            i = layers.increment(i)
+            return [i, x, y]
+
+        main_program = Program()
+        startup_program = Program()
+
+        with fluid.program_guard(main_program, startup_program):
+            i = layers.data(
+                name='i', shape=[1], dtype='float32', append_batch_size=False)
+            i.stop_gradient = False  # the gradient of i is fetched below
+            ten = layers.fill_constant(shape=[1], dtype='float32', value=10)
+            five = layers.fill_constant(shape=[1], dtype='float32', value=5)
+            x = layers.data(
+                name='x', shape=[1], dtype='float32', append_batch_size=False)
+            x.stop_gradient = False
+            y = layers.data(
+                name='y', shape=[1], dtype='float32', append_batch_size=False)
+            y.stop_gradient = False
+            out = layers.while_loop(external_cond, external_body, [i, x, y])
+
+            mean = layers.mean(out[2])  # out[2] is the loop-carried y
+            append_backward(mean)
+
+        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()  # CUDA device 0 when compiled with CUDA, otherwise CPU
+        exe = fluid.Executor(place)
+
+        data = np.asarray([17]).astype('float32')  # expected value of `mean`
+        feed_x = np.zeros(1).astype('float32')
+        feed_i = np.ones(1).astype('float32')
+        feed_y = np.zeros(1).astype('float32')
+        i_grad = np.asarray(13).astype('int32')  # NOTE(review): 0-d int32, while the fed tensors are float32 of shape [1] -- confirm intended
+
+        res = exe.run(main_program,
+                      feed={'i': feed_i,
+                            'x': feed_x,
+                            'y': feed_y},
+                      fetch_list=[mean.name, i.grad_name])
+
+        self.assertTrue(np.allclose(np.asarray(res[0]), data))
+        self.assertTrue(np.allclose(np.asarray(res[1]), i_grad))
+
+
+class TestApiWhileLoopWithSwitchCase(unittest.TestCase):  # while_loop whose body is a switch_case
+    def test_with_switch_case(self):
+        def cond(i):
+            return layers.less_than(i, ten)  # keep looping while i < ten
+
+        def body(i):
+            def fn_add_three():
+                data_add_three = layers.elementwise_add(x=i, y=three)
+                return data_add_three
+
+            def fn_square():
+                data_mul_data = layers.elementwise_mul(x=i, y=i)
+                return data_mul_data
+
+            def fn_add_one():
+                data_add_one = layers.elementwise_add(x=i, y=one)
+                return data_add_one
+
+            return layers.switch_case(  # the loop variable itself is the branch index
+                branch_index=i,
+                branch_fns={2: fn_add_three,
+                            5: fn_square},
+                default=fn_add_one)
+
+        main_program = Program()
+        startup_program = Program()
+        with fluid.program_guard(main_program, startup_program):
+            i = layers.fill_constant(shape=[1], dtype='int64', value=1)
+            ten = layers.fill_constant(shape=[1], dtype='int64', value=10)
+            three = layers.fill_constant(shape=[1], dtype='int64', value=3)
+            one = layers.fill_constant(shape=[1], dtype='int64', value=1)
+            out = layers.while_loop(cond, body, [i])
+
+        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()  # CUDA device 0 when compiled with CUDA, otherwise CPU
+        exe = fluid.Executor(place)
+        res = exe.run(main_program, fetch_list=out)
+
+        data = np.asarray([25]).astype('int64')  # presumably i goes 1 -> 2 (default) -> 5 (key 2) -> 25 (key 5), then 25 >= 10 stops
+        self.assertTrue(np.allclose(np.asarray(res[0]), data))


 class TestApiWhileLoop_Error(unittest.TestCase):