From 8b7c50f49a16456d8e517c349c2cc1133078121b Mon Sep 17 00:00:00 2001
From: guofei <52460041+gfwm2013@users.noreply.github.com>
Date: Thu, 19 Dec 2019 11:16:07 +0800
Subject: [PATCH] Make While Op could run on GPU place and add while_loop
 unittest (#21672)

1. Make while_op accept GPU conditional data
2. Add more complex test cases for while_loop API
---
 .../fluid/operators/controlflow/while_op.cc   |  11 +-
 .../operators/controlflow/while_op_helper.cc  |  20 +-
 .../operators/controlflow/while_op_helper.h   |   2 +
 .../tests/unittests/test_while_loop_op.py     | 180 +++++++++++++++++-
 4 files changed, 198 insertions(+), 15 deletions(-)

diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc
index 0953256b971..2e359412797 100644
--- a/paddle/fluid/operators/controlflow/while_op.cc
+++ b/paddle/fluid/operators/controlflow/while_op.cc
@@ -74,25 +74,26 @@ class WhileOp : public framework::OperatorBase {
     }
     PADDLE_ENFORCE_EQ(step_scopes->size(), 0,
                       "The StepScope should be empty.");
-    PADDLE_ENFORCE(platform::is_cpu_place(cond.place()),
-                   "Condition of while op must in CPU memory.");
+
+    bool cond_data = GetCondData(cond);
     bool is_test = Attr<bool>("is_test");
     auto &skip_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars);
     VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars);
     auto ctx = executor.Prepare(*program, block->ID(), skip_vars);
     if (!is_test) {
-      while (cond.data<bool>()[0]) {
+      while (cond_data) {
         auto &current_scope = scope.NewScope();
         step_scopes->push_back(&current_scope);
         executor.RunPreparedContext(ctx.get(), &current_scope, false, true,
                                     true);
+        cond_data =
+            GetCondData(scope.FindVar(Input(kCondition))->Get<LoDTensor>());
       }
     } else {
       auto &current_scope = scope.NewScope();
       executor.CreateVariables(*program, &current_scope, block->ID());
-      while (cond.data<bool>()[0]) {
+      while (cond_data) {
         for (auto &name : current_scope.LocalVarNames()) {
           auto *var = current_scope.Var(name);
           if (var->IsType<framework::LoDTensor>()) {
@@ -108,6 +109,8 @@
         }
         executor.RunPreparedContext(ctx.get(), &current_scope, false, false,
                                     false);
+        cond_data =
+            GetCondData(scope.FindVar(Input(kCondition))->Get<LoDTensor>());
       }
       scope.DeleteScope(&current_scope);
     }
diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc
index 8f1e3f60927..e9a7dc43828 100644
--- a/paddle/fluid/operators/controlflow/while_op_helper.cc
+++ b/paddle/fluid/operators/controlflow/while_op_helper.cc
@@ -17,9 +17,10 @@
 #include <string>
 #include <unordered_set>
 #include <utility>
-
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/controlflow/op_variant.h"
+#include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/string/string_helper.h"
 namespace paddle {
 namespace operators {
@@ -196,5 +197,22 @@ void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
       &bwd_ops);
 }
 
+// Make while_op able to run on GPU place
+bool GetCondData(const framework::LoDTensor &cond) {
+  if (platform::is_cpu_place(cond.place())) {
+    return cond.data<bool>()[0];
+  }
+  // when platform::is_gpu_place(cond.place()) is true
+  std::unique_ptr<framework::LoDTensor> cpu_cond{new framework::LoDTensor()};
+#ifdef PADDLE_WITH_CUDA
+  framework::TensorCopySync(cond, platform::CPUPlace(), cpu_cond.get());
+#else
+  PADDLE_THROW(platform::errors::PreconditionNotMet(
+      "This version of PaddlePaddle does NOT support GPU but got GPU tensor "
+      "Cond in WhileOp. Please compile WITH_GPU option"));
Please compile WITH_GPU option")); +#endif + return cpu_cond->data()[0]; +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/controlflow/while_op_helper.h b/paddle/fluid/operators/controlflow/while_op_helper.h index e2cfece6580..4f9d93c91f6 100644 --- a/paddle/fluid/operators/controlflow/while_op_helper.h +++ b/paddle/fluid/operators/controlflow/while_op_helper.h @@ -40,5 +40,7 @@ void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( const std::vector &while_ops, const std::vector &while_grad_ops); +bool GetCondData(const framework::LoDTensor &cond); + } // namespace operators } // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_while_loop_op.py b/python/paddle/fluid/tests/unittests/test_while_loop_op.py index 78ceb5250cf..dc74f650c5e 100644 --- a/python/paddle/fluid/tests/unittests/test_while_loop_op.py +++ b/python/paddle/fluid/tests/unittests/test_while_loop_op.py @@ -22,6 +22,7 @@ import paddle.fluid.layers as layers import paddle.fluid.framework as framework from paddle.fluid.executor import Executor from paddle.fluid.framework import Program, program_guard +from paddle.fluid.backward import append_backward class TestApiWhileLoop(unittest.TestCase): @@ -40,7 +41,8 @@ class TestApiWhileLoop(unittest.TestCase): ten = layers.fill_constant(shape=[1], dtype='int64', value=10) out = layers.while_loop(cond, body, (i, )) - place = fluid.CPUPlace() + place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() exe = fluid.Executor(place) res = exe.run(main_program, fetch_list=out) self.assertTrue( @@ -60,14 +62,19 @@ class TestApiWhileLoop(unittest.TestCase): with program_guard(main_program, startup_program): i = layers.zeros(shape=[1], dtype='int64') ten = layers.fill_constant(shape=[1], dtype='int64', value=10) - mem = layers.data(name="mem", shape=[10], dtype='float32') + mem = layers.data( + name='mem', + shape=[10], + dtype='float32', + append_batch_size=False) one = layers.fill_constant(shape=[10], dtype='float32', value=1) out = layers.while_loop(cond, body, [i, mem]) data = np.random.rand(10).astype('float32') data_one = np.ones(10).astype('float32') - place = fluid.CPUPlace() + place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() exe = fluid.Executor(place) res = exe.run(main_program, feed={'mem': data}, fetch_list=out) for i in range(10): @@ -104,30 +111,183 @@ class TestApiWhileLoop_Nested(unittest.TestCase): with program_guard(main_program, startup_program): i = layers.zeros(shape=[1], dtype='int64') j = layers.zeros(shape=[1], dtype='int64') - init = layers.data(name="init", shape=[3, 3], dtype='float32') - sums = layers.data(name="sums", shape=[3, 3], dtype='float32') + init = layers.data( + name='init', + shape=[3, 3], + dtype='float32', + append_batch_size=False) + sums = layers.data( + name='sums', + shape=[3, 3], + dtype='float32', + append_batch_size=False) loop_len1 = layers.fill_constant(shape=[1], dtype='int64', value=2) loop_len2 = layers.fill_constant(shape=[1], dtype='int64', value=3) ones = layers.fill_constant(shape=[3, 3], dtype='float32', value=1) - res = layers.while_loop(external_cond, external_body, + out = layers.while_loop(external_cond, external_body, [i, j, init, sums]) data = np.random.rand(3, 3).astype('float32') data_sums = np.zeros([3, 3]).astype('float32') - place = fluid.CPUPlace() + place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() exe = fluid.Executor(place) - ret = exe.run(main_program, + res = 
exe.run(main_program, feed={'init': data, 'sums': data_sums}, - fetch_list=res) + fetch_list=out) for i in range(3): data = np.add(data, 1) data_sums = np.add(data, data_sums) for j in range(2): data_sums = np.add(data, data_sums) - self.assertTrue(np.allclose(np.asarray(ret[3]), data_sums)) + self.assertTrue(np.allclose(np.asarray(res[3]), data_sums)) + + +class TestApiWhileLoop_Backward(unittest.TestCase): + def test_while_loop_backward(self): + def cond(i, x): + return layers.less_than(i, eleven) + + def body(i, x): + x = layers.elementwise_mul(x=i, y=i) + i = layers.increment(i) + return [i, x] + + main_program = Program() + startup_program = Program() + with fluid.program_guard(main_program, startup_program): + i = layers.data( + name='i', shape=[1], dtype='float32', append_batch_size=False) + i.stop_gradient = False + eleven = layers.fill_constant(shape=[1], dtype='float32', value=11) + one = layers.fill_constant(shape=[1], dtype='float32', value=1) + x = layers.data( + name='x', shape=[1], dtype='float32', append_batch_size=False) + x.stop_gradient = False + + out = layers.while_loop(cond, body, [i, x]) + mean = layers.mean(out[1]) + append_backward(mean) + + place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + exe = fluid.Executor(place) + + feed_i = np.ones(1).astype('float32') + feed_x = np.ones(1).astype('float32') + data = np.asarray([100]).astype('float32') + i_grad = np.asarray([110]).astype('float32') + + res = exe.run(main_program, + feed={'i': feed_i, + 'x': feed_x}, + fetch_list=[mean.name, i.grad_name]) + self.assertTrue(np.allclose(np.asarray(res[0]), data)) + self.assertTrue(np.allclose(np.asarray(res[1]), i_grad)) + + +class TestApiWhileLoop_NestedWithBackward(unittest.TestCase): + def test_nested_net_with_backward(self): + def external_cond(i, x, y): + return layers.less_than(i, ten) + + def external_body(i, x, y): + def internal_cond(i, x, y): + return layers.less_than(i, five) + + def internal_body(i, x, y): + x = layers.elementwise_add(x=i, y=i) + i = layers.increment(i) + return [i, x, y] + + temp = layers.while_loop(internal_cond, internal_body, [i, x, y]) + y = layers.elementwise_add(x=temp[1], y=i) + i = layers.increment(i) + return [i, x, y] + + main_program = Program() + startup_program = Program() + + with fluid.program_guard(main_program, startup_program): + i = layers.data( + name='i', shape=[1], dtype='float32', append_batch_size=False) + i.stop_gradient = False + ten = layers.fill_constant(shape=[1], dtype='float32', value=10) + five = layers.fill_constant(shape=[1], dtype='float32', value=5) + x = layers.data( + name='x', shape=[1], dtype='float32', append_batch_size=False) + x.stop_gradient = False + y = layers.data( + name='y', shape=[1], dtype='float32', append_batch_size=False) + y.stop_gradient = False + out = layers.while_loop(external_cond, external_body, [i, x, y]) + + mean = layers.mean(out[2]) + append_backward(mean) + + place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + exe = fluid.Executor(place) + + data = np.asarray([17]).astype('float32') + feed_x = np.zeros(1).astype('float32') + feed_i = np.ones(1).astype('float32') + feed_y = np.zeros(1).astype('float32') + i_grad = np.asarray(13).astype('int32') + + res = exe.run(main_program, + feed={'i': feed_i, + 'x': feed_x, + 'y': feed_y}, + fetch_list=[mean.name, i.grad_name]) + + self.assertTrue(np.allclose(np.asarray(res[0]), data)) + self.assertTrue(np.allclose(np.asarray(res[1]), i_grad)) + + +class 
TestApiWhileLoopWithSwitchCase(unittest.TestCase): + def test_with_switch_case(self): + def cond(i): + return layers.less_than(i, ten) + + def body(i): + def fn_add_three(): + data_add_three = layers.elementwise_add(x=i, y=three) + return data_add_three + + def fn_square(): + data_mul_data = layers.elementwise_mul(x=i, y=i) + return data_mul_data + + def fn_add_one(): + data_add_one = layers.elementwise_add(x=i, y=one) + return data_add_one + + return layers.switch_case( + branch_index=i, + branch_fns={2: fn_add_three, + 5: fn_square}, + default=fn_add_one) + + main_program = Program() + startup_program = Program() + with fluid.program_guard(main_program, startup_program): + i = layers.fill_constant(shape=[1], dtype='int64', value=1) + ten = layers.fill_constant(shape=[1], dtype='int64', value=10) + three = layers.fill_constant(shape=[1], dtype='int64', value=3) + one = layers.fill_constant(shape=[1], dtype='int64', value=1) + out = layers.while_loop(cond, body, [i]) + + place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + exe = fluid.Executor(place) + res = exe.run(main_program, fetch_list=out) + + data = np.asarray([25]).astype('int64') + self.assertTrue(np.allclose(np.asarray(res[0]), data)) class TestApiWhileLoop_Error(unittest.TestCase): -- GitLab
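
A minimal usage sketch (not part of the patch above): it mirrors the simplest test case in the diff, building a fluid.layers.while_loop and running it on fluid.CUDAPlace(0) when CUDA is available, which is exactly the situation the new GetCondData() helper handles by copying the GPU-side condition tensor to CPU memory. The variable name `limit` is illustrative only, and the snippet assumes the paddle.fluid 1.x API used throughout this patch.

# Minimal sketch (assumes the paddle.fluid 1.x API used in this patch).
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.layers as layers


def cond(i):
    # The boolean condition produced here may live on the GPU; after this
    # change WhileOp copies it to CPU internally instead of rejecting it.
    return layers.less_than(i, limit)


def body(i):
    return [layers.increment(i)]


main_program = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(main_program, startup_program):
    i = layers.zeros(shape=[1], dtype='int64')
    limit = layers.fill_constant(shape=[1], dtype='int64', value=10)
    out = layers.while_loop(cond, body, [i])

# Run on GPU when available, otherwise fall back to CPU.
place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
exe = fluid.Executor(place)
res = exe.run(main_program, fetch_list=out)
assert np.allclose(np.asarray(res[0]), np.asarray([10]))  # final i == 10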