[NPU] add beam_search npu op (#34860)

* add beam_search npu op
* fix CMakeList and add unittest
* fix bug of beam search npu op
* fix unittest
* let input ids become int64
* set output ids to int64_t
* delete check_dygraph
* fix beam_width=1

[NPU] add beam_search npu op (#34860)
* add beam_search npu op
* fix CMakeList and add unittest
* fix bug of beam search npu op
* fix unittest
* let input ids become int64
* set output ids to int64_t
* delete check_dygraph
* fix beam_width=1
3760be06 · pangyoki · GitHub · 9f588cc2 · 3760be06 · 3760be06
6 changed files
--- a/paddle/fluid/operators/beam_search_op.h
+++ b/paddle/fluid/operators/beam_search_op.h
@@ -51,11 +51,11 @@ class BeamSearchOpKernel : public framework::OpKernel<T> {
    PADDLE_ENFORCE_NOT_NULL(
        selected_ids,
        platform::errors::NotFound(
-            "Output(selected_scores) of BeamSearchOp is not found."));
+            "Output(selected_ids) of BeamSearchOp is not found."));
    PADDLE_ENFORCE_NOT_NULL(
        selected_scores,
        platform::errors::NotFound(
-            "Output(parent_idx) of BeamSearchOp is not found."));
+            "Output(selected_scores) of BeamSearchOp is not found."));

    math::BeamSearchFunctor<DeviceContext, T> alg;
    alg(context.template device_context<DeviceContext>(), pre_ids, pre_scores,

--- a/paddle/fluid/operators/beam_search_op_npu.cc
+++ b/paddle/fluid/operators/beam_search_op_npu.cc
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/beam_search_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_NPU_KERNEL(
+    beam_search,
+    ops::BeamSearchOpKernel<paddle::platform::NPUDeviceContext, float>,
+    ops::BeamSearchOpKernel<paddle::platform::NPUDeviceContext, double>,
+    ops::BeamSearchOpKernel<paddle::platform::NPUDeviceContext, int>,
+    ops::BeamSearchOpKernel<paddle::platform::NPUDeviceContext, int64_t>);
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -39,6 +39,10 @@ function(math_library TARGET)
    endif()
 endfunction()

+if (WITH_ASCEND_CL)
+  cc_library(beam_search_npu SRCS beam_search_npu.cc DEPS npu_op_runner)
+endif()
+
 # please add new math_library in alphabetical order
 math_library(concat_and_split)
 math_library(context_project DEPS im2col math_function)
@@ -68,7 +72,11 @@ math_library(sequence_padding)
 math_library(sequence_pooling DEPS math_function jit_kernel_helper)
 math_library(sequence_scale)
 math_library(softmax DEPS math_function jit_kernel_helper)
-math_library(beam_search DEPS math_function)
+if (WITH_ASCEND_CL)
+    math_library(beam_search DEPS math_function beam_search_npu)
+else()
+    math_library(beam_search DEPS math_function)
+endif()
 math_library(fc DEPS blas)

 math_library(matrix_bit_code)

--- a/paddle/fluid/operators/math/beam_search_npu.cc
+++ b/paddle/fluid/operators/math/beam_search_npu.cc
--- a/python/paddle/fluid/tests/unittests/npu/test_beam_search_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_beam_search_op_npu.py
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import unittest
+import numpy as np
+import paddle.fluid as fluid
+
+paddle.enable_static()
+
+
+class TestBeamSearchNPUOp(OpTest):
+    def setUp(self):
+        self.set_npu()
+        self.place = paddle.NPUPlace(0)
+        self.op_type = "beam_search"
+        self.init_data()
+        self.inputs = {
+            'pre_ids': (self.pre_ids, self.lod),
+            'pre_scores': (self.pre_score, self.lod),
+            'ids': (self.ids, self.lod),
+            'scores': (self.score, self.lod)
+        }
+        # Operator attributes shared by every case; `end_id` is always 0
+        self.attrs = {
+            'level': 0,
+            'beam_size': self.beam_size,
+            'end_id': 0,
+            'is_accumulated': self.is_accumulated
+        }
+        self.outputs = {
+            'selected_ids': (self.selected_ids, self.out_lod),
+            'selected_scores': (self.selected_scores, self.out_lod),
+            'parent_idx': self.parent_idx
+        }
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+
+    def init_data(self):
+        self.beam_size = 2
+        self.is_accumulated = True
+        self.pre_ids = np.array([[1], [2], [3], [4]], dtype='int64')
+        self.ids = np.array(
+            [[4, 2, 5], [2, 1, 3], [3, 5, 2], [8, 2, 1]], dtype='int64')
+        self.lod = [[2, 2], [1, 1, 1, 1]]
+        self.out_lod = [[2, 2], [1, 1, 1, 1]]
+        self.offset_lod = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        self.score = np.array(
+            [
+                [0.5, 0.3, 0.2],
+                [0.6, 0.3, 0.1],
+                [0.9, 0.5, 0.1],
+                [0.7, 0.5, 0.1],
+            ],
+            dtype='float32')
+        self.pre_score = np.array([[0.1], [0.2], [0.3], [0.4]], dtype='float32')
+        self.selected_ids = np.array([4, 2, 3, 8])[:, np.newaxis]
+        self.selected_scores = np.array([0.5, 0.6, 0.9, 0.7])[:, np.newaxis]
+        self.parent_idx = np.array([0, 1, 2, 3])
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, atol=1e-3)
+
+
+class TestBeamSearchNPUOp2(TestBeamSearchNPUOp):
+    def init_data(self):
+        self.beam_size = 2
+        self.is_accumulated = True
+        self.pre_ids = np.array([[1], [2], [3], [4]], dtype='int64')
+        self.ids = np.array([[4, 2], [7, 3], [3, 5], [8, 1]], dtype='int64')
+        self.lod = [[2, 2], [1, 1, 1, 1]]
+        self.out_lod = [[2, 2], [2, 0, 1, 1]]
+        self.offset_lod = [[0, 2, 4], [0, 2, 2, 3, 4]]
+        self.score = np.array(
+            [
+                [0.6, 0.9],
+                [0.5, 0.3],
+                [0.9, 0.5],
+                [0.1, 0.7],
+            ], dtype='float32')
+        self.pre_score = np.array([[0.1], [0.2], [0.3], [0.4]], dtype='float32')
+        self.selected_ids = np.array([4, 2, 3, 1])[:, np.newaxis]
+        self.selected_scores = np.array([0.6, 0.9, 0.9, 0.7])[:, np.newaxis]
+        self.parent_idx = np.array([0, 0, 2, 3])
+
+
+class TestBeamSearchNPUOp3(TestBeamSearchNPUOp):
+    def init_data(self):
+        # end_id = 0
+        self.beam_size = 2
+        self.is_accumulated = True
+        self.pre_ids = np.array([[1], [0], [0], [4]], dtype='int64')
+        self.ids = np.array([[4, 2], [7, 3], [3, 5], [8, 1]], dtype='int64')
+        self.lod = [[2, 2], [1, 1, 1, 1]]
+        self.out_lod = [[2, 2], [1, 1, 0, 2]]
+        self.offset_lod = [[0, 2, 4], [0, 1, 2, 2, 4]]
+        self.score = np.array(
+            [
+                [0.6, 0.9],
+                [0.5, 0.3],
+                [0.9, 0.5],
+                [0.6, 0.7],
+            ], dtype='float32')
+        self.pre_score = np.array([[0.1], [1.2], [0.5], [0.4]], dtype='float32')
+        self.selected_ids = np.array([2, 0, 8, 1])[:, np.newaxis]
+        self.selected_scores = np.array([0.9, 1.2, 0.6, 0.7])[:, np.newaxis]
+        self.parent_idx = np.array([0, 1, 3, 3])
+
+
+class TestBeamSearchNPUOp4(TestBeamSearchNPUOp):
+    def init_data(self):
+        # is_accumulated = False
+        self.beam_size = 2
+        self.is_accumulated = False
+        self.pre_ids = np.array([[1], [2], [3], [4]], dtype='int64')
+        self.ids = np.array([[4, 2], [7, 3], [3, 5], [8, 1]], dtype='int64')
+        self.lod = [[2, 2], [1, 1, 1, 1]]
+        self.out_lod = [[2, 2], [0, 2, 1, 1]]
+        self.offset_lod = [[0, 2, 4], [0, 0, 2, 3, 4]]
+        self.score = np.array(
+            [
+                [0.6, 0.9],
+                [0.5, 0.3],
+                [0.9, 0.5],
+                [0.1, 0.7],
+            ], dtype='float32')
+        self.pre_score = np.array([[0.1], [2.2], [0.3], [0.4]], dtype='float32')
+        self.selected_ids = np.array([7, 3, 3, 1])[:, np.newaxis]
+        self.selected_scores = np.array(
+            [1.50685, 0.996027, 0.194639, 0.043325])[:, np.newaxis]
+        self.parent_idx = np.array([1, 1, 2, 3])
+
+
+class TestBeamSearchNPUOp5(TestBeamSearchNPUOp):
+    def init_data(self):
+        # beam_size = 1
+        self.beam_size = 1
+        self.is_accumulated = True
+        self.pre_ids = np.array([[1], [2], [3], [4]], dtype='int64')
+        self.ids = np.array([[4, 2], [7, 3], [3, 5], [8, 1]], dtype='int64')
+        self.lod = [[1, 1, 1, 1], [1, 1, 1, 1]]
+        self.out_lod = [[1, 1, 1, 1], [1, 1, 1, 1]]
+        self.offset_lod = [[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]]
+        self.score = np.array(
+            [
+                [0.6, 0.9],
+                [0.5, 0.3],
+                [0.9, 0.5],
+                [0.1, 0.7],
+            ], dtype='float32')
+        self.pre_score = np.array([[0.1], [0.2], [0.3], [0.4]], dtype='float32')
+        self.selected_ids = np.array([2, 7, 3, 1])[:, np.newaxis]
+        self.selected_scores = np.array([0.9, 0.5, 0.9, 0.7])[:, np.newaxis]
+        self.parent_idx = np.array([0, 1, 2, 3])
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_beam_search_op.py
+++ b/python/paddle/fluid/tests/unittests/test_beam_search_op.py
@@ -38,6 +38,7 @@ class BeamSearchOpTester(unittest.TestCase):
        self._create_pre_scores()
        self._create_scores()
        self._create_pre_ids()
+        self.set_outputs()
        self.scope.var('selected_ids').get_tensor()
        self.scope.var('selected_scores').get_tensor()
        self.scope.var('parent_idx').get_tensor()
@@ -53,22 +54,19 @@ class BeamSearchOpTester(unittest.TestCase):
            selected_scores='selected_scores',
            parent_idx='parent_idx',
            level=0,
-            beam_size=2,
-            end_id=0, )
+            beam_size=self.beam_size,
+            end_id=0,
+            is_accumulated=self.is_accumulated)
        op.run(self.scope, core.CPUPlace())
        selected_ids = self.scope.find_var("selected_ids").get_tensor()
        selected_scores = self.scope.find_var("selected_scores").get_tensor()
        parent_idx = self.scope.find_var("parent_idx").get_tensor()
+        self.assertTrue(np.allclose(np.array(selected_ids), self.output_ids))
        self.assertTrue(
-            np.allclose(
-                np.array(selected_ids), np.array([4, 2, 3, 8])[:, np.newaxis]))
+            np.allclose(np.array(selected_scores), self.output_scores))
+        self.assertEqual(selected_ids.lod(), self.output_lod)
        self.assertTrue(
-            np.allclose(
-                np.array(selected_scores),
-                np.array([0.5, 0.6, 0.9, 0.7])[:, np.newaxis]))
-        self.assertEqual(selected_ids.lod(), [[0, 2, 4], [0, 1, 2, 3, 4]])
-        self.assertTrue(
-            np.allclose(np.array(parent_idx), np.array([0, 1, 2, 3])))
+            np.allclose(np.array(parent_idx), self.output_parent_idx))

    def _create_pre_ids(self):
        np_data = np.array([[1, 2, 3, 4]], dtype='int64')
@@ -97,6 +95,194 @@ class BeamSearchOpTester(unittest.TestCase):
        tensor = create_tensor(self.scope, "scores", np_data)
        tensor.set_lod(self.lod)

+    def set_outputs(self):
+        self.beam_size = 2
+        self.is_accumulated = True
+        self.output_ids = np.array([4, 2, 3, 8])[:, np.newaxis]
+        self.output_scores = np.array([0.5, 0.6, 0.9, 0.7])[:, np.newaxis]
+        self.output_lod = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        self.output_parent_idx = np.array([0, 1, 2, 3])
+
+
+class BeamSearchOpTester2(BeamSearchOpTester):
+    def _create_pre_ids(self):
+        np_data = np.array([[1], [2], [3], [4]], dtype='int64')
+        tensor = create_tensor(self.scope, 'pre_ids', np_data)
+
+    def _create_pre_scores(self):
+        np_data = np.array([[0.1, 0.2, 0.3, 0.4]], dtype='float32')
+        tensor = create_tensor(self.scope, 'pre_scores', np_data)
+
+    def _create_ids(self):
+        self.lod = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        np_data = np.array([[4, 2], [7, 3], [3, 5], [8, 1]], dtype='int64')
+        tensor = create_tensor(self.scope, "ids", np_data)
+        tensor.set_lod(self.lod)
+
+    def _create_scores(self):
+        np_data = np.array(
+            [
+                [0.6, 0.9],
+                [0.5, 0.3],
+                [0.9, 0.5],
+                [0.1, 0.7],
+            ], dtype='float32')
+        tensor = create_tensor(self.scope, "scores", np_data)
+        tensor.set_lod(self.lod)
+
+    def set_outputs(self):
+        self.beam_size = 2
+        self.is_accumulated = True
+        self.output_ids = np.array([2, 4, 3, 1])[:, np.newaxis]
+        self.output_scores = np.array([0.9, 0.6, 0.9, 0.7])[:, np.newaxis]
+        self.output_lod = [[0, 2, 4], [0, 2, 2, 3, 4]]
+        self.output_parent_idx = np.array([0, 0, 2, 3])
+
+
+class BeamSearchOpTester3(BeamSearchOpTester):
+    # pre_id = end_id
+    def _create_pre_ids(self):
+        np_data = np.array([[1], [0], [0], [4]], dtype='int64')
+        tensor = create_tensor(self.scope, 'pre_ids', np_data)
+
+    def _create_pre_scores(self):
+        np_data = np.array([[0.1], [1.2], [0.5], [0.4]], dtype='float32')
+        tensor = create_tensor(self.scope, 'pre_scores', np_data)
+
+    def _create_ids(self):
+        self.lod = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        np_data = np.array([[4, 2], [7, 3], [3, 5], [8, 1]], dtype='int64')
+        tensor = create_tensor(self.scope, "ids", np_data)
+        tensor.set_lod(self.lod)
+
+    def _create_scores(self):
+        np_data = np.array(
+            [
+                [0.6, 0.9],
+                [0.5, 0.3],
+                [0.9, 0.5],
+                [0.6, 0.7],
+            ], dtype='float32')
+        tensor = create_tensor(self.scope, "scores", np_data)
+        tensor.set_lod(self.lod)
+
+    def set_outputs(self):
+        self.beam_size = 2
+        self.is_accumulated = True
+        self.output_ids = np.array([2, 0, 1, 8])[:, np.newaxis]
+        self.output_scores = np.array([0.9, 1.2, 0.7, 0.6])[:, np.newaxis]
+        self.output_lod = [[0, 2, 4], [0, 1, 2, 2, 4]]
+        self.output_parent_idx = np.array([0, 1, 3, 3])
+
+
+class BeamSearchOpTester4(BeamSearchOpTester):
+    # prune beam search when the pre_id in all beams of a source is end_id
+    def _create_pre_ids(self):
+        np_data = np.array([[0], [0], [0], [4]], dtype='int64')
+        tensor = create_tensor(self.scope, 'pre_ids', np_data)
+
+    def _create_pre_scores(self):
+        np_data = np.array([[0.1], [1.2], [0.5], [0.4]], dtype='float32')
+        tensor = create_tensor(self.scope, 'pre_scores', np_data)
+
+    def _create_ids(self):
+        self.lod = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        np_data = np.array([[4, 2], [7, 3], [3, 5], [8, 1]], dtype='int64')
+        tensor = create_tensor(self.scope, "ids", np_data)
+        tensor.set_lod(self.lod)
+
+    def _create_scores(self):
+        np_data = np.array(
+            [
+                [0.6, 0.9],
+                [0.5, 0.3],
+                [0.9, 0.5],
+                [0.6, 0.7],
+            ], dtype='float32')
+        tensor = create_tensor(self.scope, "scores", np_data)
+        tensor.set_lod(self.lod)
+
+    def set_outputs(self):
+        self.beam_size = 2
+        self.is_accumulated = True
+        self.output_ids = np.array([1, 8])[:, np.newaxis]
+        self.output_scores = np.array([0.7, 0.6])[:, np.newaxis]
+        self.output_lod = [[0, 2, 4], [0, 0, 0, 0, 2]]
+        self.output_parent_idx = np.array([3, 3])
+
+
+class BeamSearchOpTester5(BeamSearchOpTester):
+    # is_accumulated = False
+    def _create_pre_ids(self):
+        np_data = np.array([[1], [2], [3], [4]], dtype='int64')
+        tensor = create_tensor(self.scope, 'pre_ids', np_data)
+
+    def _create_pre_scores(self):
+        np_data = np.array([[0.1, 2.2, 0.3, 0.4]], dtype='float32')
+        tensor = create_tensor(self.scope, 'pre_scores', np_data)
+
+    def _create_ids(self):
+        self.lod = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        np_data = np.array([[4, 2], [7, 3], [3, 5], [8, 1]], dtype='int64')
+        tensor = create_tensor(self.scope, "ids", np_data)
+        tensor.set_lod(self.lod)
+
+    def _create_scores(self):
+        np_data = np.array(
+            [
+                [0.6, 0.9],
+                [0.5, 0.3],
+                [0.9, 0.5],
+                [0.1, 0.7],
+            ], dtype='float32')
+        tensor = create_tensor(self.scope, "scores", np_data)
+        tensor.set_lod(self.lod)
+
+    def set_outputs(self):
+        self.beam_size = 2
+        self.is_accumulated = False
+        self.output_ids = np.array([7, 3, 3, 1])[:, np.newaxis]
+        self.output_scores = np.array(
+            [1.50685, 0.996027, 0.194639, 0.043325])[:, np.newaxis]
+        self.output_lod = [[0, 2, 4], [0, 0, 2, 3, 4]]
+        self.output_parent_idx = np.array([1, 1, 2, 3])
+
+
+class BeamSearchOpTester6(BeamSearchOpTester):
+    # beam_size = 1
+    def _create_pre_ids(self):
+        np_data = np.array([[1], [2], [3], [4]], dtype='int64')
+        tensor = create_tensor(self.scope, 'pre_ids', np_data)
+
+    def _create_pre_scores(self):
+        np_data = np.array([[0.1, 0.2, 0.3, 0.4]], dtype='float32')
+        tensor = create_tensor(self.scope, 'pre_scores', np_data)
+
+    def _create_ids(self):
+        self.lod = [[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]]
+        np_data = np.array([[4, 2], [7, 3], [3, 5], [8, 1]], dtype='int64')
+        tensor = create_tensor(self.scope, "ids", np_data)
+        tensor.set_lod(self.lod)
+
+    def _create_scores(self):
+        np_data = np.array(
+            [
+                [0.6, 0.9],
+                [0.5, 0.3],
+                [0.9, 0.5],
+                [0.1, 0.7],
+            ], dtype='float32')
+        tensor = create_tensor(self.scope, "scores", np_data)
+        tensor.set_lod(self.lod)
+
+    def set_outputs(self):
+        self.beam_size = 1
+        self.is_accumulated = True
+        self.output_ids = np.array([2, 7, 3, 1])[:, np.newaxis]
+        self.output_scores = np.array([0.9, 0.5, 0.9, 0.7])[:, np.newaxis]
+        self.output_lod = [[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]]
+        self.output_parent_idx = np.array([0, 1, 2, 3])
+

 class TestBeamSearchOpError(unittest.TestCase):
    def test_errors(self):