diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc index 4bf4ba1120df0486c3f2ae287d9a5970fb423688..1590eed0bb2b0d794942b3eabcf9d05378e972e2 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -45,9 +45,15 @@ struct BeamSearchDecodeFunctor { id_tensor_(id_tensor), score_tensor_(score_tensor) { tensor_on_gpu_ = false; + tensor_on_npu_ = false; // First make a copy of GPU data on CPU - if (platform::is_gpu_place(step_ids_origin_[0].place())) { - tensor_on_gpu_ = true; + if (platform::is_gpu_place(step_ids_origin_[0].place()) || + platform::is_npu_place(step_ids_origin_[0].place())) { + if (platform::is_gpu_place(step_ids_origin_[0].place())) { + tensor_on_gpu_ = true; + } else { + tensor_on_npu_ = true; + } platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(step_ids_origin_[0].place()); @@ -55,7 +61,9 @@ struct BeamSearchDecodeFunctor { for (auto& step_id : step_ids_origin_) { framework::LoDTensor out; if (step_id.numel() > 0) { - dev_ctx->Wait(); + if (tensor_on_gpu_) { + dev_ctx->Wait(); + } framework::TensorCopy(step_id, platform::CPUPlace(), *dev_ctx, &out); dev_ctx->Wait(); } @@ -64,8 +72,13 @@ struct BeamSearchDecodeFunctor { step_ids_.push_back(out); } } - if (platform::is_gpu_place(step_scores_origin_[0].place())) { - tensor_on_gpu_ = true; + if (platform::is_gpu_place(step_scores_origin_[0].place()) || + platform::is_npu_place(step_scores_origin_[0].place())) { + if (platform::is_gpu_place(step_scores_origin_[0].place())) { + tensor_on_gpu_ = true; + } else { + tensor_on_npu_ = true; + } platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(step_scores_origin_[0].place()); @@ -73,7 +86,9 @@ struct BeamSearchDecodeFunctor { for (auto& step_score : step_scores_origin_) { framework::LoDTensor out; if (step_score.numel() > 0) { - dev_ctx->Wait(); + if (tensor_on_gpu_) { + dev_ctx->Wait(); + } framework::TensorCopy(step_score, platform::CPUPlace(), *dev_ctx, &out); dev_ctx->Wait(); @@ -89,6 +104,7 @@ struct BeamSearchDecodeFunctor { void apply() const; bool tensor_on_gpu_; + bool tensor_on_npu_; size_t beam_size_; int end_id_; // TODO(Superjomn) Here might result serious performance issue in the @@ -105,8 +121,8 @@ struct BeamSearchDecodeFunctor { template void BeamSearchDecodeFunctor::apply() const { BeamSearchDecoder beam_search_decoder(beam_size_, end_id_); - // Check if the tensor is on GPU. If so, use the CPU copy instead - if (tensor_on_gpu_) { + // Check if the tensor is on GPU or NPU. If so, use the CPU copy instead + if (tensor_on_gpu_ || tensor_on_npu_) { beam_search_decoder.Backtrace(step_ids_, step_scores_, id_tensor_, score_tensor_); } else { diff --git a/python/paddle/fluid/tests/unittests/npu/test_beam_search_decode_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_beam_search_decode_op_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..647bd29ffaef5682bfa9f2ec0ab1ab3fdbd13635 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_beam_search_decode_op_npu.py @@ -0,0 +1,109 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest + +import numpy as np +import paddle +import paddle.fluid.core as core +from paddle.fluid.op import Operator +import paddle.fluid as fluid +from paddle.fluid.framework import Program, program_guard + + +class TestBeamSearchDecodeNPUOp(unittest.TestCase): + """unittest of beam_search_decode npu op""" + + def setUp(self): + self.scope = core.Scope() + self.place = paddle.NPUPlace(0) + + def append_lod_tensor(self, tensor_array, lod, data): + lod_tensor = core.LoDTensor() + lod_tensor.set_lod(lod) + lod_tensor.set(data, self.place) + tensor_array.append(lod_tensor) + + def test_get_set(self): + ids = self.scope.var("ids").get_lod_tensor_array() + scores = self.scope.var("scores").get_lod_tensor_array() + # Construct sample data with 5 steps and 2 source sentences + # beam_size = 2, end_id = 1 + # start with start_id + [ + self.append_lod_tensor( + array, [[0, 1, 2], [0, 1, 2]], np.array( + [0, 0], dtype=dtype)) + for array, dtype in ((ids, "int64"), (scores, "float32")) + ] + [ + self.append_lod_tensor( + array, [[0, 1, 2], [0, 2, 4]], + np.array( + [2, 3, 4, 5], dtype=dtype)) + for array, dtype in ((ids, "int64"), (scores, "float32")) + ] + [ + self.append_lod_tensor( + array, [[0, 2, 4], [0, 2, 2, 4, 4]], + np.array( + [3, 1, 5, 4], dtype=dtype)) + for array, dtype in ((ids, "int64"), (scores, "float32")) + ] + [ + self.append_lod_tensor( + array, [[0, 2, 4], [0, 1, 2, 3, 4]], + np.array( + [1, 1, 3, 5], dtype=dtype)) + for array, dtype in ((ids, "int64"), (scores, "float32")) + ] + [ + self.append_lod_tensor( + array, [[0, 2, 4], [0, 0, 0, 2, 2]], + np.array( + [5, 1], dtype=dtype)) + for array, dtype in ((ids, "int64"), (scores, "float32")) + ] + + sentence_ids = self.scope.var("sentence_ids").get_tensor() + sentence_scores = self.scope.var("sentence_scores").get_tensor() + + beam_search_decode_op = Operator( + "beam_search_decode", + # inputs + Ids="ids", + Scores="scores", + # outputs + SentenceIds="sentence_ids", + SentenceScores="sentence_scores", + beam_size=2, + end_id=1, ) + + beam_search_decode_op.run(self.scope, self.place) + + expected_lod = [[0, 2, 4], [0, 4, 7, 12, 17]] + self.assertEqual(sentence_ids.lod(), expected_lod) + self.assertEqual(sentence_scores.lod(), expected_lod) + + expected_data = np.array( + [0, 2, 3, 1, 0, 2, 1, 0, 4, 5, 3, 5, 0, 4, 5, 3, 1], "int64") + self.assertTrue(np.array_equal(np.array(sentence_ids), expected_data)) + self.assertTrue( + np.array_equal(np.array(sentence_scores), expected_data)) + + +if __name__ == '__main__': + unittest.main()