diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc
index 4a8dfd4b54227070c2143b180f8ab92753885550..68fb988afd8af4e9ac3acb4506c1c31fcf85e5a3 100644
--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
@@ -23,16 +23,54 @@ struct BeamSearchDecodeFunctor {
   BeamSearchDecodeFunctor(const LoDTensorArray& step_ids,
                           const LoDTensorArray& step_scores,
                           LoDTensor* id_tensor, LoDTensor* score_tensor)
-      : step_ids_(step_ids),
-        step_scores_(step_scores),
+      : step_ids_origin_(step_ids),
+        step_scores_origin_(step_scores),
         id_tensor_(id_tensor),
-        score_tensor_(score_tensor) {}
+        score_tensor_(score_tensor) {
+    // When an input array lives on the GPU, make CPU copies; operator()
+    // passes those copies to PackAllSteps instead of the originals. Both
+    // arrays are copied as soon as either one is on the GPU, because
+    // operator() switches ids and scores together. Empty arrays are skipped.
+    tensor_on_gpu_ =
+        IsOnGpu(step_ids_origin_) || IsOnGpu(step_scores_origin_);
+    if (tensor_on_gpu_) {
+      CopyToCpu(step_ids_origin_, &step_ids_);
+      CopyToCpu(step_scores_origin_, &step_scores_);
+    }
+  }
 
   template <typename T>
   void operator()() const;
 
-  const LoDTensorArray& step_ids_;
-  const LoDTensorArray& step_scores_;
+  // Returns true when `array` is non-empty and its first tensor is on a GPU.
+  static bool IsOnGpu(const LoDTensorArray& array) {
+    return !array.empty() && platform::is_gpu_place(array[0].place());
+  }
+
+  // Appends a CPU copy of every tensor in `src` (LoD included) to `dst`.
+  static void CopyToCpu(const LoDTensorArray& src, LoDTensorArray* dst) {
+    for (const auto& tensor : src) {
+      if (!platform::is_gpu_place(tensor.place())) {
+        // Not on the GPU: store it as is, no device copy needed.
+        dst->push_back(tensor);
+        continue;
+      }
+      auto* dev_ctx =
+          platform::DeviceContextPool::Instance().Get(tensor.place());
+      framework::LoDTensor out;
+      dev_ctx->Wait();
+      framework::TensorCopy(tensor, platform::CPUPlace(), *dev_ctx, &out);
+      dev_ctx->Wait();
+      out.set_lod(tensor.lod());
+      dst->push_back(out);
+    }
+  }
+
+  bool tensor_on_gpu_;
+  const LoDTensorArray& step_ids_origin_;
+  const LoDTensorArray& step_scores_origin_;
+  LoDTensorArray step_ids_ = LoDTensorArray();
+  LoDTensorArray step_scores_ = LoDTensorArray();
   LoDTensor* id_tensor_;
   LoDTensor* score_tensor_;
 };
@@ -40,8 +78,14 @@ struct BeamSearchDecodeFunctor {
 template <typename T>
 void BeamSearchDecodeFunctor::operator()() const {
   BeamSearchDecoder<T> beam_search_decoder;
-  beam_search_decoder.PackAllSteps(step_ids_, step_scores_, id_tensor_,
-                                   score_tensor_);
+  // Check if the tensor is on GPU. If so, use the CPU copy instead
+  if (tensor_on_gpu_) {
+    beam_search_decoder.PackAllSteps(step_ids_, step_scores_, id_tensor_,
+                                     score_tensor_);
+  } else {
+    beam_search_decoder.PackAllSteps(step_ids_origin_, step_scores_origin_,
+                                     id_tensor_, score_tensor_);
+  }
 }
 
 template <>
diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
index 4ee00605e22ba45d9e46a8bba27712c3fd97872a..7976dd7c3f14390fb00bc8ab39121b6a686e3039 100644
--- a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
+++ b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
@@ -22,12 +22,12 @@ from paddle.fluid.op import Operator
 class TestBeamSearchDecodeOp(unittest.TestCase):
     def setUp(self):
         self.scope = core.Scope()
-        self.cpu_place = core.CPUPlace()
+        self.place = core.CPUPlace()
 
     def append_lod_tensor(self, tensor_array, lod, data):
         lod_tensor = core.LoDTensor()
         lod_tensor.set_lod(lod)
-        lod_tensor.set(data, self.cpu_place)
+        lod_tensor.set(data, self.place)
         tensor_array.append(lod_tensor)
 
     def test_get_set(self):
@@ -71,7 +71,7 @@ class TestBeamSearchDecodeOp(unittest.TestCase):
             SentenceIds="sentence_ids",
             SentenceScores="sentence_scores")
 
-        beam_search_decode_op.run(self.scope, self.cpu_place)
+        beam_search_decode_op.run(self.scope, self.place)
 
         expected_lod = [[0, 4, 8], [0, 1, 3, 6, 9, 10, 13, 16, 19]]
         self.assertEqual(sentence_ids.lod(), expected_lod)
@@ -84,5 +84,13 @@ class TestBeamSearchDecodeOp(unittest.TestCase):
             np.array_equal(np.array(sentence_scores), expected_data))
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestBeamSearchDecodeOpGPU(TestBeamSearchDecodeOp):
+    def setUp(self):
+        self.scope = core.Scope()
+        self.place = core.CUDAPlace(0)
+
+
 if __name__ == '__main__':
     unittest.main()