From adaffb7be19cc4311d5b828693fa11cbc3062c41 Mon Sep 17 00:00:00 2001 From: mengqingchun02 <103740521+mengqingchun02@users.noreply.github.com> Date: Fri, 19 Aug 2022 10:22:18 +0800 Subject: [PATCH] Support beam search decode op in XPU environment (#44917) * support beam_search operator on xpu. test=kunlun * support beam_search operator on xpu. test=kunlun * support beam_search operator on xpu. test=kunlun * support beam_search operator on xpu. test=kunlun * support beam_search operator on xpu. test=kunlun * support beam_search operator on xpu. test=kunlun * support beam_search operator on xpu. test=kunlun * fix beam_search operator bugs on xpu. test=kunlun * fix beam_search operator bugs on xpu. test=kunlun * fix beam_search operator bugs on xpu. test=kunlun * fix beam_search operator bugs on xpu. test=kunlun * support beam_search_decode operator on xpu. test=kunlun * support beam_search_decode operator on xpu. test=kunlun * support beam_search_decode operator on xpu. test=kunlun * support beam_search_decode operator on xpu. test=kunlun * support beam_search_decode operator on xpu. test=kunlun * support beam_search_decode operator on xpu. test=kunlun * support beam_search_decode operator on xpu. test=kunlun * support beam_search_decode operator on xpu. test=kunlun * support beam_search_decode operator on xpu. test=kunlun * support beam_search_decode operator on xpu. test=kunlun * support beam_search_decode operator on xpu. test=kunlun * support beam_search_decode operator on xpu. test=kunlun * support beam_search_decode operator on xpu. test=kunlun * support beam_search_decode operator on xpu. test=kunlun * support beam_search_decode operator on xpu. test=kunlun --- paddle/fluid/operators/CMakeLists.txt | 1 + .../fluid/operators/beam_search_decode_op.cc | 216 ++--------- .../operators/beam_search_decode_op.cu.cc | 25 ++ .../fluid/operators/beam_search_decode_op.h | 349 +++++++----------- .../operators/beam_search_decode_op_def.h | 241 ++++++++++++ .../operators/beam_search_decode_op_xpu.cc | 120 ++++++ .../operators/beam_search_decode_op_xpu.h | 172 +++++++++ .../beam_search_decode_op_xpu_test.cc | 187 ++++++++++ paddle/fluid/operators/beam_search_op_xpu.cc | 3 + .../fluid/operators/math/beam_search_xpu.cc | 71 +++- .../fluid/platform/device/xpu/xpu2_op_list.h | 6 + 11 files changed, 983 insertions(+), 408 deletions(-) create mode 100644 paddle/fluid/operators/beam_search_decode_op.cu.cc create mode 100644 paddle/fluid/operators/beam_search_decode_op_def.h create mode 100644 paddle/fluid/operators/beam_search_decode_op_xpu.cc create mode 100644 paddle/fluid/operators/beam_search_decode_op_xpu.h create mode 100644 paddle/fluid/operators/beam_search_decode_op_xpu_test.cc diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index a25c54bae2a..33520eaa13e 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -163,6 +163,7 @@ if (WITH_GPU OR WITH_ROCM) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor) endif() if(WITH_XPU) + cc_test(beam_search_decode_op_xpu_test SRCS beam_search_decode_op_xpu_test.cc DEPS lod_tensor) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xpulib) endif() set(COMMON_OP_DEPS ${COMMON_OP_DEPS} device_memory_aligment) diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc index 4498862d4d4..0add5355090 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -13,195 +13,24 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/beam_search_decode_op.h" - #include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/platform/device_context.h" - -namespace paddle { -namespace framework { -class InferShapeContext; -class OpDesc; -class Scope; -template -class EmptyGradOpMaker; -} // namespace framework -namespace imperative { -class OpBase; -} // namespace imperative -} // namespace paddle +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { -struct BeamSearchDecodeFunctor { - BeamSearchDecodeFunctor(const LoDTensorArray& step_ids, - const LoDTensorArray& step_scores, - LoDTensor* id_tensor, - LoDTensor* score_tensor, - size_t beam_size, - int end_id) - : beam_size_(beam_size), - end_id_(end_id), - step_ids_origin_(step_ids), - step_scores_origin_(step_scores), - id_tensor_(id_tensor), - score_tensor_(score_tensor) { - tensor_on_gpu_ = false; - tensor_on_npu_ = false; - // First make a copy of GPU data on CPU - if (platform::is_gpu_place(step_ids_origin_[0].place()) || - platform::is_npu_place(step_ids_origin_[0].place())) { - if (platform::is_gpu_place(step_ids_origin_[0].place())) { - tensor_on_gpu_ = true; - } else { - tensor_on_npu_ = true; - } - platform::DeviceContextPool& pool = - platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.Get(step_ids_origin_[0].place()); - // Copy all tensors in the input tensor array - for (auto& step_id : step_ids_origin_) { - framework::LoDTensor out; - if (step_id.numel() > 0) { - if (tensor_on_gpu_) { - dev_ctx->Wait(); - } - framework::TensorCopy(step_id, platform::CPUPlace(), *dev_ctx, &out); - dev_ctx->Wait(); - } - - out.set_lod(step_id.lod()); - step_ids_.push_back(out); - } - } - if (platform::is_gpu_place(step_scores_origin_[0].place()) || - platform::is_npu_place(step_scores_origin_[0].place())) { - if (platform::is_gpu_place(step_scores_origin_[0].place())) { - tensor_on_gpu_ = true; - } else { - tensor_on_npu_ = true; - } - platform::DeviceContextPool& pool = - platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.Get(step_scores_origin_[0].place()); - // Copy all tensors in the input tensor array - for (auto& step_score : step_scores_origin_) { - framework::LoDTensor out; - if (step_score.numel() > 0) { - if (tensor_on_gpu_) { - dev_ctx->Wait(); - } - framework::TensorCopy( - step_score, platform::CPUPlace(), *dev_ctx, &out); - dev_ctx->Wait(); - } - - out.set_lod(step_score.lod()); - step_scores_.push_back(out); - } - } - } - - template - void apply() const; - - bool tensor_on_gpu_; - bool tensor_on_npu_; - size_t beam_size_; - int end_id_; - // TODO(Superjomn) Here might result serious performance issue in the - // concurrency - // scenarios. - const LoDTensorArray& step_ids_origin_; - const LoDTensorArray& step_scores_origin_; - LoDTensorArray step_ids_ = LoDTensorArray(); - LoDTensorArray step_scores_ = LoDTensorArray(); - LoDTensor* id_tensor_; - LoDTensor* score_tensor_; -}; - -template -void BeamSearchDecodeFunctor::apply() const { - BeamSearchDecoder beam_search_decoder(beam_size_, end_id_); - // Check if the tensor is on GPU or NPU. If so, use the CPU copy instead - if (tensor_on_gpu_ || tensor_on_npu_) { - beam_search_decoder.Backtrace( - step_ids_, step_scores_, id_tensor_, score_tensor_); - } else { - beam_search_decoder.Backtrace( - step_ids_origin_, step_scores_origin_, id_tensor_, score_tensor_); - } -} - -template <> -void BeamSearchDecodeFunctor::apply() const { - PADDLE_THROW(platform::errors::InvalidArgument( - "beam search decode op does not support bool!")); -} - -class BeamSearchDecodeOp : public framework::OperatorBase { +class BeamSearchDecodeOp : public framework::OperatorWithKernel { public: - BeamSearchDecodeOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& dev_ctx = *pool.Get(dev_place); - - framework::RuntimeContext run_ctx(Inputs(), Outputs(), scope); - framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx); - - const LoDTensorArray* ids = ctx.Input("Ids"); - const LoDTensorArray* scores = ctx.Input("Scores"); - const size_t step_num = ids->size(); - PADDLE_ENFORCE_GT( - step_num, - 0UL, - platform::errors::InvalidArgument( - "beam search steps, which is the" - "size of Input(Ids) LoDTensorArray. beam search steps should " - "be larger than 0, but received %d. ", - step_num)); - const size_t source_num = ids->at(0).lod().at(0).size() - 1; - PADDLE_ENFORCE_GT( - source_num, - 0UL, - platform::errors::InvalidArgument( - "source_num is the sequence number of the" - "first decoding step, indicating by Input(Ids)[0].lod[0].size. " - "The number of source_num should be larger than" - "0, but received %d. ", - source_num)); + using framework::OperatorWithKernel::OperatorWithKernel; - for (size_t i = 0; i < step_num; ++i) { - PADDLE_ENFORCE_EQ( - ids->at(i).lod().size(), - 2UL, - platform::errors::InvalidArgument( - "For the i step in beam search steps," - "the size of Input(Ids)[i].lod() should larger than 2," - "but received %d. ", - ids->at(i).lod().size())); + void InferShape(framework::InferShapeContext *ctx) const override { + for (const std::string &arg : std::vector({"Ids", "Scores"})) { + OP_INOUT_CHECK(ctx->HasInput(arg), "Input", arg, "BeamSeachDecode"); + } + for (const std::string &arg : + std::vector({"SentenceIds", "SentenceScores"})) { + OP_INOUT_CHECK(ctx->HasOutput(arg), "Output", arg, "BeamSeachDecode"); } - - size_t beam_size = ctx.Attr("beam_size"); - int end_id = ctx.Attr("end_id"); - - // prepare output - LoDTensor* sentenceIds = ctx.Output("SentenceIds"); - LoDTensor* sentenceScores = ctx.Output("SentenceScores"); - - framework::VisitDataType( - framework::TransToProtoVarType(scores->at(0).dtype()), - BeamSearchDecodeFunctor( - *ids, *scores, sentenceIds, sentenceScores, beam_size, end_id)); } }; @@ -240,7 +69,7 @@ hypothesis has. } }; -class BeamSearchDecodeInferShape : public framework::InferShapeBase { +/*class BeamSearchDecodeInferShape : public framework::InferShapeBase { public: void operator()(framework::InferShapeContext* context) const override { OP_INOUT_CHECK( @@ -256,11 +85,11 @@ class BeamSearchDecodeInferShape : public framework::InferShapeBase { "SentenceScores", "BeamSearchDecode"); } -}; +};*/ class BeamSearchDecodeInferVarType : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext* ctx) const override { + void operator()(framework::InferVarTypeContext *ctx) const override { ctx->SetOutputType("SentenceIds", framework::proto::VarType::LOD_TENSOR, framework::ALL_ELEMENTS); @@ -273,11 +102,16 @@ class BeamSearchDecodeInferVarType : public framework::VarTypeInference { } // namespace operators } // namespace paddle -REGISTER_OPERATOR( +namespace ops = paddle::operators; +REGISTER_OPERATOR(beam_search_decode, + paddle::operators::BeamSearchDecodeOp, + paddle::operators::BeamSearchDecodeOpProtoMaker, + paddle::operators::BeamSearchDecodeInferVarType); + +REGISTER_OP_CPU_KERNEL( beam_search_decode, - paddle::operators::BeamSearchDecodeOp, - paddle::operators::BeamSearchDecodeOpProtoMaker, - paddle::operators::BeamSearchDecodeInferShape, - paddle::operators::BeamSearchDecodeInferVarType, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + ops::BeamSearchDecodeOpKernel, + ops::BeamSearchDecodeOpKernel, + ops::BeamSearchDecodeOpKernel, + ops::BeamSearchDecodeOpKernel, + ops::BeamSearchDecodeOpKernel); diff --git a/paddle/fluid/operators/beam_search_decode_op.cu.cc b/paddle/fluid/operators/beam_search_decode_op.cu.cc new file mode 100644 index 00000000000..fef36ea6d9a --- /dev/null +++ b/paddle/fluid/operators/beam_search_decode_op.cu.cc @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/beam_search_decode_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + beam_search_decode, + ops::BeamSearchDecodeOpKernel, + ops::BeamSearchDecodeOpKernel, + ops::BeamSearchDecodeOpKernel, + ops::BeamSearchDecodeOpKernel, + ops::BeamSearchDecodeOpKernel); diff --git a/paddle/fluid/operators/beam_search_decode_op.h b/paddle/fluid/operators/beam_search_decode_op.h index 2800ef39074..7e5da360a94 100644 --- a/paddle/fluid/operators/beam_search_decode_op.h +++ b/paddle/fluid/operators/beam_search_decode_op.h @@ -14,231 +14,164 @@ limitations under the License. */ #pragma once -#include -#include -#include - -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/beam_search_decode_op_def.h" namespace paddle { namespace operators { -using LoDTensor = framework::LoDTensor; -using LoDTensorArray = framework::LoDTensorArray; +struct BeamSearchDecodeFunctor { + BeamSearchDecodeFunctor(const LoDTensorArray& step_ids, + const LoDTensorArray& step_scores, + LoDTensor* id_tensor, + LoDTensor* score_tensor, + size_t beam_size, + int end_id) + : beam_size_(beam_size), + end_id_(end_id), + step_ids_origin_(step_ids), + step_scores_origin_(step_scores), + id_tensor_(id_tensor), + score_tensor_(score_tensor) { + tensor_on_gpu_ = false; + tensor_on_npu_ = false; + // First make a copy of GPU data on CPU + if (platform::is_gpu_place(step_ids_origin_[0].place()) || + platform::is_npu_place(step_ids_origin_[0].place())) { + if (platform::is_gpu_place(step_ids_origin_[0].place())) { + tensor_on_gpu_ = true; + } else { + tensor_on_npu_ = true; + } + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(step_ids_origin_[0].place()); + // Copy all tensors in the input tensor array + for (auto& step_id : step_ids_origin_) { + framework::LoDTensor out; + if (step_id.numel() > 0) { + if (tensor_on_gpu_) { + dev_ctx->Wait(); + } + framework::TensorCopy(step_id, platform::CPUPlace(), *dev_ctx, &out); + dev_ctx->Wait(); + } -// all the lod have 2 levels. -// The first is source level, the second is sentence level. -// source level describe how many prefixes (branchs) for each source sentece -// (beam). sentence level describe how these candidates belong to the prefixes. -const size_t kSourceLevel = 0; -const size_t kSentenceLevel = 1; + out.set_lod(step_id.lod()); + step_ids_.push_back(out); + } + } + if (platform::is_gpu_place(step_scores_origin_[0].place()) || + platform::is_npu_place(step_scores_origin_[0].place())) { + if (platform::is_gpu_place(step_scores_origin_[0].place())) { + tensor_on_gpu_ = true; + } else { + tensor_on_npu_ = true; + } + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(step_scores_origin_[0].place()); + // Copy all tensors in the input tensor array + for (auto& step_score : step_scores_origin_) { + framework::LoDTensor out; + if (step_score.numel() > 0) { + if (tensor_on_gpu_) { + dev_ctx->Wait(); + } + framework::TensorCopy( + step_score, platform::CPUPlace(), *dev_ctx, &out); + dev_ctx->Wait(); + } -template -struct Sentence { - std::vector word_ids; - std::vector scores; -}; + out.set_lod(step_score.lod()); + step_scores_.push_back(out); + } + } + } -template -using SentenceVector = std::vector>; - -template -struct BeamSearchDecoder { - BeamSearchDecoder(size_t beam_size, int end_id) - : beam_size_(beam_size), end_id_(end_id) {} - - /** - * convert the result sentence_vector for each source sentence into two - * LodTensor. - * One is all candidate sentences with word id, one is all candidate sentences - * with word score. - * Param: - * sentence_vector_list: sentence_vector for each source sentence. - * id_tensor: result LoDTensor for sentences of id. - * score_tensor: result LoDTensor for sentences of score. - * reverse: whether ids of sentence in sentence_vector_list is reversed - * sort_by_score: whether to sort hypotheses of each sentence by scores. - */ - void ConvertSentenceVectorToLodTensor( - std::vector> sentence_vector_list, - LoDTensor* id_tensor, - LoDTensor* score_tensor, - bool reverse = true, - bool sort_by_score = true) const; - - /** - * Gather the hypotheses for each source sentence by backtrace though the - * LoDTensorArray step_ids whose lods reserve the path in the tree. - */ - void Backtrace(const LoDTensorArray& step_ids, - const LoDTensorArray& step_scores, - LoDTensor* id_tensor, - LoDTensor* score_tensor) const; + template + void apply_mix() const { + if (std::is_same::value) { + PADDLE_THROW(platform::errors::InvalidArgument( + "beam search decode op does not support bool!")); + + } else { + BeamSearchDecoder beam_search_decoder(beam_size_, end_id_); + // Check if the tensor is on GPU or NPU. If so, use the CPU copy instead + if (tensor_on_gpu_ || tensor_on_npu_) { + beam_search_decoder.Backtrace( + step_ids_, step_scores_, id_tensor_, score_tensor_); + } else { + beam_search_decoder.Backtrace( + step_ids_origin_, step_scores_origin_, id_tensor_, score_tensor_); + } + } + } + bool tensor_on_gpu_; + bool tensor_on_npu_; size_t beam_size_; int end_id_; + // TODO(Superjomn) Here might result serious performance issue in the + // concurrency + // scenarios. + const LoDTensorArray& step_ids_origin_; + const LoDTensorArray& step_scores_origin_; + LoDTensorArray step_ids_ = LoDTensorArray(); + LoDTensorArray step_scores_ = LoDTensorArray(); + LoDTensor* id_tensor_; + LoDTensor* score_tensor_; }; -template -void BeamSearchDecoder::ConvertSentenceVectorToLodTensor( - std::vector> sentence_vector_list, - LoDTensor* id_tensor, - LoDTensor* score_tensor, - bool reverse, - bool sort_by_score) const { - size_t src_num = sentence_vector_list.size(); - - PADDLE_ENFORCE_NE( - src_num, - 0, - platform::errors::InvalidArgument( - "src_num is the sequence number of the first decoding step" - ", indicating by Input(Ids)[0].lod[0].size." - "src_num has wrong value." - "src_num should not be 0," - "But received %d.", - src_num)); - - std::vector source_level_lod = {0}; - std::vector sentence_level_lod = {0}; - std::vector id_data; - std::vector score_data; - - for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { - if (sort_by_score) { - sort(sentence_vector_list[src_idx].begin(), - sentence_vector_list[src_idx].end(), - [reverse](const Sentence& a, const Sentence& b) { - if (reverse) - return a.scores.front() > b.scores.front(); - else - return a.scores.back() > b.scores.back(); - }); +template +class BeamSearchDecodeOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const LoDTensorArray* ids = context.Input("Ids"); + const LoDTensorArray* scores = context.Input("Scores"); + const size_t step_num = ids->size(); + PADDLE_ENFORCE_GT( + step_num, + 0UL, + platform::errors::InvalidArgument( + "beam search steps, which is the" + "size of Input(Ids) LoDTensorArray. beam search steps should " + "be larger than 0, but received %d. ", + step_num)); + const size_t source_num = ids->at(0).lod().at(0).size() - 1; + PADDLE_ENFORCE_GT( + source_num, + 0UL, + platform::errors::InvalidArgument( + "source_num is the sequence number of the" + "first decoding step, indicating by Input(Ids)[0].lod[0].size. " + "The number of source_num should be larger than" + "0, but received %d. ", + source_num)); + + for (size_t i = 0; i < step_num; ++i) { + PADDLE_ENFORCE_EQ( + ids->at(i).lod().size(), + 2UL, + platform::errors::InvalidArgument( + "For the i step in beam search steps," + "the size of Input(Ids)[i].lod() should larger than 2," + "but received %d. ", + ids->at(i).lod().size())); } - for (Sentence& sentence : sentence_vector_list[src_idx]) { - if (reverse) { - id_data.insert(id_data.end(), - sentence.word_ids.rbegin(), - sentence.word_ids.rend()); - score_data.insert( - score_data.end(), sentence.scores.rbegin(), sentence.scores.rend()); - } else { - id_data.insert( - id_data.end(), sentence.word_ids.begin(), sentence.word_ids.end()); - score_data.insert( - score_data.end(), sentence.scores.begin(), sentence.scores.end()); - } - sentence_level_lod.push_back(sentence_level_lod.back() + - sentence.word_ids.size()); - } - source_level_lod.push_back(source_level_lod.back() + - sentence_vector_list[src_idx].size()); - } + size_t beam_size = context.Attr("beam_size"); + int end_id = context.Attr("end_id"); - auto cpu_place = std::unique_ptr( - new paddle::platform::CPUPlace()); - phi::CPUContext cpu_ctx(*cpu_place); - - framework::LoD lod; - lod.push_back(source_level_lod); - lod.push_back(sentence_level_lod); - - id_tensor->set_lod(lod); - id_tensor->Resize({static_cast(id_data.size())}); - id_tensor->mutable_data(paddle::platform::CPUPlace()); - framework::TensorFromVector(id_data, cpu_ctx, id_tensor); - - score_tensor->set_lod(lod); - score_tensor->Resize({static_cast(score_data.size())}); - score_tensor->mutable_data(paddle::platform::CPUPlace()); - framework::TensorFromVector(score_data, cpu_ctx, score_tensor); -} - -template -void BeamSearchDecoder::Backtrace(const LoDTensorArray& step_ids, - const LoDTensorArray& step_scores, - LoDTensor* id_tensor, - LoDTensor* score_tensor) const { - PADDLE_ENFORCE_NE( - step_ids.empty(), - true, - platform::errors::InvalidArgument("Input(Ids) should not be empty." - "But the Input(Ids) is empty.")); - PADDLE_ENFORCE_EQ( - step_ids.size(), - step_scores.size(), - platform::errors::InvalidArgument( - "The size of Input(Ids) and Input(Scores) should be " - "the same. But the size of Input(Ids) and Input(Scores) " - "are not equal.")); - const size_t step_num = step_ids.size(); - const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1; - std::vector> sentence_vector_list( - src_num, SentenceVector(beam_size_)); - std::vector> prefix_idx_vector_list(src_num); - for (int step_id = step_num - 1; step_id >= 0; --step_id) { - auto& cur_ids = step_ids.at(step_id); - auto& cur_scores = step_scores.at(step_id); - for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { - // for each source sentence - auto& sentence_vector = sentence_vector_list.at(src_idx); - auto& prefix_idx_vector = prefix_idx_vector_list.at(src_idx); - size_t src_prefix_start = cur_ids.lod().at(kSourceLevel)[src_idx]; - size_t src_prefix_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1]; - if (prefix_idx_vector.empty()) { // be finished and pruned at this step - // or the last time step - for (size_t prefix_idx = src_prefix_start; prefix_idx < src_prefix_end; - ++prefix_idx) { - size_t candidate_start = cur_ids.lod().at(kSentenceLevel)[prefix_idx]; - size_t candidate_end = - cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1]; - for (size_t candidate_idx = candidate_start; - candidate_idx < candidate_end; - ++candidate_idx) { - prefix_idx_vector.push_back(prefix_idx); - size_t idx = prefix_idx_vector.size() - 1; - auto cur_id = cur_ids.data()[candidate_idx]; - auto cur_score = cur_scores.data()[candidate_idx]; - sentence_vector.at(idx).word_ids.push_back(cur_id); - sentence_vector.at(idx).scores.push_back(cur_score); - } - } - } else { // use prefix_idx_vector to backtrace - size_t src_candidate_start = - cur_ids.lod().at(kSentenceLevel)[src_prefix_start]; - size_t prefix_idx = src_prefix_start; - size_t candidate_num = - cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] - - cur_ids.lod().at(kSentenceLevel)[prefix_idx]; - for (size_t idx = 0; idx < prefix_idx_vector.size(); ++idx) { - auto candidate_idx = prefix_idx_vector.at(idx); - auto cur_id = cur_ids.data()[candidate_idx]; - auto cur_score = cur_scores.data()[candidate_idx]; - if (cur_id != end_id_ || sentence_vector.at(idx).word_ids.empty()) { - // to skip redundant end tokens - sentence_vector.at(idx).word_ids.push_back(cur_id); - sentence_vector.at(idx).scores.push_back(cur_score); - } + // prepare output + LoDTensor* sentenceIds = context.Output("SentenceIds"); + LoDTensor* sentenceScores = context.Output("SentenceScores"); - while (src_candidate_start + candidate_num <= - candidate_idx) { // search the corresponding prefix - prefix_idx++; - candidate_num += cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] - - cur_ids.lod().at(kSentenceLevel)[prefix_idx]; - } - prefix_idx_vector.at(idx) = prefix_idx; - } - } - } + BeamSearchDecodeFunctor bs( + *ids, *scores, sentenceIds, sentenceScores, beam_size, end_id); + bs.apply_mix(); } - - ConvertSentenceVectorToLodTensor( - sentence_vector_list, id_tensor, score_tensor, true, true); -} +}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/beam_search_decode_op_def.h b/paddle/fluid/operators/beam_search_decode_op_def.h new file mode 100644 index 00000000000..14ac6291971 --- /dev/null +++ b/paddle/fluid/operators/beam_search_decode_op_def.h @@ -0,0 +1,241 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +using LoDTensor = framework::LoDTensor; +using LoDTensorArray = framework::LoDTensorArray; + +// all the lod have 2 levels. +// The first is source level, the second is sentence level. +// source level describe how many prefixes (branchs) for each source sentece +// (beam). sentence level describe how these candidates belong to the prefixes. +const size_t kSourceLevel = 0; +const size_t kSentenceLevel = 1; + +template +struct Sentence { + std::vector word_ids; + std::vector scores; +}; + +template +using SentenceVector = std::vector>; + +template +struct BeamSearchDecoder { + BeamSearchDecoder(size_t beam_size, int end_id) + : beam_size_(beam_size), end_id_(end_id) {} + + /** + * convert the result sentence_vector for each source sentence into two + * LodTensor. + * One is all candidate sentences with word id, one is all candidate sentences + * with word score. + * Param: + * sentence_vector_list: sentence_vector for each source sentence. + * id_tensor: result LoDTensor for sentences of id. + * score_tensor: result LoDTensor for sentences of score. + * reverse: whether ids of sentence in sentence_vector_list is reversed + * sort_by_score: whether to sort hypotheses of each sentence by scores. + */ + void ConvertSentenceVectorToLodTensor( + std::vector> sentence_vector_list, + LoDTensor* id_tensor, + LoDTensor* score_tensor, + bool reverse = true, + bool sort_by_score = true) const; + + /** + * Gather the hypotheses for each source sentence by backtrace though the + * LoDTensorArray step_ids whose lods reserve the path in the tree. + */ + void Backtrace(const LoDTensorArray& step_ids, + const LoDTensorArray& step_scores, + LoDTensor* id_tensor, + LoDTensor* score_tensor) const; + + size_t beam_size_; + int end_id_; +}; + +template +void BeamSearchDecoder::ConvertSentenceVectorToLodTensor( + std::vector> sentence_vector_list, + LoDTensor* id_tensor, + LoDTensor* score_tensor, + bool reverse, + bool sort_by_score) const { + size_t src_num = sentence_vector_list.size(); + + PADDLE_ENFORCE_NE( + src_num, + 0, + platform::errors::InvalidArgument( + "src_num is the sequence number of the first decoding step" + ", indicating by Input(Ids)[0].lod[0].size." + "src_num has wrong value." + "src_num should not be 0," + "But received %d.", + src_num)); + + std::vector source_level_lod = {0}; + std::vector sentence_level_lod = {0}; + std::vector id_data; + std::vector score_data; + + for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { + if (sort_by_score) { + sort(sentence_vector_list[src_idx].begin(), + sentence_vector_list[src_idx].end(), + [reverse](const Sentence& a, const Sentence& b) { + if (reverse) + return a.scores.front() > b.scores.front(); + else + return a.scores.back() > b.scores.back(); + }); + } + for (Sentence& sentence : sentence_vector_list[src_idx]) { + if (reverse) { + id_data.insert(id_data.end(), + sentence.word_ids.rbegin(), + sentence.word_ids.rend()); + score_data.insert( + score_data.end(), sentence.scores.rbegin(), sentence.scores.rend()); + } else { + id_data.insert( + id_data.end(), sentence.word_ids.begin(), sentence.word_ids.end()); + score_data.insert( + score_data.end(), sentence.scores.begin(), sentence.scores.end()); + } + + sentence_level_lod.push_back(sentence_level_lod.back() + + sentence.word_ids.size()); + } + source_level_lod.push_back(source_level_lod.back() + + sentence_vector_list[src_idx].size()); + } + + auto cpu_place = std::unique_ptr( + new paddle::platform::CPUPlace()); + phi::CPUContext cpu_ctx(*cpu_place); + + framework::LoD lod; + lod.push_back(source_level_lod); + lod.push_back(sentence_level_lod); + + id_tensor->set_lod(lod); + id_tensor->Resize({static_cast(id_data.size())}); + id_tensor->mutable_data(paddle::platform::CPUPlace()); + framework::TensorFromVector(id_data, cpu_ctx, id_tensor); + + score_tensor->set_lod(lod); + score_tensor->Resize({static_cast(score_data.size())}); + score_tensor->mutable_data(paddle::platform::CPUPlace()); + framework::TensorFromVector(score_data, cpu_ctx, score_tensor); +} + +template +void BeamSearchDecoder::Backtrace(const LoDTensorArray& step_ids, + const LoDTensorArray& step_scores, + LoDTensor* id_tensor, + LoDTensor* score_tensor) const { + PADDLE_ENFORCE_NE( + step_ids.empty(), + true, + platform::errors::InvalidArgument("Input(Ids) should not be empty." + "But the Input(Ids) is empty.")); + PADDLE_ENFORCE_EQ( + step_ids.size(), + step_scores.size(), + platform::errors::InvalidArgument( + "The size of Input(Ids) and Input(Scores) should be " + "the same. But the size of Input(Ids) and Input(Scores) " + "are not equal.")); + const size_t step_num = step_ids.size(); + const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1; + std::vector> sentence_vector_list( + src_num, SentenceVector(beam_size_)); + std::vector> prefix_idx_vector_list(src_num); + for (int step_id = step_num - 1; step_id >= 0; --step_id) { + auto& cur_ids = step_ids.at(step_id); + auto& cur_scores = step_scores.at(step_id); + for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { + // for each source sentence + auto& sentence_vector = sentence_vector_list.at(src_idx); + auto& prefix_idx_vector = prefix_idx_vector_list.at(src_idx); + size_t src_prefix_start = cur_ids.lod().at(kSourceLevel)[src_idx]; + size_t src_prefix_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1]; + if (prefix_idx_vector.empty()) { // be finished and pruned at this step + // or the last time step + for (size_t prefix_idx = src_prefix_start; prefix_idx < src_prefix_end; + ++prefix_idx) { + size_t candidate_start = cur_ids.lod().at(kSentenceLevel)[prefix_idx]; + size_t candidate_end = + cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1]; + for (size_t candidate_idx = candidate_start; + candidate_idx < candidate_end; + ++candidate_idx) { + prefix_idx_vector.push_back(prefix_idx); + size_t idx = prefix_idx_vector.size() - 1; + auto cur_id = cur_ids.data()[candidate_idx]; + auto cur_score = cur_scores.data()[candidate_idx]; + sentence_vector.at(idx).word_ids.push_back(cur_id); + sentence_vector.at(idx).scores.push_back(cur_score); + } + } + } else { // use prefix_idx_vector to backtrace + size_t src_candidate_start = + cur_ids.lod().at(kSentenceLevel)[src_prefix_start]; + size_t prefix_idx = src_prefix_start; + size_t candidate_num = + cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] - + cur_ids.lod().at(kSentenceLevel)[prefix_idx]; + for (size_t idx = 0; idx < prefix_idx_vector.size(); ++idx) { + auto candidate_idx = prefix_idx_vector.at(idx); + auto cur_id = cur_ids.data()[candidate_idx]; + auto cur_score = cur_scores.data()[candidate_idx]; + if (cur_id != end_id_ || sentence_vector.at(idx).word_ids.empty()) { + // to skip redundant end tokens + sentence_vector.at(idx).word_ids.push_back(cur_id); + sentence_vector.at(idx).scores.push_back(cur_score); + } + + while (src_candidate_start + candidate_num <= + candidate_idx) { // search the corresponding prefix + prefix_idx++; + candidate_num += cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] - + cur_ids.lod().at(kSentenceLevel)[prefix_idx]; + } + prefix_idx_vector.at(idx) = prefix_idx; + } + } + } + } + + ConvertSentenceVectorToLodTensor( + sentence_vector_list, id_tensor, score_tensor, true, true); +} + +} // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/beam_search_decode_op_xpu.cc b/paddle/fluid/operators/beam_search_decode_op_xpu.cc new file mode 100644 index 00000000000..cfea2f57da2 --- /dev/null +++ b/paddle/fluid/operators/beam_search_decode_op_xpu.cc @@ -0,0 +1,120 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/beam_search_decode_op_xpu.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class BeamSearchDecodeXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const LoDTensorArray* ids = context.Input("Ids"); + const LoDTensorArray* scores = context.Input("Scores"); + const size_t step_num = ids->size(); + PADDLE_ENFORCE_GT( + step_num, + 0UL, + platform::errors::InvalidArgument( + "beam search steps, which is the" + "size of Input(Ids) LoDTensorArray. beam search steps should " + "be larger than 0, but received %d. ", + step_num)); + + const size_t source_num = ids->at(0).lod().at(0).size() - 1; + PADDLE_ENFORCE_GT( + source_num, + 0UL, + platform::errors::InvalidArgument( + "source_num is the sequence number of the" + "first decoding step, indicating by Input(Ids)[0].lod[0].size. " + "The number of source_num should be larger than" + "0, but received %d. ", + source_num)); + + for (size_t i = 0; i < step_num; ++i) { + PADDLE_ENFORCE_EQ( + ids->at(i).lod().size(), + 2UL, + platform::errors::InvalidArgument( + "For the i step in beam search steps," + "the size of Input(Ids)[i].lod() should larger than 2," + "but received %d. ", + ids->at(i).lod().size())); + } + + size_t beam_size = context.Attr("beam_size"); + int end_id = context.Attr("end_id"); + + // prepare output + LoDTensor* sentenceIds = nullptr; + LoDTensor* sentenceScores = nullptr; + + LoDTensor* sentenceIds_temp = context.Output("SentenceIds"); + LoDTensor* sentenceScores_temp = + context.Output("SentenceScores"); + + if (platform::is_xpu_place(ids->at(0).place())) { + sentenceIds = new LoDTensor(); + sentenceIds->set_lod(sentenceIds_temp->lod()); + } + + if (platform::is_xpu_place(ids->at(0).place())) { + sentenceScores = new LoDTensor(); + sentenceScores->set_lod(sentenceScores_temp->lod()); + } + + BeamSearchDecodeXPUFunctor bs_xpu( + *ids, *scores, sentenceIds, sentenceScores, beam_size, end_id); + bs_xpu.apply_xpu(); + + if (platform::is_xpu_place(ids->at(0).place())) { + int r = 0; + r = CopyTensorByXPU( + *sentenceIds, sentenceIds_temp, 1, ids->at(0).place()); + PADDLE_ENFORCE_EQ( + r, + xpu::Error_t::SUCCESS, + platform::errors::External( + "Execute function CopyTensorByXPU failed by [%d]", r)); + + r = CopyTensorByType( + *sentenceScores, sentenceScores_temp, 1, ids->at(0).place()); + PADDLE_ENFORCE_EQ( + r, + xpu::Error_t::SUCCESS, + platform::errors::External( + "Execute function CopyTensorByXPU failed by [%d]", r)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + beam_search_decode, + ops::BeamSearchDecodeXPUKernel, + ops::BeamSearchDecodeXPUKernel, + ops::BeamSearchDecodeXPUKernel, + ops::BeamSearchDecodeXPUKernel, + ops::BeamSearchDecodeXPUKernel); +#endif diff --git a/paddle/fluid/operators/beam_search_decode_op_xpu.h b/paddle/fluid/operators/beam_search_decode_op_xpu.h new file mode 100644 index 00000000000..839cfb1ce22 --- /dev/null +++ b/paddle/fluid/operators/beam_search_decode_op_xpu.h @@ -0,0 +1,172 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include "paddle/fluid/operators/beam_search_decode_op_def.h" + +namespace paddle { +namespace operators { + +int SetMeta(const LoDTensor& srcTensor, LoDTensor* dstTensor) { + if (srcTensor.dtype() == paddle::experimental::DataType::INT32 || + srcTensor.dtype() == paddle::experimental::DataType::INT64 || + srcTensor.dtype() == paddle::experimental::DataType::FLOAT32 || + srcTensor.dtype() == paddle::experimental::DataType::FLOAT16 || + srcTensor.dtype() == paddle::experimental::DataType::FLOAT64) { + const phi::DenseTensorMeta meta_data(srcTensor.dtype(), srcTensor.dims()); + dstTensor->set_meta(meta_data); + } else { + return xpu::Error_t::INVALID_PARAM; + } + + return xpu::Error_t::SUCCESS; +} +template +int CopyTensorByXPU(const LoDTensor& srcTensor, + LoDTensor* dstTensor, + int flag, + const Place& place) { + const T* srcData = srcTensor.template data(); + if (nullptr == srcData || nullptr == dstTensor || flag < 0 || flag > 1) + return xpu::Error_t::INVALID_PARAM; + + int r = SetMeta(srcTensor, dstTensor); + PADDLE_ENFORCE_EQ( + r, + xpu::Error_t::SUCCESS, + platform::errors::External("Execute function SetMeta failed by [%d]", r)); + + if (flag == 0) { + T* dstData = + dstTensor->template mutable_data(paddle::platform::CPUPlace()); + paddle::memory::Copy(paddle::platform::CPUPlace(), + dstData, + place, + srcData, + srcTensor.numel() * sizeof(T)); + } else { + T* dstData = dstTensor->template mutable_data(place); + paddle::memory::Copy(place, + dstData, + paddle::platform::CPUPlace(), + srcData, + srcTensor.numel() * sizeof(T)); + } + + return xpu::Error_t::SUCCESS; +} + +const int CopyTensorByType(const LoDTensor& srcTensor, + LoDTensor* dstTensor, + int flag, + const Place& place) { + int r = 0; + if (srcTensor.dtype() == paddle::experimental::DataType::FLOAT32) + r = CopyTensorByXPU(srcTensor, dstTensor, flag, place); + else if (srcTensor.dtype() == paddle::experimental::DataType::FLOAT16) + r = CopyTensorByXPU( + srcTensor, dstTensor, flag, place); + else if (srcTensor.dtype() == paddle::experimental::DataType::FLOAT64) + r = CopyTensorByXPU(srcTensor, dstTensor, flag, place); + else if (srcTensor.dtype() == paddle::experimental::DataType::INT32) + r = CopyTensorByXPU(srcTensor, dstTensor, flag, place); + else if (srcTensor.dtype() == paddle::experimental::DataType::INT64) + r = CopyTensorByXPU(srcTensor, dstTensor, flag, place); + else + return xpu::Error_t::INVALID_PARAM; + + PADDLE_ENFORCE_EQ(r, + xpu::Error_t::SUCCESS, + platform::errors::External( + "Execute function CopyTensorByXPU failed by [%d]", r)); + + return xpu::Error_t::SUCCESS; +} + +struct BeamSearchDecodeXPUFunctor { + BeamSearchDecodeXPUFunctor(const LoDTensorArray& step_ids, + const LoDTensorArray& step_scores, + LoDTensor* id_tensor, + LoDTensor* score_tensor, + size_t beam_size, + int end_id) + : beam_size_(beam_size), + end_id_(end_id), + id_tensor_(id_tensor), + score_tensor_(score_tensor) { + int r = 0; + + // First make a copy of XPU data on CPU + if (platform::is_xpu_place(step_ids[0].place())) { + // Copy all tensors in the input tensor array + for (auto& step_id : step_ids) { + framework::LoDTensor out; + if (step_id.numel() > 0) { + r = CopyTensorByType(step_id, &out, 0, step_ids[0].place()); + PADDLE_ENFORCE_EQ( + r, + xpu::Error_t::SUCCESS, + platform::errors::External( + "Execute function CopyTensorByXPU failed by [%d]", r)); + } + + out.set_lod(step_id.lod()); + step_ids_.push_back(out); + } + } + + if (platform::is_xpu_place(step_scores[0].place())) { + // Copy all tensors in the input tensor array + for (auto& step_score : step_scores) { + framework::LoDTensor out; + if (step_score.numel() > 0) { + r = CopyTensorByType(step_score, &out, 0, step_scores[0].place()); + PADDLE_ENFORCE_EQ( + r, + xpu::Error_t::SUCCESS, + platform::errors::External( + "Execute function CopyTensorByType failed by [%d]", r)); + } + + out.set_lod(step_score.lod()); + step_scores_.push_back(out); + } + } + } + + template + void apply_xpu() const { + if (std::is_same::value) { + PADDLE_THROW(platform::errors::InvalidArgument( + "beam search decode op does not support bool!")); + } else { + BeamSearchDecoder beam_search_decoder(beam_size_, end_id_); + beam_search_decoder.Backtrace( + step_ids_, step_scores_, id_tensor_, score_tensor_); + } + } + + size_t beam_size_; + int end_id_; + // TODO(Superjomn) Here might result serious performance issue in the + // concurrency + // scenarios. + LoDTensorArray step_ids_ = LoDTensorArray(); + LoDTensorArray step_scores_ = LoDTensorArray(); + LoDTensor* id_tensor_; + LoDTensor* score_tensor_; +}; + +} // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/beam_search_decode_op_xpu_test.cc b/paddle/fluid/operators/beam_search_decode_op_xpu_test.cc new file mode 100644 index 00000000000..7dd45371f6b --- /dev/null +++ b/paddle/fluid/operators/beam_search_decode_op_xpu_test.cc @@ -0,0 +1,187 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/beam_search_decode_op_xpu.h" + +#include "gtest/gtest.h" + +using CPUPlace = paddle::platform::CPUPlace; +using XPUPlace = paddle::platform::XPUPlace; +using LoD = paddle::framework::LoD; +using LoDTensor = paddle::framework::LoDTensor; +using LoDTensorArray = paddle::framework::LoDTensorArray; + +template +using BeamSearchDecoder = paddle::operators::BeamSearchDecoder; +template +using Sentence = paddle::operators::Sentence; +template +using SentenceVector = paddle::operators::SentenceVector; + +namespace paddle { +namespace test { + +void GenerateXPUExample(const std::vector& level_0, + const std::vector& level_1, + const std::vector& data, + LoDTensorArray* ids, + LoDTensorArray* scores) { + PADDLE_ENFORCE_EQ(level_0.back(), + level_1.size() - 1, + platform::errors::InvalidArgument( + "source level is used to describe candidate set" + ", so it's element should less than levle_1 length. " + "And the value of source" + "level is %d. ", + level_1.size() - 1)); + PADDLE_ENFORCE_EQ(level_1.back(), + data.size(), + platform::errors::InvalidArgument( + "the lowest level is used to describe data" + ", so it's last element should be data length %d. ", + data.size())); + + CPUPlace place; + int XPU_PlaceNo = 0; + if (std::getenv("FLAGS_selected_xpus") != nullptr) + XPU_PlaceNo = atoi(std::getenv("FLAGS_selected_xpus")); + else if (std::getenv("XPU_VISIBLE_DEVICES") != nullptr) + XPU_PlaceNo = atoi(std::getenv("XPU_VISIBLE_DEVICES")); + + XPUPlace xpu_place(XPU_PlaceNo); + + LoD lod; + lod.push_back(level_0); + lod.push_back(level_1); + + // Ids + LoDTensor tensor_id_cpu; + tensor_id_cpu.set_lod(lod); + tensor_id_cpu.Resize({static_cast(data.size())}); + // malloc memory + int64_t* id_cpu_ptr = tensor_id_cpu.mutable_data(place); + for (size_t i = 0; i < data.size(); ++i) { + id_cpu_ptr[i] = static_cast(data.at(i)); + } + + LoDTensor tensor_id; + const phi::DenseTensorMeta meta_data_id(paddle::experimental::DataType::INT64, + tensor_id_cpu.dims()); + tensor_id.set_meta(meta_data_id); + tensor_id.set_lod(lod); + + int64_t* id_ptr = tensor_id.mutable_data(xpu_place); + paddle::memory::Copy(paddle::platform::XPUPlace(XPU_PlaceNo), + id_ptr, + paddle::platform::CPUPlace(), + id_cpu_ptr, + tensor_id_cpu.numel() * sizeof(int64_t)); + + // Scores + LoDTensor tensor_score_cpu; + tensor_score_cpu.set_lod(lod); + tensor_score_cpu.Resize({static_cast(data.size())}); + // malloc memory + float* score_cpu_ptr = tensor_score_cpu.mutable_data(place); + for (size_t i = 0; i < data.size(); ++i) { + score_cpu_ptr[i] = static_cast(data.at(i)); + } + + LoDTensor tensor_score; + const phi::DenseTensorMeta meta_data_score( + paddle::experimental::DataType::FLOAT32, tensor_score_cpu.dims()); + tensor_score.set_meta(meta_data_score); + tensor_score.set_lod(lod); + + float* score_ptr = tensor_score.mutable_data(xpu_place); + paddle::memory::Copy(paddle::platform::XPUPlace(XPU_PlaceNo), + score_ptr, + paddle::platform::CPUPlace(), + score_cpu_ptr, + tensor_score_cpu.numel() * sizeof(float)); + + ids->push_back(tensor_id); + scores->push_back(tensor_score); +} + +} // namespace test +} // namespace paddle + +TEST(BeamSearchDecodeOpXPU, Backtrace) { + CPUPlace place; + + // Construct sample data with 5 steps and 2 source sentences + // beam_size = 2, start_id = 0, end_id = 1 + LoDTensorArray ids; + LoDTensorArray scores; + + paddle::test::GenerateXPUExample(std::vector{0, 1, 2}, + std::vector{0, 1, 2}, + std::vector{0, 0}, + &ids, + &scores); // start with start_id + paddle::test::GenerateXPUExample(std::vector{0, 1, 2}, + std::vector{0, 2, 4}, + std::vector{2, 3, 4, 5}, + &ids, + &scores); + paddle::test::GenerateXPUExample(std::vector{0, 2, 4}, + std::vector{0, 2, 2, 4, 4}, + std::vector{3, 1, 5, 4}, + &ids, + &scores); + paddle::test::GenerateXPUExample(std::vector{0, 2, 4}, + std::vector{0, 1, 2, 3, 4}, + std::vector{1, 1, 3, 5}, + &ids, + &scores); + paddle::test::GenerateXPUExample( + std::vector{0, 2, 4}, + std::vector{0, 0, 0, 2, 2}, // the branchs of the first source + // sentence are pruned since finished + std::vector{5, 1}, + &ids, + &scores); + + ASSERT_EQ(ids.size(), 5UL); + ASSERT_EQ(scores.size(), 5UL); + + LoDTensor id_tensor_cpu; + LoDTensor score_tensor_cpu; + + paddle::operators::BeamSearchDecodeXPUFunctor bs_xpu( + ids, scores, &id_tensor_cpu, &score_tensor_cpu, 2, 1); + bs_xpu.apply_xpu(); + + LoD lod = id_tensor_cpu.lod(); + std::vector expect_source_lod = {0, 2, 4}; + ASSERT_EQ(lod[0], expect_source_lod); + + std::vector expect_sentence_lod = {0, 4, 7, 12, 17}; + ASSERT_EQ(lod[1], expect_sentence_lod); + + std::vector expect_data = { + 0, 2, 3, 1, 0, 2, 1, 0, 4, 5, 3, 5, 0, 4, 5, 3, 1}; + ASSERT_EQ(id_tensor_cpu.dims()[0], static_cast(expect_data.size())); + + for (size_t i = 0; i < expect_data.size(); ++i) { + ASSERT_EQ(id_tensor_cpu.data()[i], + static_cast(expect_data[i])); + } + + for (int64_t i = 0; i < id_tensor_cpu.dims()[0]; ++i) { + ASSERT_EQ(score_tensor_cpu.data()[i], + static_cast(id_tensor_cpu.data()[i])); + } +} diff --git a/paddle/fluid/operators/beam_search_op_xpu.cc b/paddle/fluid/operators/beam_search_op_xpu.cc index a2dea5214b8..ab52f09c2b6 100644 --- a/paddle/fluid/operators/beam_search_op_xpu.cc +++ b/paddle/fluid/operators/beam_search_op_xpu.cc @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef PADDLE_WITH_XPU + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/beam_search_op.h" @@ -22,3 +24,4 @@ REGISTER_OP_XPU_KERNEL( ops::BeamSearchOpKernel, ops::BeamSearchOpKernel, ops::BeamSearchOpKernel); +#endif diff --git a/paddle/fluid/operators/math/beam_search_xpu.cc b/paddle/fluid/operators/math/beam_search_xpu.cc index 85869845882..9904c142e5a 100644 --- a/paddle/fluid/operators/math/beam_search_xpu.cc +++ b/paddle/fluid/operators/math/beam_search_xpu.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/math/beam_search.h" namespace phi { @@ -28,6 +29,28 @@ class XPUDeviceContext; namespace paddle { namespace operators { namespace math { +template +int CopyData(const T *x, T **y, int len, const Place &place) { + if (nullptr == x || nullptr == y || len <= 0) + return xpu::Error_t::INVALID_PARAM; + + *y = reinterpret_cast(malloc(sizeof(T) * len)); + + paddle::memory::Copy( + paddle::platform::CPUPlace(), *y, place, x, len * sizeof(T)); + return xpu::Error_t::SUCCESS; +} + +template +void CopyDataByCondition(const T *x, T **y, int len, const Place &place) { + if (x != nullptr) { + int r = CopyData(x, y, len, place); + PADDLE_ENFORCE_EQ( + r, + xpu::Error_t::SUCCESS, + platform::errors::External("Copy data form xpu to cpu failed")); + } +} template class BeamSearchFunctor { @@ -54,7 +77,8 @@ class BeamSearchFunctor { level, beam_size, end_id, - is_accumulated); + is_accumulated, + ids->place()); auto selected_items = ToMap(items, high_level.back()); if (FLAGS_v == 3) { VLOG(3) << "selected_items:"; @@ -66,7 +90,8 @@ class BeamSearchFunctor { } } - PruneEndBeams(pre_ids, abs_lod, &selected_items, level, end_id); + PruneEndBeams( + pre_ids, abs_lod, &selected_items, level, end_id, ids->place()); // calculate the output tensor's height size_t num_instances = std::accumulate( std::begin(selected_items), @@ -164,8 +189,13 @@ class BeamSearchFunctor { const framework::LoD &abs_lod, std::vector> *items, size_t lod_level, - int end_id) { - auto *pre_ids_data = pre_ids->data(); + int end_id, + const Place &place) { + auto *pre_ids_data_xpu = pre_ids->data(); + int64_t *pre_ids_data = nullptr; + CopyDataByCondition( + pre_ids_data_xpu, &pre_ids_data, pre_ids->numel(), place); + auto &high_level = abs_lod[lod_level]; for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) { size_t src_prefix_start = high_level[src_idx]; @@ -189,6 +219,7 @@ class BeamSearchFunctor { items->at(offset).clear(); } } + free(pre_ids_data); } /* @@ -213,6 +244,7 @@ class BeamSearchFunctor { std::vector &top_beam = *top_beam_ptr; size_t num_beams = top_beam.size(); + if (num_beams < beam_size) { top_beam.resize(num_beams + 1); num_beams++; @@ -244,17 +276,31 @@ class BeamSearchFunctor { size_t lod_level, size_t beam_size, int end_id, - bool is_accumulated) { + bool is_accumulated, + const Place &place) { std::vector> result; // find the current candidates auto abs_lod = framework::ToAbsOffset(scores->lod()); - auto *pre_ids_data = pre_ids->data(); - auto *pre_scores_data = pre_scores->data(); + auto *pre_ids_data_xpu = pre_ids->data(); + int64_t *pre_ids_data = nullptr; + CopyDataByCondition( + pre_ids_data_xpu, &pre_ids_data, pre_ids->numel(), place); + + auto *pre_scores_data_xpu = pre_scores->data(); + float *pre_scores_data = nullptr; + CopyDataByCondition( + pre_scores_data_xpu, &pre_scores_data, pre_scores->numel(), place); + + auto *ids_data_xpu = ids ? ids->data() : nullptr; + int64_t *ids_data = nullptr; + CopyDataByCondition(ids_data_xpu, &ids_data, ids->numel(), place); - auto *ids_data = ids ? ids->data() : nullptr; - auto *scores_data = scores->data(); + auto *scores_data_xpu = scores->data(); + float *scores_data = nullptr; + CopyDataByCondition( + scores_data_xpu, &scores_data, scores->numel(), place); size_t num_seqs = scores->NumElements(lod_level); size_t seq_width = 1; @@ -273,6 +319,7 @@ class BeamSearchFunctor { ++offset) { auto pre_id = pre_ids_data[offset]; auto pre_score = pre_scores_data[offset]; + if (pre_id == end_id) { // Allocate all probability mass to end_id for finished branchs and // the other candidate ids can be ignored. @@ -285,6 +332,7 @@ class BeamSearchFunctor { float score = is_accumulated ? scores_data[index] : pre_score + std::log(scores_data[index]); + Item item(offset, id, score); Insert(&top_beam, item, beam_size); } @@ -304,6 +352,11 @@ class BeamSearchFunctor { } } + free(pre_ids_data); + free(pre_scores_data); + free(ids_data); + free(scores_data); + return result; } }; diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 7fedc9c7607..68bab4980a6 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -61,6 +61,12 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace())})}, + {"beam_search_decode", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace())})}, {"bilinear_interp_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"bilinear_interp_v2_grad", -- GitLab