Unverified · Commit adaffb7b · authored by mengqingchun02, committed by GitHub

Support beam search decode op in XPU environment (#44917)

* support beam_search operator on xpu. test=kunlun

* fix beam_search operator bugs on xpu. test=kunlun

* support beam_search_decode operator on xpu. test=kunlun
Parent dc331231
......@@ -163,6 +163,7 @@ if (WITH_GPU OR WITH_ROCM)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor)
endif()
if(WITH_XPU)
cc_test(beam_search_decode_op_xpu_test SRCS beam_search_decode_op_xpu_test.cc DEPS lod_tensor)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xpulib)
endif()
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} device_memory_aligment)
......
......@@ -13,195 +13,24 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/beam_search_decode_op.h"
#include <string>
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
class InferShapeContext;
class OpDesc;
class Scope;
template <typename T>
class EmptyGradOpMaker;
} // namespace framework
namespace imperative {
class OpBase;
} // namespace imperative
} // namespace paddle
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
struct BeamSearchDecodeFunctor {
BeamSearchDecodeFunctor(const LoDTensorArray& step_ids,
const LoDTensorArray& step_scores,
LoDTensor* id_tensor,
LoDTensor* score_tensor,
size_t beam_size,
int end_id)
: beam_size_(beam_size),
end_id_(end_id),
step_ids_origin_(step_ids),
step_scores_origin_(step_scores),
id_tensor_(id_tensor),
score_tensor_(score_tensor) {
tensor_on_gpu_ = false;
tensor_on_npu_ = false;
// First make a copy of GPU data on CPU
if (platform::is_gpu_place(step_ids_origin_[0].place()) ||
platform::is_npu_place(step_ids_origin_[0].place())) {
if (platform::is_gpu_place(step_ids_origin_[0].place())) {
tensor_on_gpu_ = true;
} else {
tensor_on_npu_ = true;
}
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(step_ids_origin_[0].place());
// Copy all tensors in the input tensor array
for (auto& step_id : step_ids_origin_) {
framework::LoDTensor out;
if (step_id.numel() > 0) {
if (tensor_on_gpu_) {
dev_ctx->Wait();
}
framework::TensorCopy(step_id, platform::CPUPlace(), *dev_ctx, &out);
dev_ctx->Wait();
}
out.set_lod(step_id.lod());
step_ids_.push_back(out);
}
}
if (platform::is_gpu_place(step_scores_origin_[0].place()) ||
platform::is_npu_place(step_scores_origin_[0].place())) {
if (platform::is_gpu_place(step_scores_origin_[0].place())) {
tensor_on_gpu_ = true;
} else {
tensor_on_npu_ = true;
}
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(step_scores_origin_[0].place());
// Copy all tensors in the input tensor array
for (auto& step_score : step_scores_origin_) {
framework::LoDTensor out;
if (step_score.numel() > 0) {
if (tensor_on_gpu_) {
dev_ctx->Wait();
}
framework::TensorCopy(
step_score, platform::CPUPlace(), *dev_ctx, &out);
dev_ctx->Wait();
}
out.set_lod(step_score.lod());
step_scores_.push_back(out);
}
}
}
template <typename T>
void apply() const;
bool tensor_on_gpu_;
bool tensor_on_npu_;
size_t beam_size_;
int end_id_;
  // TODO(Superjomn): this might cause serious performance issues in
  // concurrent scenarios.
const LoDTensorArray& step_ids_origin_;
const LoDTensorArray& step_scores_origin_;
LoDTensorArray step_ids_ = LoDTensorArray();
LoDTensorArray step_scores_ = LoDTensorArray();
LoDTensor* id_tensor_;
LoDTensor* score_tensor_;
};
template <typename T>
void BeamSearchDecodeFunctor::apply() const {
BeamSearchDecoder<T> beam_search_decoder(beam_size_, end_id_);
// Check if the tensor is on GPU or NPU. If so, use the CPU copy instead
if (tensor_on_gpu_ || tensor_on_npu_) {
beam_search_decoder.Backtrace(
step_ids_, step_scores_, id_tensor_, score_tensor_);
} else {
beam_search_decoder.Backtrace(
step_ids_origin_, step_scores_origin_, id_tensor_, score_tensor_);
}
}
template <>
void BeamSearchDecodeFunctor::apply<bool>() const {
PADDLE_THROW(platform::errors::InvalidArgument(
"beam search decode op does not support bool!"));
}
class BeamSearchDecodeOp : public framework::OperatorBase {
 public:
  BeamSearchDecodeOp(const std::string& type,
                     const framework::VariableNameMap& inputs,
                     const framework::VariableNameMap& outputs,
                     const framework::AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}

 private:
  void RunImpl(const framework::Scope& scope,
               const platform::Place& dev_place) const override {
    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
    auto& dev_ctx = *pool.Get(dev_place);

    framework::RuntimeContext run_ctx(Inputs(), Outputs(), scope);
    framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx);

    const LoDTensorArray* ids = ctx.Input<LoDTensorArray>("Ids");
    const LoDTensorArray* scores = ctx.Input<LoDTensorArray>("Scores");
    const size_t step_num = ids->size();
    PADDLE_ENFORCE_GT(
        step_num,
        0UL,
        platform::errors::InvalidArgument(
            "beam search steps, which is the "
            "size of Input(Ids) LoDTensorArray, should "
            "be larger than 0, but received %d. ",
            step_num));
    const size_t source_num = ids->at(0).lod().at(0).size() - 1;
    PADDLE_ENFORCE_GT(
        source_num,
        0UL,
        platform::errors::InvalidArgument(
            "source_num is the sequence number of the "
            "first decoding step, indicated by Input(Ids)[0].lod[0].size. "
            "source_num should be larger than "
            "0, but received %d. ",
            source_num));

    for (size_t i = 0; i < step_num; ++i) {
      PADDLE_ENFORCE_EQ(
          ids->at(i).lod().size(),
          2UL,
          platform::errors::InvalidArgument(
              "For the i-th step in beam search steps, "
              "the size of Input(Ids)[i].lod() should be 2, "
              "but received %d. ",
              ids->at(i).lod().size()));
    }

    size_t beam_size = ctx.Attr<int>("beam_size");
    int end_id = ctx.Attr<int>("end_id");

    // prepare output
    LoDTensor* sentenceIds = ctx.Output<LoDTensor>("SentenceIds");
    LoDTensor* sentenceScores = ctx.Output<LoDTensor>("SentenceScores");

    framework::VisitDataType(
        framework::TransToProtoVarType(scores->at(0).dtype()),
        BeamSearchDecodeFunctor(
            *ids, *scores, sentenceIds, sentenceScores, beam_size, end_id));
  }
};

class BeamSearchDecodeOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext *ctx) const override {
    for (const std::string &arg : std::vector<std::string>({"Ids", "Scores"})) {
      OP_INOUT_CHECK(ctx->HasInput(arg), "Input", arg, "BeamSearchDecode");
    }
    for (const std::string &arg :
         std::vector<std::string>({"SentenceIds", "SentenceScores"})) {
      OP_INOUT_CHECK(ctx->HasOutput(arg), "Output", arg, "BeamSearchDecode");
    }
  }
};
......@@ -240,7 +69,7 @@ hypothesis has.
}
};
class BeamSearchDecodeInferShape : public framework::InferShapeBase {
/*class BeamSearchDecodeInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* context) const override {
OP_INOUT_CHECK(
......@@ -256,11 +85,11 @@ class BeamSearchDecodeInferShape : public framework::InferShapeBase {
"SentenceScores",
"BeamSearchDecode");
}
};
};*/
class BeamSearchDecodeInferVarType : public framework::VarTypeInference {
public:
void operator()(framework::InferVarTypeContext* ctx) const override {
void operator()(framework::InferVarTypeContext *ctx) const override {
ctx->SetOutputType("SentenceIds",
framework::proto::VarType::LOD_TENSOR,
framework::ALL_ELEMENTS);
......@@ -273,11 +102,16 @@ class BeamSearchDecodeInferVarType : public framework::VarTypeInference {
} // namespace operators
} // namespace paddle
REGISTER_OPERATOR(
    beam_search_decode,
    paddle::operators::BeamSearchDecodeOp,
    paddle::operators::BeamSearchDecodeOpProtoMaker,
    paddle::operators::BeamSearchDecodeInferShape,
    paddle::operators::BeamSearchDecodeInferVarType,
    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);

namespace ops = paddle::operators;
REGISTER_OPERATOR(beam_search_decode,
                  paddle::operators::BeamSearchDecodeOp,
                  paddle::operators::BeamSearchDecodeOpProtoMaker,
                  paddle::operators::BeamSearchDecodeInferVarType);

REGISTER_OP_CPU_KERNEL(
    beam_search_decode,
    ops::BeamSearchDecodeOpKernel<phi::CPUContext, float>,
    ops::BeamSearchDecodeOpKernel<phi::CPUContext, double>,
    ops::BeamSearchDecodeOpKernel<phi::CPUContext, paddle::platform::float16>,
    ops::BeamSearchDecodeOpKernel<phi::CPUContext, int>,
    ops::BeamSearchDecodeOpKernel<phi::CPUContext, int64_t>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/beam_search_decode_op.h"
#include "paddle/fluid/framework/op_registry.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
beam_search_decode,
ops::BeamSearchDecodeOpKernel<phi::GPUContext, float>,
ops::BeamSearchDecodeOpKernel<phi::GPUContext, double>,
ops::BeamSearchDecodeOpKernel<phi::GPUContext, paddle::platform::float16>,
ops::BeamSearchDecodeOpKernel<phi::GPUContext, int>,
ops::BeamSearchDecodeOpKernel<phi::GPUContext, int64_t>);
......@@ -14,231 +14,164 @@ limitations under the License. */
#pragma once

#include <algorithm>
#include <memory>
#include <vector>

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/beam_search_decode_op_def.h"

namespace paddle {
namespace operators {

// Copies GPU/NPU step tensors to the CPU (if needed) and backtracks the
// beams on the host.
struct BeamSearchDecodeFunctor {
  BeamSearchDecodeFunctor(const LoDTensorArray& step_ids,
                          const LoDTensorArray& step_scores,
                          LoDTensor* id_tensor,
                          LoDTensor* score_tensor,
                          size_t beam_size,
                          int end_id)
      : beam_size_(beam_size),
        end_id_(end_id),
        step_ids_origin_(step_ids),
        step_scores_origin_(step_scores),
        id_tensor_(id_tensor),
        score_tensor_(score_tensor) {
    tensor_on_gpu_ = false;
    tensor_on_npu_ = false;
    // First make a copy of GPU data on CPU
    if (platform::is_gpu_place(step_ids_origin_[0].place()) ||
        platform::is_npu_place(step_ids_origin_[0].place())) {
      if (platform::is_gpu_place(step_ids_origin_[0].place())) {
        tensor_on_gpu_ = true;
      } else {
        tensor_on_npu_ = true;
      }
      platform::DeviceContextPool& pool =
          platform::DeviceContextPool::Instance();
      auto* dev_ctx = pool.Get(step_ids_origin_[0].place());
      // Copy all tensors in the input tensor array
      for (auto& step_id : step_ids_origin_) {
        framework::LoDTensor out;
        if (step_id.numel() > 0) {
          if (tensor_on_gpu_) {
            dev_ctx->Wait();
          }
          framework::TensorCopy(step_id, platform::CPUPlace(), *dev_ctx, &out);
          dev_ctx->Wait();
        }
        out.set_lod(step_id.lod());
        step_ids_.push_back(out);
      }
    }
    if (platform::is_gpu_place(step_scores_origin_[0].place()) ||
        platform::is_npu_place(step_scores_origin_[0].place())) {
      if (platform::is_gpu_place(step_scores_origin_[0].place())) {
        tensor_on_gpu_ = true;
      } else {
        tensor_on_npu_ = true;
      }
      platform::DeviceContextPool& pool =
          platform::DeviceContextPool::Instance();
      auto* dev_ctx = pool.Get(step_scores_origin_[0].place());
      // Copy all tensors in the input tensor array
      for (auto& step_score : step_scores_origin_) {
        framework::LoDTensor out;
        if (step_score.numel() > 0) {
          if (tensor_on_gpu_) {
            dev_ctx->Wait();
          }
          framework::TensorCopy(
              step_score, platform::CPUPlace(), *dev_ctx, &out);
          dev_ctx->Wait();
        }
        out.set_lod(step_score.lod());
        step_scores_.push_back(out);
      }
    }
  }

  template <typename T>
  void apply_mix() const {
    if (std::is_same<bool, T>::value) {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "beam search decode op does not support bool!"));
    } else {
      BeamSearchDecoder<T> beam_search_decoder(beam_size_, end_id_);
      // Check if the tensor is on GPU or NPU. If so, use the CPU copy instead
      if (tensor_on_gpu_ || tensor_on_npu_) {
        beam_search_decoder.Backtrace(
            step_ids_, step_scores_, id_tensor_, score_tensor_);
      } else {
        beam_search_decoder.Backtrace(
            step_ids_origin_, step_scores_origin_, id_tensor_, score_tensor_);
      }
    }
  }

  bool tensor_on_gpu_;
  bool tensor_on_npu_;
  size_t beam_size_;
  int end_id_;
  // TODO(Superjomn): this might cause serious performance issues in
  // concurrent scenarios.
  const LoDTensorArray& step_ids_origin_;
  const LoDTensorArray& step_scores_origin_;
  LoDTensorArray step_ids_ = LoDTensorArray();
  LoDTensorArray step_scores_ = LoDTensorArray();
  LoDTensor* id_tensor_;
  LoDTensor* score_tensor_;
};

template <typename DeviceContext, typename T>
class BeamSearchDecodeOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    const LoDTensorArray* ids = context.Input<LoDTensorArray>("Ids");
    const LoDTensorArray* scores = context.Input<LoDTensorArray>("Scores");
    const size_t step_num = ids->size();
    PADDLE_ENFORCE_GT(
        step_num,
        0UL,
        platform::errors::InvalidArgument(
            "beam search steps, which is the "
            "size of Input(Ids) LoDTensorArray, should "
            "be larger than 0, but received %d. ",
            step_num));
    const size_t source_num = ids->at(0).lod().at(0).size() - 1;
    PADDLE_ENFORCE_GT(
        source_num,
        0UL,
        platform::errors::InvalidArgument(
            "source_num is the sequence number of the "
            "first decoding step, indicated by Input(Ids)[0].lod[0].size. "
            "source_num should be larger than "
            "0, but received %d. ",
            source_num));

    for (size_t i = 0; i < step_num; ++i) {
      PADDLE_ENFORCE_EQ(
          ids->at(i).lod().size(),
          2UL,
          platform::errors::InvalidArgument(
              "For the i-th step in beam search steps, "
              "the size of Input(Ids)[i].lod() should be 2, "
              "but received %d. ",
              ids->at(i).lod().size()));
    }

    size_t beam_size = context.Attr<int>("beam_size");
    int end_id = context.Attr<int>("end_id");

    // prepare output
    LoDTensor* sentenceIds = context.Output<LoDTensor>("SentenceIds");
    LoDTensor* sentenceScores = context.Output<LoDTensor>("SentenceScores");

    BeamSearchDecodeFunctor bs(
        *ids, *scores, sentenceIds, sentenceScores, beam_size, end_id);
    bs.apply_mix<T>();
  }
};

}  // namespace operators
}  // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <memory>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
using LoDTensor = framework::LoDTensor;
using LoDTensorArray = framework::LoDTensorArray;
// All the LoDs have 2 levels.
// The first is the source level, the second is the sentence level.
// The source level describes how many prefixes (branches) each source
// sentence (beam) has; the sentence level describes how these candidates
// belong to the prefixes.
const size_t kSourceLevel = 0;
const size_t kSentenceLevel = 1;
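// For example, one decoding step that keeps 2 prefixes for source 0 and 1
// prefix for source 1, with 2, 1 and 3 candidates per prefix respectively,
// would carry (illustrative values, not from a real run):
//   lod[kSourceLevel]   = {0, 2, 3}     // source 0 owns prefixes [0, 2)
//   lod[kSentenceLevel] = {0, 2, 3, 6}  // prefix 0 owns candidates [0, 2)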
template <typename T>
struct Sentence {
std::vector<int64_t> word_ids;
std::vector<T> scores;
};
template <typename T>
using SentenceVector = std::vector<Sentence<T>>;
template <typename T>
struct BeamSearchDecoder {
BeamSearchDecoder(size_t beam_size, int end_id)
: beam_size_(beam_size), end_id_(end_id) {}
/**
 * Convert the result sentence_vector for each source sentence into two
 * LoDTensors.
 * One holds all candidate sentences' word ids, the other holds the
 * candidates' word scores.
 * Param:
 *  sentence_vector_list: sentence_vector for each source sentence.
 *  id_tensor: result LoDTensor for sentences of id.
 *  score_tensor: result LoDTensor for sentences of score.
 *  reverse: whether the ids of each sentence in sentence_vector_list are
 *    reversed.
 *  sort_by_score: whether to sort the hypotheses of each sentence by score.
*/
void ConvertSentenceVectorToLodTensor(
std::vector<SentenceVector<T>> sentence_vector_list,
LoDTensor* id_tensor,
LoDTensor* score_tensor,
bool reverse = true,
bool sort_by_score = true) const;
/**
 * Gather the hypotheses for each source sentence by backtracking through
 * the LoDTensorArray step_ids, whose LoDs preserve the paths in the tree.
*/
void Backtrace(const LoDTensorArray& step_ids,
const LoDTensorArray& step_scores,
LoDTensor* id_tensor,
LoDTensor* score_tensor) const;
size_t beam_size_;
int end_id_;
};
template <typename T>
void BeamSearchDecoder<T>::ConvertSentenceVectorToLodTensor(
std::vector<SentenceVector<T>> sentence_vector_list,
LoDTensor* id_tensor,
LoDTensor* score_tensor,
bool reverse,
bool sort_by_score) const {
size_t src_num = sentence_vector_list.size();
PADDLE_ENFORCE_NE(
src_num,
0,
      platform::errors::InvalidArgument(
          "src_num is the sequence number of the first decoding step, "
          "indicated by Input(Ids)[0].lod[0].size. "
          "src_num should not be 0, but received %d.",
          src_num));
std::vector<size_t> source_level_lod = {0};
std::vector<size_t> sentence_level_lod = {0};
std::vector<int64_t> id_data;
std::vector<T> score_data;
for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
if (sort_by_score) {
sort(sentence_vector_list[src_idx].begin(),
sentence_vector_list[src_idx].end(),
[reverse](const Sentence<T>& a, const Sentence<T>& b) {
if (reverse)
return a.scores.front() > b.scores.front();
else
return a.scores.back() > b.scores.back();
});
}
for (Sentence<T>& sentence : sentence_vector_list[src_idx]) {
if (reverse) {
id_data.insert(id_data.end(),
sentence.word_ids.rbegin(),
sentence.word_ids.rend());
score_data.insert(
score_data.end(), sentence.scores.rbegin(), sentence.scores.rend());
} else {
id_data.insert(
id_data.end(), sentence.word_ids.begin(), sentence.word_ids.end());
score_data.insert(
score_data.end(), sentence.scores.begin(), sentence.scores.end());
}
sentence_level_lod.push_back(sentence_level_lod.back() +
sentence.word_ids.size());
}
source_level_lod.push_back(source_level_lod.back() +
sentence_vector_list[src_idx].size());
}
auto cpu_place = std::unique_ptr<paddle::platform::CPUPlace>(
new paddle::platform::CPUPlace());
phi::CPUContext cpu_ctx(*cpu_place);
framework::LoD lod;
lod.push_back(source_level_lod);
lod.push_back(sentence_level_lod);
id_tensor->set_lod(lod);
id_tensor->Resize({static_cast<int64_t>(id_data.size())});
id_tensor->mutable_data<int64_t>(paddle::platform::CPUPlace());
framework::TensorFromVector<int64_t>(id_data, cpu_ctx, id_tensor);
score_tensor->set_lod(lod);
score_tensor->Resize({static_cast<int64_t>(score_data.size())});
score_tensor->mutable_data<T>(paddle::platform::CPUPlace());
framework::TensorFromVector<T>(score_data, cpu_ctx, score_tensor);
}
template <typename T>
void BeamSearchDecoder<T>::Backtrace(const LoDTensorArray& step_ids,
const LoDTensorArray& step_scores,
LoDTensor* id_tensor,
LoDTensor* score_tensor) const {
PADDLE_ENFORCE_NE(
step_ids.empty(),
true,
platform::errors::InvalidArgument("Input(Ids) should not be empty."
"But the Input(Ids) is empty."));
PADDLE_ENFORCE_EQ(
step_ids.size(),
step_scores.size(),
platform::errors::InvalidArgument(
"The size of Input(Ids) and Input(Scores) should be "
"the same. But the size of Input(Ids) and Input(Scores) "
"are not equal."));
const size_t step_num = step_ids.size();
const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1;
std::vector<SentenceVector<T>> sentence_vector_list(
src_num, SentenceVector<T>(beam_size_));
std::vector<std::vector<size_t>> prefix_idx_vector_list(src_num);
for (int step_id = step_num - 1; step_id >= 0; --step_id) {
auto& cur_ids = step_ids.at(step_id);
auto& cur_scores = step_scores.at(step_id);
for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
// for each source sentence
auto& sentence_vector = sentence_vector_list.at(src_idx);
auto& prefix_idx_vector = prefix_idx_vector_list.at(src_idx);
size_t src_prefix_start = cur_ids.lod().at(kSourceLevel)[src_idx];
size_t src_prefix_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1];
      if (prefix_idx_vector.empty()) {  // all prefixes finished and were
                                        // pruned at this step, or this is
                                        // the last time step
for (size_t prefix_idx = src_prefix_start; prefix_idx < src_prefix_end;
++prefix_idx) {
size_t candidate_start = cur_ids.lod().at(kSentenceLevel)[prefix_idx];
size_t candidate_end =
cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1];
for (size_t candidate_idx = candidate_start;
candidate_idx < candidate_end;
++candidate_idx) {
prefix_idx_vector.push_back(prefix_idx);
size_t idx = prefix_idx_vector.size() - 1;
auto cur_id = cur_ids.data<int64_t>()[candidate_idx];
auto cur_score = cur_scores.data<T>()[candidate_idx];
sentence_vector.at(idx).word_ids.push_back(cur_id);
sentence_vector.at(idx).scores.push_back(cur_score);
}
}
} else { // use prefix_idx_vector to backtrace
size_t src_candidate_start =
cur_ids.lod().at(kSentenceLevel)[src_prefix_start];
size_t prefix_idx = src_prefix_start;
size_t candidate_num =
cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] -
cur_ids.lod().at(kSentenceLevel)[prefix_idx];
for (size_t idx = 0; idx < prefix_idx_vector.size(); ++idx) {
auto candidate_idx = prefix_idx_vector.at(idx);
auto cur_id = cur_ids.data<int64_t>()[candidate_idx];
auto cur_score = cur_scores.data<T>()[candidate_idx];
if (cur_id != end_id_ || sentence_vector.at(idx).word_ids.empty()) {
// to skip redundant end tokens
sentence_vector.at(idx).word_ids.push_back(cur_id);
sentence_vector.at(idx).scores.push_back(cur_score);
}
while (src_candidate_start + candidate_num <=
candidate_idx) { // search the corresponding prefix
prefix_idx++;
candidate_num += cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] -
cur_ids.lod().at(kSentenceLevel)[prefix_idx];
}
prefix_idx_vector.at(idx) = prefix_idx;
}
}
}
}
ConvertSentenceVectorToLodTensor(
sentence_vector_list, id_tensor, score_tensor, true, true);
}
} // namespace operators
}  // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/beam_search_decode_op_xpu.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
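// Decode on the host: validate the step LoDs, backtrack the beams via
// BeamSearchDecodeXPUFunctor (which copies the XPU tensors to the CPU), then
// copy the decoded ids/scores back into the XPU output tensors.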
template <typename DeviceContext, typename T>
class BeamSearchDecodeXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const LoDTensorArray* ids = context.Input<LoDTensorArray>("Ids");
const LoDTensorArray* scores = context.Input<LoDTensorArray>("Scores");
const size_t step_num = ids->size();
    PADDLE_ENFORCE_GT(
        step_num,
        0UL,
        platform::errors::InvalidArgument(
            "beam search steps, which is the "
            "size of Input(Ids) LoDTensorArray, should "
            "be larger than 0, but received %d. ",
            step_num));
const size_t source_num = ids->at(0).lod().at(0).size() - 1;
    PADDLE_ENFORCE_GT(
        source_num,
        0UL,
        platform::errors::InvalidArgument(
            "source_num is the sequence number of the "
            "first decoding step, indicated by Input(Ids)[0].lod[0].size. "
            "source_num should be larger than "
            "0, but received %d. ",
            source_num));
for (size_t i = 0; i < step_num; ++i) {
      PADDLE_ENFORCE_EQ(
          ids->at(i).lod().size(),
          2UL,
          platform::errors::InvalidArgument(
              "For the i-th step in beam search steps, "
              "the size of Input(Ids)[i].lod() should be 2, "
              "but received %d. ",
              ids->at(i).lod().size()));
}
size_t beam_size = context.Attr<int>("beam_size");
int end_id = context.Attr<int>("end_id");
// prepare output
LoDTensor* sentenceIds = nullptr;
LoDTensor* sentenceScores = nullptr;
LoDTensor* sentenceIds_temp = context.Output<LoDTensor>("SentenceIds");
LoDTensor* sentenceScores_temp =
context.Output<LoDTensor>("SentenceScores");
    if (platform::is_xpu_place(ids->at(0).place())) {
      sentenceIds = new LoDTensor();
      sentenceIds->set_lod(sentenceIds_temp->lod());
      sentenceScores = new LoDTensor();
      sentenceScores->set_lod(sentenceScores_temp->lod());
    }
BeamSearchDecodeXPUFunctor bs_xpu(
*ids, *scores, sentenceIds, sentenceScores, beam_size, end_id);
bs_xpu.apply_xpu<T>();
if (platform::is_xpu_place(ids->at(0).place())) {
int r = 0;
r = CopyTensorByXPU<int64_t>(
*sentenceIds, sentenceIds_temp, 1, ids->at(0).place());
PADDLE_ENFORCE_EQ(
r,
xpu::Error_t::SUCCESS,
platform::errors::External(
"Execute function CopyTensorByXPU failed by [%d]", r));
      r = CopyTensorByType(
          *sentenceScores, sentenceScores_temp, 1, ids->at(0).place());
      PADDLE_ENFORCE_EQ(
          r,
          xpu::Error_t::SUCCESS,
          platform::errors::External(
              "Execute function CopyTensorByType failed by [%d]", r));
      // the CPU-side temporaries are no longer needed once copied back
      delete sentenceIds;
      delete sentenceScores;
    }
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(
beam_search_decode,
ops::BeamSearchDecodeXPUKernel<paddle::platform::XPUDeviceContext, float>,
ops::BeamSearchDecodeXPUKernel<paddle::platform::XPUDeviceContext, double>,
ops::BeamSearchDecodeXPUKernel<paddle::platform::XPUDeviceContext,
paddle::platform::float16>,
ops::BeamSearchDecodeXPUKernel<paddle::platform::XPUDeviceContext, int>,
ops::BeamSearchDecodeXPUKernel<paddle::platform::XPUDeviceContext,
int64_t>);
#endif
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/operators/beam_search_decode_op_def.h"
namespace paddle {
namespace operators {
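// Copy dtype and dims from srcTensor into dstTensor's meta; only the dtypes
// registered for this op are accepted.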
int SetMeta(const LoDTensor& srcTensor, LoDTensor* dstTensor) {
if (srcTensor.dtype() == paddle::experimental::DataType::INT32 ||
srcTensor.dtype() == paddle::experimental::DataType::INT64 ||
srcTensor.dtype() == paddle::experimental::DataType::FLOAT32 ||
srcTensor.dtype() == paddle::experimental::DataType::FLOAT16 ||
srcTensor.dtype() == paddle::experimental::DataType::FLOAT64) {
const phi::DenseTensorMeta meta_data(srcTensor.dtype(), srcTensor.dims());
dstTensor->set_meta(meta_data);
} else {
return xpu::Error_t::INVALID_PARAM;
}
return xpu::Error_t::SUCCESS;
}
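// Copy a LoDTensor's payload between the host and an XPU device.
// flag == 0: srcTensor lives on `place` and is copied into a CPU dstTensor;
// flag == 1: srcTensor lives on the CPU and is copied to dstTensor on `place`.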
template <typename T>
int CopyTensorByXPU(const LoDTensor& srcTensor,
LoDTensor* dstTensor,
int flag,
const Place& place) {
const T* srcData = srcTensor.template data<T>();
if (nullptr == srcData || nullptr == dstTensor || flag < 0 || flag > 1)
return xpu::Error_t::INVALID_PARAM;
int r = SetMeta(srcTensor, dstTensor);
PADDLE_ENFORCE_EQ(
r,
xpu::Error_t::SUCCESS,
platform::errors::External("Execute function SetMeta failed by [%d]", r));
if (flag == 0) {
T* dstData =
dstTensor->template mutable_data<T>(paddle::platform::CPUPlace());
paddle::memory::Copy(paddle::platform::CPUPlace(),
dstData,
place,
srcData,
srcTensor.numel() * sizeof(T));
} else {
T* dstData = dstTensor->template mutable_data<T>(place);
paddle::memory::Copy(place,
dstData,
paddle::platform::CPUPlace(),
srcData,
srcTensor.numel() * sizeof(T));
}
return xpu::Error_t::SUCCESS;
}
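// Dispatch CopyTensorByXPU on srcTensor's runtime dtype; unsupported dtypes
// return INVALID_PARAM.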
const int CopyTensorByType(const LoDTensor& srcTensor,
LoDTensor* dstTensor,
int flag,
const Place& place) {
int r = 0;
if (srcTensor.dtype() == paddle::experimental::DataType::FLOAT32)
r = CopyTensorByXPU<float>(srcTensor, dstTensor, flag, place);
else if (srcTensor.dtype() == paddle::experimental::DataType::FLOAT16)
r = CopyTensorByXPU<paddle::platform::float16>(
srcTensor, dstTensor, flag, place);
else if (srcTensor.dtype() == paddle::experimental::DataType::FLOAT64)
r = CopyTensorByXPU<double>(srcTensor, dstTensor, flag, place);
else if (srcTensor.dtype() == paddle::experimental::DataType::INT32)
r = CopyTensorByXPU<int>(srcTensor, dstTensor, flag, place);
else if (srcTensor.dtype() == paddle::experimental::DataType::INT64)
r = CopyTensorByXPU<int64_t>(srcTensor, dstTensor, flag, place);
else
return xpu::Error_t::INVALID_PARAM;
PADDLE_ENFORCE_EQ(r,
xpu::Error_t::SUCCESS,
platform::errors::External(
"Execute function CopyTensorByXPU failed by [%d]", r));
return xpu::Error_t::SUCCESS;
}
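// XPU counterpart of BeamSearchDecodeFunctor: the constructor copies the
// step ids/scores from the XPU device to the CPU, and apply_xpu() runs the
// CPU Backtrace on those copies.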
struct BeamSearchDecodeXPUFunctor {
BeamSearchDecodeXPUFunctor(const LoDTensorArray& step_ids,
const LoDTensorArray& step_scores,
LoDTensor* id_tensor,
LoDTensor* score_tensor,
size_t beam_size,
int end_id)
: beam_size_(beam_size),
end_id_(end_id),
id_tensor_(id_tensor),
score_tensor_(score_tensor) {
int r = 0;
// First make a copy of XPU data on CPU
if (platform::is_xpu_place(step_ids[0].place())) {
// Copy all tensors in the input tensor array
for (auto& step_id : step_ids) {
framework::LoDTensor out;
if (step_id.numel() > 0) {
r = CopyTensorByType(step_id, &out, 0, step_ids[0].place());
PADDLE_ENFORCE_EQ(
r,
xpu::Error_t::SUCCESS,
platform::errors::External(
"Execute function CopyTensorByXPU failed by [%d]", r));
}
out.set_lod(step_id.lod());
step_ids_.push_back(out);
}
}
if (platform::is_xpu_place(step_scores[0].place())) {
// Copy all tensors in the input tensor array
for (auto& step_score : step_scores) {
framework::LoDTensor out;
if (step_score.numel() > 0) {
r = CopyTensorByType(step_score, &out, 0, step_scores[0].place());
PADDLE_ENFORCE_EQ(
r,
xpu::Error_t::SUCCESS,
platform::errors::External(
"Execute function CopyTensorByType failed by [%d]", r));
}
out.set_lod(step_score.lod());
step_scores_.push_back(out);
}
}
}
template <typename T>
void apply_xpu() const {
if (std::is_same<bool, T>::value) {
PADDLE_THROW(platform::errors::InvalidArgument(
"beam search decode op does not support bool!"));
} else {
BeamSearchDecoder<T> beam_search_decoder(beam_size_, end_id_);
beam_search_decoder.Backtrace(
step_ids_, step_scores_, id_tensor_, score_tensor_);
}
}
size_t beam_size_;
int end_id_;
  // TODO(Superjomn): this might cause serious performance issues in
  // concurrent scenarios.
LoDTensorArray step_ids_ = LoDTensorArray();
LoDTensorArray step_scores_ = LoDTensorArray();
LoDTensor* id_tensor_;
LoDTensor* score_tensor_;
};
} // namespace operators
}  // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/beam_search_decode_op_xpu.h"
#include "gtest/gtest.h"
using CPUPlace = paddle::platform::CPUPlace;
using XPUPlace = paddle::platform::XPUPlace;
using LoD = paddle::framework::LoD;
using LoDTensor = paddle::framework::LoDTensor;
using LoDTensorArray = paddle::framework::LoDTensorArray;
template <typename T>
using BeamSearchDecoder = paddle::operators::BeamSearchDecoder<T>;
template <typename T>
using Sentence = paddle::operators::Sentence<T>;
template <typename T>
using SentenceVector = paddle::operators::SentenceVector<T>;
namespace paddle {
namespace test {
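// Build one decoding step directly on the XPU device: the ints in `data` are
// materialized as an int64 id tensor and a float score tensor with LoD
// {level_0, level_1}, then appended to `ids` and `scores`.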
void GenerateXPUExample(const std::vector<size_t>& level_0,
const std::vector<size_t>& level_1,
const std::vector<int>& data,
LoDTensorArray* ids,
LoDTensorArray* scores) {
PADDLE_ENFORCE_EQ(level_0.back(),
level_1.size() - 1,
                    platform::errors::InvalidArgument(
                        "source level is used to describe the candidate set, "
                        "so its last element should be less than the length "
                        "of level_1. The value of the source "
                        "level is %d. ",
                        level_1.size() - 1));
PADDLE_ENFORCE_EQ(level_1.back(),
data.size(),
                    platform::errors::InvalidArgument(
                        "the lowest level is used to describe the data, "
                        "so its last element should be the data length %d. ",
                        data.size()));
CPUPlace place;
int XPU_PlaceNo = 0;
if (std::getenv("FLAGS_selected_xpus") != nullptr)
XPU_PlaceNo = atoi(std::getenv("FLAGS_selected_xpus"));
else if (std::getenv("XPU_VISIBLE_DEVICES") != nullptr)
XPU_PlaceNo = atoi(std::getenv("XPU_VISIBLE_DEVICES"));
XPUPlace xpu_place(XPU_PlaceNo);
LoD lod;
lod.push_back(level_0);
lod.push_back(level_1);
// Ids
LoDTensor tensor_id_cpu;
tensor_id_cpu.set_lod(lod);
tensor_id_cpu.Resize({static_cast<int64_t>(data.size())});
// malloc memory
int64_t* id_cpu_ptr = tensor_id_cpu.mutable_data<int64_t>(place);
for (size_t i = 0; i < data.size(); ++i) {
id_cpu_ptr[i] = static_cast<int64_t>(data.at(i));
}
LoDTensor tensor_id;
const phi::DenseTensorMeta meta_data_id(paddle::experimental::DataType::INT64,
tensor_id_cpu.dims());
tensor_id.set_meta(meta_data_id);
tensor_id.set_lod(lod);
int64_t* id_ptr = tensor_id.mutable_data<int64_t>(xpu_place);
paddle::memory::Copy(paddle::platform::XPUPlace(XPU_PlaceNo),
id_ptr,
paddle::platform::CPUPlace(),
id_cpu_ptr,
tensor_id_cpu.numel() * sizeof(int64_t));
// Scores
LoDTensor tensor_score_cpu;
tensor_score_cpu.set_lod(lod);
tensor_score_cpu.Resize({static_cast<int64_t>(data.size())});
// malloc memory
float* score_cpu_ptr = tensor_score_cpu.mutable_data<float>(place);
for (size_t i = 0; i < data.size(); ++i) {
score_cpu_ptr[i] = static_cast<float>(data.at(i));
}
LoDTensor tensor_score;
const phi::DenseTensorMeta meta_data_score(
paddle::experimental::DataType::FLOAT32, tensor_score_cpu.dims());
tensor_score.set_meta(meta_data_score);
tensor_score.set_lod(lod);
float* score_ptr = tensor_score.mutable_data<float>(xpu_place);
paddle::memory::Copy(paddle::platform::XPUPlace(XPU_PlaceNo),
score_ptr,
paddle::platform::CPUPlace(),
score_cpu_ptr,
tensor_score_cpu.numel() * sizeof(float));
ids->push_back(tensor_id);
scores->push_back(tensor_score);
}
} // namespace test
} // namespace paddle
TEST(BeamSearchDecodeOpXPU, Backtrace) {
CPUPlace place;
// Construct sample data with 5 steps and 2 source sentences
// beam_size = 2, start_id = 0, end_id = 1
LoDTensorArray ids;
LoDTensorArray scores;
paddle::test::GenerateXPUExample(std::vector<size_t>{0, 1, 2},
std::vector<size_t>{0, 1, 2},
std::vector<int>{0, 0},
&ids,
&scores); // start with start_id
paddle::test::GenerateXPUExample(std::vector<size_t>{0, 1, 2},
std::vector<size_t>{0, 2, 4},
std::vector<int>{2, 3, 4, 5},
&ids,
&scores);
paddle::test::GenerateXPUExample(std::vector<size_t>{0, 2, 4},
std::vector<size_t>{0, 2, 2, 4, 4},
std::vector<int>{3, 1, 5, 4},
&ids,
&scores);
paddle::test::GenerateXPUExample(std::vector<size_t>{0, 2, 4},
std::vector<size_t>{0, 1, 2, 3, 4},
std::vector<int>{1, 1, 3, 5},
&ids,
&scores);
paddle::test::GenerateXPUExample(
std::vector<size_t>{0, 2, 4},
      std::vector<size_t>{0, 0, 0, 2, 2},  // the branches of the first source
                                           // sentence are pruned since they
                                           // finished
std::vector<int>{5, 1},
&ids,
&scores);
ASSERT_EQ(ids.size(), 5UL);
ASSERT_EQ(scores.size(), 5UL);
LoDTensor id_tensor_cpu;
LoDTensor score_tensor_cpu;
paddle::operators::BeamSearchDecodeXPUFunctor bs_xpu(
ids, scores, &id_tensor_cpu, &score_tensor_cpu, 2, 1);
bs_xpu.apply_xpu<float>();
LoD lod = id_tensor_cpu.lod();
std::vector<size_t> expect_source_lod = {0, 2, 4};
ASSERT_EQ(lod[0], expect_source_lod);
std::vector<size_t> expect_sentence_lod = {0, 4, 7, 12, 17};
ASSERT_EQ(lod[1], expect_sentence_lod);
std::vector<int> expect_data = {
0, 2, 3, 1, 0, 2, 1, 0, 4, 5, 3, 5, 0, 4, 5, 3, 1};
ASSERT_EQ(id_tensor_cpu.dims()[0], static_cast<int64_t>(expect_data.size()));
for (size_t i = 0; i < expect_data.size(); ++i) {
ASSERT_EQ(id_tensor_cpu.data<int64_t>()[i],
static_cast<int64_t>(expect_data[i]));
}
for (int64_t i = 0; i < id_tensor_cpu.dims()[0]; ++i) {
ASSERT_EQ(score_tensor_cpu.data<float>()[i],
static_cast<float>(id_tensor_cpu.data<int64_t>()[i]));
}
}
......@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/beam_search_op.h"
......@@ -22,3 +24,4 @@ REGISTER_OP_XPU_KERNEL(
ops::BeamSearchOpKernel<paddle::platform::XPUDeviceContext, double>,
ops::BeamSearchOpKernel<paddle::platform::XPUDeviceContext, int>,
ops::BeamSearchOpKernel<paddle::platform::XPUDeviceContext, int64_t>);
#endif
......@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/math/beam_search.h"
namespace phi {
......@@ -28,6 +29,28 @@ class XPUDeviceContext;
namespace paddle {
namespace operators {
namespace math {
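// Copy `len` elements from `x` (which lives on `place`) into a freshly
// malloc'ed host buffer returned through `*y`; the caller owns the buffer
// and must free() it.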
template <typename T>
int CopyData(const T *x, T **y, int len, const Place &place) {
if (nullptr == x || nullptr == y || len <= 0)
return xpu::Error_t::INVALID_PARAM;
*y = reinterpret_cast<T *>(malloc(sizeof(T) * len));
paddle::memory::Copy(
paddle::platform::CPUPlace(), *y, place, x, len * sizeof(T));
return xpu::Error_t::SUCCESS;
}
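// Null-tolerant wrapper around CopyData: a null source pointer is a no-op,
// so optional inputs (e.g. `ids`) can be forwarded without extra branching.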
template <typename T>
void CopyDataByCondition(const T *x, T **y, int len, const Place &place) {
if (x != nullptr) {
int r = CopyData(x, y, len, place);
PADDLE_ENFORCE_EQ(
r,
xpu::Error_t::SUCCESS,
platform::errors::External("Copy data form xpu to cpu failed"));
}
}
template <typename T>
class BeamSearchFunctor<platform::XPUDeviceContext, T> {
......@@ -54,7 +77,8 @@ class BeamSearchFunctor<platform::XPUDeviceContext, T> {
level,
beam_size,
                                        end_id,
                                        is_accumulated,
                                        ids->place());
auto selected_items = ToMap(items, high_level.back());
if (FLAGS_v == 3) {
VLOG(3) << "selected_items:";
......@@ -66,7 +90,8 @@ class BeamSearchFunctor<platform::XPUDeviceContext, T> {
}
}
    PruneEndBeams(
        pre_ids, abs_lod, &selected_items, level, end_id, ids->place());
// calculate the output tensor's height
size_t num_instances = std::accumulate(
std::begin(selected_items),
......@@ -164,8 +189,13 @@ class BeamSearchFunctor<platform::XPUDeviceContext, T> {
const framework::LoD &abs_lod,
std::vector<std::vector<Item>> *items,
size_t lod_level,
                     int end_id,
                     const Place &place) {
    auto *pre_ids_data_xpu = pre_ids->data<int64_t>();
    int64_t *pre_ids_data = nullptr;
    CopyDataByCondition<int64_t>(
        pre_ids_data_xpu, &pre_ids_data, pre_ids->numel(), place);
auto &high_level = abs_lod[lod_level];
for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) {
size_t src_prefix_start = high_level[src_idx];
......@@ -189,6 +219,7 @@ class BeamSearchFunctor<platform::XPUDeviceContext, T> {
items->at(offset).clear();
}
}
free(pre_ids_data);
}
/*
......@@ -213,6 +244,7 @@ class BeamSearchFunctor<platform::XPUDeviceContext, T> {
std::vector<Item> &top_beam = *top_beam_ptr;
size_t num_beams = top_beam.size();
if (num_beams < beam_size) {
top_beam.resize(num_beams + 1);
num_beams++;
......@@ -244,17 +276,31 @@ class BeamSearchFunctor<platform::XPUDeviceContext, T> {
size_t lod_level,
size_t beam_size,
int end_id,
      bool is_accumulated,
      const Place &place) {
    std::vector<std::vector<Item>> result;

    // find the current candidates
    auto abs_lod = framework::ToAbsOffset(scores->lod());

    auto *pre_ids_data_xpu = pre_ids->data<int64_t>();
    int64_t *pre_ids_data = nullptr;
    CopyDataByCondition<int64_t>(
        pre_ids_data_xpu, &pre_ids_data, pre_ids->numel(), place);

    auto *pre_scores_data_xpu = pre_scores->data<float>();
    float *pre_scores_data = nullptr;
    CopyDataByCondition<float>(
        pre_scores_data_xpu, &pre_scores_data, pre_scores->numel(), place);

    auto *ids_data_xpu = ids ? ids->data<int64_t>() : nullptr;
    int64_t *ids_data = nullptr;
    // guard the numel() call so a null `ids` is never dereferenced
    CopyDataByCondition<int64_t>(
        ids_data_xpu, &ids_data, ids ? ids->numel() : 0, place);

    auto *scores_data_xpu = scores->data<float>();
    float *scores_data = nullptr;
    CopyDataByCondition<float>(
        scores_data_xpu, &scores_data, scores->numel(), place);
size_t num_seqs = scores->NumElements(lod_level);
size_t seq_width = 1;
......@@ -273,6 +319,7 @@ class BeamSearchFunctor<platform::XPUDeviceContext, T> {
++offset) {
auto pre_id = pre_ids_data[offset];
auto pre_score = pre_scores_data[offset];
if (pre_id == end_id) {
// Allocate all probability mass to end_id for finished branchs and
// the other candidate ids can be ignored.
......@@ -285,6 +332,7 @@ class BeamSearchFunctor<platform::XPUDeviceContext, T> {
float score = is_accumulated
? scores_data[index]
: pre_score + std::log(scores_data[index]);
Item item(offset, id, score);
Insert(&top_beam, item, beam_size);
}
......@@ -304,6 +352,11 @@ class BeamSearchFunctor<platform::XPUDeviceContext, T> {
}
}
free(pre_ids_data);
free(pre_scores_data);
free(ids_data);
free(scores_data);
return result;
}
};
......
......@@ -61,6 +61,12 @@ XPUOpMap& get_kl2_ops() {
pOpKernelType(vartype::FP64, XPUPlace()),
pOpKernelType(vartype::INT32, XPUPlace()),
pOpKernelType(vartype::INT64, XPUPlace())})},
{"beam_search_decode",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP64, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace()),
pOpKernelType(vartype::INT32, XPUPlace()),
pOpKernelType(vartype::INT64, XPUPlace())})},
{"bilinear_interp_v2",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"bilinear_interp_v2_grad",
......