Unverified commit 2d96801a authored by Leo Guo, committed by GitHub

Fix the bug of slice op and optimize the code style of generate_proposals_v2 op for kunlun. *test=kunlun (#43380)
Parent 5e94ef9a
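Most of the generate_proposals_v2 changes below collapse the repeated four-line PADDLE_ENFORCE_EQ checks into single PADDLE_ENFORCE_XDNN_SUCCESS calls. A minimal sketch of the pattern being factored out, assuming a macro of roughly this shape (the real definition ships with Paddle's XPU enforce headers; this is an illustration, not that code):

// Hypothetical sketch: fold the recurring XPU return-code check into one macro.
// XPU_SUCCESS and XPUAPIErrorMsg are the same symbols the old checks used.
#define PADDLE_ENFORCE_XDNN_SUCCESS(r, api_name)                      \
  do {                                                                \
    if ((r) != XPU_SUCCESS) {                                         \
      PADDLE_THROW(platform::errors::External(                        \
          "XPU API(%s) return wrong value[%d %s]", (api_name), (r),   \
          XPUAPIErrorMsg[r]));                                        \
    }                                                                 \
  } while (0)

With this, every call site in the hunks below shrinks to one line such as PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");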
@@ -16,11 +16,13 @@ limitations under the License. */
#include <paddle/fluid/memory/allocation/allocator.h>
#include <stdio.h>
#include <string>
#include <vector>
#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
@@ -91,37 +93,25 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
index_sort.data<int>(), scores_sel.data<T>(),
{static_cast<int>(scores.numel()), 1},
index_sort.numel(), 0);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External("XPU API(gather) return "
"wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
r = xpu::gather<T>(dev_ctx.x_context(), bbox_deltas.data<T>(),
index_sort.data<int>(), bbox_sel.data<T>(),
{static_cast<int>(bbox_deltas.numel()) / 4, 4},
index_sort.numel(), 0);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External("XPU API(gather) return "
"wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
r = xpu::gather<T>(dev_ctx.x_context(), anchors.data<T>(),
index_sort.data<int>(), anchor_sel.data<T>(),
{static_cast<int>(anchors.numel()) / 4, 4},
index_sort.numel(), 0);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External("XPU API(gather) return "
"wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
r = xpu::gather<T>(dev_ctx.x_context(), variances.data<T>(),
index_sort.data<int>(), var_sel.data<T>(),
{static_cast<int>(variances.numel()) / 4, 4},
index_sort.numel(), 0);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External("XPU API(gather) return "
"wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
int num = scores.numel();
int pre_nms_num = (pre_nms_top_n <= 0 || pre_nms_top_n > num) ? scores.numel()
@@ -137,10 +127,7 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
var_sel.data<T>(), bbox_sel.data<T>(),
proposals.data<T>(), pre_nms_num, !pixel_offset, true,
im_shape.data<T>());
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External("XPU API(box_decoder) return "
"wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "box_decoder");
// 3. filter
Tensor keep_index, keep_num_t;
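The filter step below drops boxes smaller than min_size with xpu::remove_small_boxes and then compacts proposals and scores with xpu::gather over the kept indices. A host-side analogue of the keep-index computation (the [n, 4] xyxy layout and the >= min_size rule are assumptions for illustration; the authoritative semantics, including pixel_offset handling, live in the xdnn kernel):

#include <vector>

// Keep the indices of boxes whose width and height are at least min_size.
std::vector<int> KeepLargeBoxes(const std::vector<float>& boxes, int n,
                                float min_size) {
  std::vector<int> keep;
  for (int i = 0; i < n; ++i) {
    float w = boxes[i * 4 + 2] - boxes[i * 4 + 0];
    float h = boxes[i * 4 + 3] - boxes[i * 4 + 1];
    if (w >= min_size && h >= min_size) keep.push_back(i);
  }
  return keep;  // xpu::gather then compacts proposals/scores by these indices
}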
@@ -151,10 +138,7 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
im_shape.data<T>(), keep_index.data<int>(),
keep_num_t.data<int>(), pre_nms_num, min_size,
false, pixel_offset);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
"XPU API(remove_small_boxes) return "
"wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "remove_small_boxes");
int keep_num;
const auto xpu_place = dev_ctx.GetPlace();
memory::Copy(platform::CPUPlace(), &keep_num, xpu_place,
@@ -176,18 +160,12 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
r = xpu::gather<T>(dev_ctx.x_context(), proposals.data<T>(),
keep_index.data<int>(), proposals_filter.data<T>(),
{pre_nms_num, 4}, keep_num, 0);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External("XPU API(gather) return "
"wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
r = xpu::gather<T>(dev_ctx.x_context(), scores_sel.data<T>(),
keep_index.data<int>(), scores_filter.data<T>(),
{pre_nms_num, 1}, keep_num, 0);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External("XPU API(gather) return "
"wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
if (nms_thresh <= 0) {
if (dev_ctx.x_context()->xpu_stream) {
@@ -201,10 +179,7 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
r = xpu::nms<T>(dev_ctx.x_context(), proposals_filter.data<T>(), nullptr,
keep_index.data<int>(), 1, 1, keep_num, -1, nms_thresh, -1, 0,
&nms_keep_num, pixel_offset);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External("XPU API(nms) return the"
"wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "nms");
if (post_nms_top_n > 0 && post_nms_top_n < nms_keep_num) {
keep_index.Resize({post_nms_top_n});
} else {
@@ -217,17 +192,11 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
r = xpu::gather<T>(dev_ctx.x_context(), proposals_filter.data<T>(),
keep_index.data<int>(), proposals_nms.data<T>(),
{keep_num, 4}, keep_index.numel(), 0);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External("XPU API(gather) return "
"wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
r = xpu::gather<T>(dev_ctx.x_context(), scores_filter.data<T>(),
keep_index.data<int>(), scores_nms.data<T>(),
{keep_num, 1}, keep_index.numel(), 0);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External("XPU API(gather) return "
"wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
if (dev_ctx.x_context()->xpu_stream) {
dev_ctx.Wait();
}
@@ -286,17 +255,11 @@ class XPUGenerateProposalsV2Kernel : public framework::OpKernel<T> {
int r = xpu::transpose<T>(dev_ctx.x_context(), bbox_deltas->data<T>(),
bbox_deltas_swap.data<T>(),
{num, c_bbox, h_bbox, w_bbox}, axis);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External("XPU API(transpose) return "
"wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
r = xpu::transpose<T>(dev_ctx.x_context(), scores->data<T>(),
scores_swap.data<T>(),
{num, c_score, h_score, w_score}, axis);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External("XPU API(transpose) return "
"wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
anchors.Resize({anchors.numel() / 4, 4});
variances.Resize({variances.numel() / 4, 4});
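The two transposes above move bbox_deltas and scores from NCHW to an NHWC layout before decoding. The axis permutation itself is defined outside this hunk; assuming axis = {0, 2, 3, 1} as in the CPU/GPU implementations, a host-side reference for what xpu::transpose computes:

#include <vector>

// Reference NCHW -> NHWC transpose of a 4-D tensor (illustration only).
std::vector<float> TransposeNCHWToNHWC(const std::vector<float>& in, int n,
                                       int c, int h, int w) {
  std::vector<float> out(in.size());
  for (int i = 0; i < n; ++i)
    for (int j = 0; j < c; ++j)
      for (int k = 0; k < h; ++k)
        for (int l = 0; l < w; ++l)
          // NCHW source index -> NHWC destination index
          out[((i * h + k) * w + l) * c + j] =
              in[((i * c + j) * h + k) * w + l];
  return out;
}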
@@ -19,6 +19,8 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/operators/slice_op.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/phi/kernels/funcs/slice_utils.h"
#include "xpu/refactor/math.h"
namespace paddle {
@@ -26,76 +28,163 @@ namespace operators {
using Tensor = framework::Tensor;
inline void DealTensorArray(const framework::ExecutionContext& ctx,
const std::vector<int>& starts,
const std::vector<int>& ends, bool out_is_array) {
auto in_array = ctx.Input<LoDTensorArray>("Input");
// If the input is LoDTensorArray, the rank of input is 1.
int in_size = in_array->size();
int start = starts[0] < 0 ? (starts[0] + in_size) : starts[0];
int end = ends[0] < 0 ? (ends[0] + in_size) : ends[0];
start = std::max(start, static_cast<int>(0));
end = std::max(end, static_cast<int>(0));
end = std::min(end, in_size);
if (starts[0] == -1 && end == 0) {
end = start + 1;
}
PADDLE_ENFORCE_GT(end, start,
platform::errors::InvalidArgument(
"Attr(ends) should be greater than attr(starts) in "
"slice op. But received end = %d, start = %d.",
ends[0], starts[0]));
int out_size = end - start;
if (out_is_array) {
auto out_array = ctx.Output<LoDTensorArray>("Out");
out_array->resize(out_size);
for (int i = 0; i < out_size; ++i) {
auto* out_tensor = &out_array->at(i);
auto in_tensor = in_array->at(i + start);
out_tensor->set_lod(in_tensor.lod());
if (in_tensor.memory_size() > 0) {
paddle::framework::TensorCopy(in_tensor, ctx.GetPlace(), out_tensor);
} else {
VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so "
"nothing has been written to output array["
<< i << "].";
}
}
} else {
auto out = ctx.Output<Tensor>("Out");
auto in_tensor = in_array->at(start);
paddle::framework::TensorCopy(in_tensor, ctx.GetPlace(), out);
}
}
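DealTensorArray normalizes Python-style negative indices before copying tensors out of the array. A minimal host-side mirror of that normalization with two worked examples (illustrative; the function name is not from the diff, and the real code additionally enforces end > start):

#include <algorithm>
#include <utility>

// Mirrors the start/end normalization in DealTensorArray above.
std::pair<int, int> NormalizeArraySlice(int start_attr, int end_attr,
                                        int in_size) {
  int start = start_attr < 0 ? start_attr + in_size : start_attr;
  int end = end_attr < 0 ? end_attr + in_size : end_attr;
  start = std::max(start, 0);
  end = std::min(std::max(end, 0), in_size);
  if (start_attr == -1 && end == 0) end = start + 1;  // "last element" idiom
  return {start, end};
}
// NormalizeArraySlice(-2, -1, 5) == {3, 4}: one tensor, in_array[3]
// NormalizeArraySlice(-1, 0, 5)  == {4, 5}: just the last tensor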
template <typename DeviceContext, typename T>
class SliceXPUKernel : public framework::OpKernel<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto in = ctx.Input<framework::Tensor>("Input");
auto out = ctx.Output<framework::Tensor>("Out");
auto axes = ctx.Attr<std::vector<int>>("axes");
auto starts = ctx.Attr<std::vector<int>>("starts");
auto ends = ctx.Attr<std::vector<int>>("ends");
auto in_dims = in->dims();
// prepare starts, ends on XPU
int dim_value = 0, start = 0, end = 0;
// If a negative value is passed for any of the start or end indices,
// it represents number of elements before the end of that dimension.
// If the value passed to start or end is larger than the n
// (the number of elements in this dimension), it represents n.
for (size_t i = 0; i < axes.size(); ++i) {
dim_value = in_dims[axes[i]];
start = starts[i];
end = ends[i];
start = start < 0 ? (start + dim_value) : start;
end = end < 0 ? (end + dim_value) : end;
start = std::max(start, 0);
end = std::max(end, 0);
end = std::min(end, dim_value);
PADDLE_ENFORCE_GT(
end, start,
platform::errors::InvalidArgument("end should greater than start"));
starts[i] = start;
ends[i] = end;
}
size_t shape_size = in_dims.size();
// The slice XPU kernel requires the lengths of `start` and `end` to equal
// the rank of the input tensor, so when shape_size > axes.size() the
// `starts_extension` and `ends_extension` vectors are necessary.
std::vector<int> starts_extension(shape_size, 0);
std::vector<int> ends_extension(shape_size, 0);
if (shape_size > axes.size()) {
for (size_t i = 0; i < shape_size; ++i) {
ends_extension[i] = in_dims[i];
}
for (size_t i = 0; i < axes.size(); ++i) {
starts_extension[axes[i]] = starts[i];
ends_extension[axes[i]] = ends[i];
}
} else {
starts_extension = std::move(starts);
ends_extension = std::move(ends);
}
const Variable* input_var = ctx.InputVar("Input");
Variable* out_var = ctx.OutputVar("Out");
bool input_is_array = input_var->IsType<LoDTensorArray>();
bool out_is_array = out_var->IsType<LoDTensorArray>();
auto axes_int = ctx.Attr<std::vector<int>>("axes");
auto starts_int = ctx.Attr<std::vector<int>>("starts");
auto ends_int = ctx.Attr<std::vector<int>>("ends");
std::vector<int> axes(axes_int.begin(), axes_int.end());
std::vector<int> starts(starts_int.begin(), starts_int.end());
std::vector<int> ends(ends_int.begin(), ends_int.end());
auto decrease_axis = ctx.Attr<std::vector<int>>("decrease_axis");
auto infer_flags = ctx.Attr<std::vector<int>>("infer_flags");
// Step 1: Get the accurate attribute value of starts and ends
auto starts_tensor_list = ctx.MultiInput<Tensor>("StartsTensorList");
if (ctx.HasInput("StartsTensor")) {
starts = GetDataFromTensor<int>(ctx.Input<Tensor>("StartsTensor"));
} else if (starts_tensor_list.size() > 0) {
starts = GetDataFromTensorList<int>(starts_tensor_list);
}
// prepare shape on XPU
std::vector<int> shape(shape_size, 0);
for (size_t i = 0; i < shape_size; ++i) {
shape[i] = in_dims[i];
}
auto ends_tensor_list = ctx.MultiInput<Tensor>("EndsTensorList");
if (ctx.HasInput("EndsTensor")) {
ends = GetDataFromTensor<int>(ctx.Input<Tensor>("EndsTensor"));
} else if (ends_tensor_list.size() > 0) {
ends = GetDataFromTensorList<int>(ends_tensor_list);
}
auto& dev_ctx = ctx.template device_context<DeviceContext>();
const XPUType* in_data = reinterpret_cast<const XPUType*>(in->data<T>());
XPUType* out_data =
reinterpret_cast<XPUType*>(out->mutable_data<T>(ctx.GetPlace()));
int r = xpu::slice<XPUType>(dev_ctx.x_context(), in_data, out_data, shape,
starts_extension, ends_extension);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External("XPU slice kernel return wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
PADDLE_ENFORCE_EQ(
starts.size(), axes.size(),
platform::errors::InvalidArgument(
"The size of starts must be equal to the size of axes."));
PADDLE_ENFORCE_EQ(
ends.size(), axes.size(),
platform::errors::InvalidArgument(
"The size of ends must be equal to the size of axes."));
// Step 2: Compute output
if (input_is_array) {
DealTensorArray(ctx, starts, ends, out_is_array);
return;
} else {
auto in = ctx.Input<framework::Tensor>("Input");
auto out = ctx.Output<framework::Tensor>("Out");
auto in_dims = in->dims();
auto out_dims = out->dims();
auto slice_dims = out_dims;
// 2.1 Infer output dims
for (size_t i = 0; i < axes.size(); ++i) {
// when start == -1 && end == start+1
if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) {
auto ret =
std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]);
if (ret != decrease_axis.end()) {
ends[i] = in_dims[axes[i]];
}
}
}
phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends);
slice_dims = phi::funcs::GetSliceDims<int>(in_dims, axes, starts, ends,
nullptr, nullptr);
out_dims = phi::funcs::GetDecreasedDims(slice_dims, decrease_axis);
out->Resize(out_dims);
// 2.2 Get output
size_t shape_size = in_dims.size();
// The slice XPU kernel requires the lengths of `start` and `end` to equal
// the rank of the input tensor, so when shape_size > axes.size() the
// `starts_extension` and `ends_extension` vectors are necessary.
std::vector<int> starts_extension(shape_size, 0);
std::vector<int> ends_extension(shape_size, 0);
if (shape_size > axes.size()) {
for (size_t i = 0; i < shape_size; ++i) {
ends_extension[i] = in_dims[i];
}
for (size_t i = 0; i < axes.size(); ++i) {
starts_extension[axes[i]] = starts[i];
ends_extension[axes[i]] = ends[i];
}
} else {
starts_extension = std::move(starts);
ends_extension = std::move(ends);
}
// prepare shape on XPU
std::vector<int> shape(shape_size, 0);
for (size_t i = 0; i < shape_size; ++i) {
shape[i] = in_dims[i];
}
auto& dev_ctx = ctx.template device_context<DeviceContext>();
const XPUType* in_data = reinterpret_cast<const XPUType*>(in->data<T>());
XPUType* out_data =
reinterpret_cast<XPUType*>(out->mutable_data<T>(ctx.GetPlace()));
int r = xpu::slice<XPUType>(dev_ctx.x_context(), in_data, out_data, shape,
starts_extension, ends_extension);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "slice");
}
}
};
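For reference, the semantics assumed of the xpu::slice call above: each dimension keeps the half-open range [starts[i], ends[i]). A rank-2 host sketch (an illustration rather than xdnn's implementation; the name is hypothetical):

#include <vector>

// Copy the window rows [s0, e0) x cols [s1, e1) of a row-major matrix.
template <typename T>
std::vector<T> SliceRef2D(const std::vector<T>& in, int cols, int s0, int e0,
                          int s1, int e1) {
  std::vector<T> out;
  out.reserve(static_cast<size_t>(e0 - s0) * (e1 - s1));
  for (int i = s0; i < e0; ++i)
    for (int j = s1; j < e1; ++j)
      out.push_back(in[i * cols + j]);
  return out;
}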
@@ -168,10 +257,7 @@ class SliceGradXPUKernel : public framework::OpKernel<T> {
reinterpret_cast<XPUType*>(dinput->mutable_data<T>(ctx.GetPlace()));
int r = xpu::pad<XPUType>(dev_ctx.x_context(), dout_data, din_data,
out_dims, pad_left, pad_right, XPUType(0));
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External("XPU pad kernel return wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "pad");
}
};
} // namespace operators
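The backward kernel maps slice-grad onto xpu::pad: the upstream gradient of the sliced window is written back into a zero-filled tensor of the input's shape, with pad_left/pad_right supplying the zero margins per dimension. A 1-D host sketch of that identity (illustrative; the name is hypothetical):

#include <vector>

// Scatter dout back into the window [start, start + dout.size()) of a
// zero-filled tensor of length in_len; the zeros play the role of the
// XPUType(0) pad value in the kernel above.
std::vector<float> SliceGradRef1D(const std::vector<float>& dout, int in_len,
                                  int start) {
  std::vector<float> din(in_len, 0.0f);
  for (size_t i = 0; i < dout.size(); ++i) din[start + i] = dout[i];
  return din;
}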