未验证 提交 2d96801a 编写于 作者: L Leo Guo 提交者: GitHub

Fix the bug of slice op and optimize the code style of generate_proposals_v2...

Fix the bug of slice op and optimize the code style of generate_proposals_v2 op for kunlun. *test=kunlun (#43380)
上级 5e94ef9a
...@@ -16,11 +16,13 @@ limitations under the License. */ ...@@ -16,11 +16,13 @@ limitations under the License. */
#include <paddle/fluid/memory/allocation/allocator.h> #include <paddle/fluid/memory/allocation/allocator.h>
#include <stdio.h> #include <stdio.h>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle { namespace paddle {
...@@ -91,37 +93,25 @@ static std::pair<Tensor, Tensor> ProposalForOneImage( ...@@ -91,37 +93,25 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
index_sort.data<int>(), scores_sel.data<T>(), index_sort.data<int>(), scores_sel.data<T>(),
{static_cast<int>(scores.numel()), 1}, {static_cast<int>(scores.numel()), 1},
index_sort.numel(), 0); index_sort.numel(), 0);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
platform::errors::External("XPU API(gather) return "
"wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
r = xpu::gather<T>(dev_ctx.x_context(), bbox_deltas.data<T>(), r = xpu::gather<T>(dev_ctx.x_context(), bbox_deltas.data<T>(),
index_sort.data<int>(), bbox_sel.data<T>(), index_sort.data<int>(), bbox_sel.data<T>(),
{static_cast<int>(bbox_deltas.numel()) / 4, 4}, {static_cast<int>(bbox_deltas.numel()) / 4, 4},
index_sort.numel(), 0); index_sort.numel(), 0);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
platform::errors::External("XPU API(gather) return "
"wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
r = xpu::gather<T>(dev_ctx.x_context(), anchors.data<T>(), r = xpu::gather<T>(dev_ctx.x_context(), anchors.data<T>(),
index_sort.data<int>(), anchor_sel.data<T>(), index_sort.data<int>(), anchor_sel.data<T>(),
{static_cast<int>(anchors.numel()) / 4, 4}, {static_cast<int>(anchors.numel()) / 4, 4},
index_sort.numel(), 0); index_sort.numel(), 0);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
platform::errors::External("XPU API(gather) return "
"wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
r = xpu::gather<T>(dev_ctx.x_context(), variances.data<T>(), r = xpu::gather<T>(dev_ctx.x_context(), variances.data<T>(),
index_sort.data<int>(), var_sel.data<T>(), index_sort.data<int>(), var_sel.data<T>(),
{static_cast<int>(variances.numel()) / 4, 4}, {static_cast<int>(variances.numel()) / 4, 4},
index_sort.numel(), 0); index_sort.numel(), 0);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
platform::errors::External("XPU API(gather) return "
"wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
int num = scores.numel(); int num = scores.numel();
int pre_nms_num = (pre_nms_top_n <= 0 || pre_nms_top_n > num) ? scores.numel() int pre_nms_num = (pre_nms_top_n <= 0 || pre_nms_top_n > num) ? scores.numel()
...@@ -137,10 +127,7 @@ static std::pair<Tensor, Tensor> ProposalForOneImage( ...@@ -137,10 +127,7 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
var_sel.data<T>(), bbox_sel.data<T>(), var_sel.data<T>(), bbox_sel.data<T>(),
proposals.data<T>(), pre_nms_num, !pixel_offset, true, proposals.data<T>(), pre_nms_num, !pixel_offset, true,
im_shape.data<T>()); im_shape.data<T>());
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, PADDLE_ENFORCE_XDNN_SUCCESS(r, "box_decoder");
platform::errors::External("XPU API(box_decoder) return "
"wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
// 3. filter // 3. filter
Tensor keep_index, keep_num_t; Tensor keep_index, keep_num_t;
...@@ -151,10 +138,7 @@ static std::pair<Tensor, Tensor> ProposalForOneImage( ...@@ -151,10 +138,7 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
im_shape.data<T>(), keep_index.data<int>(), im_shape.data<T>(), keep_index.data<int>(),
keep_num_t.data<int>(), pre_nms_num, min_size, keep_num_t.data<int>(), pre_nms_num, min_size,
false, pixel_offset); false, pixel_offset);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( PADDLE_ENFORCE_XDNN_SUCCESS(r, "remove_small_boxes");
"XPU API(remove_small_boxes) return "
"wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
int keep_num; int keep_num;
const auto xpu_place = dev_ctx.GetPlace(); const auto xpu_place = dev_ctx.GetPlace();
memory::Copy(platform::CPUPlace(), &keep_num, xpu_place, memory::Copy(platform::CPUPlace(), &keep_num, xpu_place,
...@@ -176,18 +160,12 @@ static std::pair<Tensor, Tensor> ProposalForOneImage( ...@@ -176,18 +160,12 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
r = xpu::gather<T>(dev_ctx.x_context(), proposals.data<T>(), r = xpu::gather<T>(dev_ctx.x_context(), proposals.data<T>(),
keep_index.data<int>(), proposals_filter.data<T>(), keep_index.data<int>(), proposals_filter.data<T>(),
{pre_nms_num, 4}, keep_num, 0); {pre_nms_num, 4}, keep_num, 0);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
platform::errors::External("XPU API(gather) return "
"wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
r = xpu::gather<T>(dev_ctx.x_context(), scores_sel.data<T>(), r = xpu::gather<T>(dev_ctx.x_context(), scores_sel.data<T>(),
keep_index.data<int>(), scores_filter.data<T>(), keep_index.data<int>(), scores_filter.data<T>(),
{pre_nms_num, 1}, keep_num, 0); {pre_nms_num, 1}, keep_num, 0);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
platform::errors::External("XPU API(gather) return "
"wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
if (nms_thresh <= 0) { if (nms_thresh <= 0) {
if (dev_ctx.x_context()->xpu_stream) { if (dev_ctx.x_context()->xpu_stream) {
...@@ -201,10 +179,7 @@ static std::pair<Tensor, Tensor> ProposalForOneImage( ...@@ -201,10 +179,7 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
r = xpu::nms<T>(dev_ctx.x_context(), proposals_filter.data<T>(), nullptr, r = xpu::nms<T>(dev_ctx.x_context(), proposals_filter.data<T>(), nullptr,
keep_index.data<int>(), 1, 1, keep_num, -1, nms_thresh, -1, 0, keep_index.data<int>(), 1, 1, keep_num, -1, nms_thresh, -1, 0,
&nms_keep_num, pixel_offset); &nms_keep_num, pixel_offset);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, PADDLE_ENFORCE_XDNN_SUCCESS(r, "nms");
platform::errors::External("XPU API(nms) return the"
"wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
if (post_nms_top_n > 0 && post_nms_top_n < nms_keep_num) { if (post_nms_top_n > 0 && post_nms_top_n < nms_keep_num) {
keep_index.Resize({post_nms_top_n}); keep_index.Resize({post_nms_top_n});
} else { } else {
...@@ -217,17 +192,11 @@ static std::pair<Tensor, Tensor> ProposalForOneImage( ...@@ -217,17 +192,11 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
r = xpu::gather<T>(dev_ctx.x_context(), proposals_filter.data<T>(), r = xpu::gather<T>(dev_ctx.x_context(), proposals_filter.data<T>(),
keep_index.data<int>(), proposals_nms.data<T>(), keep_index.data<int>(), proposals_nms.data<T>(),
{keep_num, 4}, keep_index.numel(), 0); {keep_num, 4}, keep_index.numel(), 0);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
platform::errors::External("XPU API(gather) return "
"wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
r = xpu::gather<T>(dev_ctx.x_context(), scores_filter.data<T>(), r = xpu::gather<T>(dev_ctx.x_context(), scores_filter.data<T>(),
keep_index.data<int>(), scores_nms.data<T>(), keep_index.data<int>(), scores_nms.data<T>(),
{keep_num, 1}, keep_index.numel(), 0); {keep_num, 1}, keep_index.numel(), 0);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
platform::errors::External("XPU API(gather) return "
"wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
if (dev_ctx.x_context()->xpu_stream) { if (dev_ctx.x_context()->xpu_stream) {
dev_ctx.Wait(); dev_ctx.Wait();
} }
...@@ -286,17 +255,11 @@ class XPUGenerateProposalsV2Kernel : public framework::OpKernel<T> { ...@@ -286,17 +255,11 @@ class XPUGenerateProposalsV2Kernel : public framework::OpKernel<T> {
int r = xpu::transpose<T>(dev_ctx.x_context(), bbox_deltas->data<T>(), int r = xpu::transpose<T>(dev_ctx.x_context(), bbox_deltas->data<T>(),
bbox_deltas_swap.data<T>(), bbox_deltas_swap.data<T>(),
{num, c_bbox, h_bbox, w_bbox}, axis); {num, c_bbox, h_bbox, w_bbox}, axis);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
platform::errors::External("XPU API(transpose) return "
"wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
r = xpu::transpose<T>(dev_ctx.x_context(), scores->data<T>(), r = xpu::transpose<T>(dev_ctx.x_context(), scores->data<T>(),
scores_swap.data<T>(), scores_swap.data<T>(),
{num, c_score, h_score, w_score}, axis); {num, c_score, h_score, w_score}, axis);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
platform::errors::External("XPU API(transpose) return "
"wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
anchors.Resize({anchors.numel() / 4, 4}); anchors.Resize({anchors.numel() / 4, 4});
variances.Resize({variances.numel() / 4, 4}); variances.Resize({variances.numel() / 4, 4});
......
...@@ -19,6 +19,8 @@ limitations under the License. */ ...@@ -19,6 +19,8 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/operators/slice_op.h" #include "paddle/fluid/operators/slice_op.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/phi/kernels/funcs/slice_utils.h"
#include "xpu/refactor/math.h" #include "xpu/refactor/math.h"
namespace paddle { namespace paddle {
...@@ -26,76 +28,163 @@ namespace operators { ...@@ -26,76 +28,163 @@ namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
// Slices a LoDTensorArray along its single (outer) dimension.
// Reads "Input" (a LoDTensorArray) from ctx, normalizes starts[0]/ends[0]
// (negative values count back from the end, then clamped to [0, in_size]),
// and writes either a sub-array ("Out" as LoDTensorArray) or the single
// selected tensor ("Out" as plain Tensor), depending on out_is_array.
// NOTE(review): only starts[0]/ends[0] are consulted — assumes a rank-1
// slice, which matches the array case (see comment below).
inline void DealTensorArray(const framework::ExecutionContext& ctx,
const std::vector<int>& starts,
const std::vector<int>& ends, bool out_is_array) {
auto in_array = ctx.Input<LoDTensorArray>("Input");
// If the input is LoDTensorArray, the rank of input is 1.
int in_size = in_array->size();
// Negative indices are relative to the end of the array.
int start = starts[0] < 0 ? (starts[0] + in_size) : starts[0];
int end = ends[0] < 0 ? (ends[0] + in_size) : ends[0];
// Clamp the normalized indices to the valid range [0, in_size].
start = std::max(start, static_cast<int>(0));
end = std::max(end, static_cast<int>(0));
end = std::min(end, in_size);
// Special case: an original start of -1 whose end clamped to 0 means
// "take the last element" — force a one-element slice.
if (starts[0] == -1 && end == 0) {
end = start + 1;
}
PADDLE_ENFORCE_GT(end, start,
platform::errors::InvalidArgument(
"Attr(ends) should be greater than attr(starts) in "
"slice op. But received end = %d, start = %d.",
ends[0], starts[0]));
int out_size = end - start;
if (out_is_array) {
// Output is itself an array: copy each selected tensor and carry its
// LoD information over to the corresponding output slot.
auto out_array = ctx.Output<LoDTensorArray>("Out");
out_array->resize(out_size);
for (int i = 0; i < out_size; ++i) {
auto* out_tensor = &out_array->at(i);
auto in_tensor = in_array->at(i + start);
out_tensor->set_lod(in_tensor.lod());
if (in_tensor.memory_size() > 0) {
paddle::framework::TensorCopy(in_tensor, ctx.GetPlace(), out_tensor);
} else {
// Source tensor holds no memory: leave the output slot unwritten
// (its LoD has still been set above).
VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so "
"nothing has been written to output array["
<< i << "].";
}
}
} else {
// Output is a plain tensor: copy only the tensor at index `start`.
auto out = ctx.Output<Tensor>("Out");
auto in_tensor = in_array->at(start);
paddle::framework::TensorCopy(in_tensor, ctx.GetPlace(), out);
}
}
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class SliceXPUKernel : public framework::OpKernel<T> { class SliceXPUKernel : public framework::OpKernel<T> {
using XPUType = typename XPUTypeTrait<T>::Type; using XPUType = typename XPUTypeTrait<T>::Type;
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto in = ctx.Input<framework::Tensor>("Input"); const Variable* input_var = ctx.InputVar("Input");
auto out = ctx.Output<framework::Tensor>("Out"); Variable* out_var = ctx.OutputVar("Out");
auto axes = ctx.Attr<std::vector<int>>("axes"); bool input_is_array = input_var->IsType<LoDTensorArray>();
auto starts = ctx.Attr<std::vector<int>>("starts"); bool out_is_array = out_var->IsType<LoDTensorArray>();
auto ends = ctx.Attr<std::vector<int>>("ends");
auto in_dims = in->dims(); auto axes_int = ctx.Attr<std::vector<int>>("axes");
auto starts_int = ctx.Attr<std::vector<int>>("starts");
// prepare starts, ends on XPU auto ends_int = ctx.Attr<std::vector<int>>("ends");
int dim_value = 0, start = 0, end = 0; std::vector<int> axes(axes_int.begin(), axes_int.end());
// If a negative value is passed for any of the start or end indices, std::vector<int> starts(starts_int.begin(), starts_int.end());
// it represents number of elements before the end of that dimension. std::vector<int> ends(ends_int.begin(), ends_int.end());
// If the value passed to start or end is larger than the n
// (the number of elements in this dimension), it represents n. auto decrease_axis = ctx.Attr<std::vector<int>>("decrease_axis");
for (size_t i = 0; i < axes.size(); ++i) { auto infer_flags = ctx.Attr<std::vector<int>>("infer_flags");
dim_value = in_dims[axes[i]];
start = starts[i]; // Step 1: Get the accurate attribute value of starts and ends
end = ends[i]; auto starts_tensor_list = ctx.MultiInput<Tensor>("StartsTensorList");
start = start < 0 ? (start + dim_value) : start; if (ctx.HasInput("StartsTensor")) {
end = end < 0 ? (end + dim_value) : end; starts = GetDataFromTensor<int>(ctx.Input<Tensor>("StartsTensor"));
start = std::max(start, 0); } else if (starts_tensor_list.size() > 0) {
end = std::max(end, 0); starts = GetDataFromTensorList<int>(starts_tensor_list);
end = std::min(end, dim_value);
PADDLE_ENFORCE_GT(
end, start,
platform::errors::InvalidArgument("end should greater than start"));
starts[i] = start;
ends[i] = end;
}
size_t shape_size = in_dims.size();
// the slice XPU kernel require that the length of `start`, `end` must be
// equal
// to the dims size of input tensor, therefore, if shape_size > axes.size(),
// the `starts_extension` and `ends_extension` is necessary.
std::vector<int> starts_extension(shape_size, 0);
std::vector<int> ends_extension(shape_size, 0);
if (shape_size > axes.size()) {
for (size_t i = 0; i < shape_size; ++i) {
ends_extension[i] = in_dims[i];
}
for (size_t i = 0; i < axes.size(); ++i) {
starts_extension[axes[i]] = starts[i];
ends_extension[axes[i]] = ends[i];
}
} else {
starts_extension = std::move(starts);
ends_extension = std::move(ends);
} }
// prepare shape on XPU auto ends_tensor_list = ctx.MultiInput<Tensor>("EndsTensorList");
std::vector<int> shape(shape_size, 0); if (ctx.HasInput("EndsTensor")) {
for (size_t i = 0; i < shape_size; ++i) { ends = GetDataFromTensor<int>(ctx.Input<Tensor>("EndsTensor"));
shape[i] = in_dims[i]; } else if (ends_tensor_list.size() > 0) {
ends = GetDataFromTensorList<int>(ends_tensor_list);
} }
auto& dev_ctx = ctx.template device_context<DeviceContext>();
const XPUType* in_data = reinterpret_cast<const XPUType*>(in->data<T>());
XPUType* out_data =
reinterpret_cast<XPUType*>(out->mutable_data<T>(ctx.GetPlace()));
int r = xpu::slice<XPUType>(dev_ctx.x_context(), in_data, out_data, shape,
starts_extension, ends_extension);
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS, starts.size(), axes.size(),
platform::errors::External("XPU slice kernel return wrong value[%d %s]", platform::errors::InvalidArgument(
r, XPUAPIErrorMsg[r])); "The size of starts must be equal to the size of axes."));
PADDLE_ENFORCE_EQ(
ends.size(), axes.size(),
platform::errors::InvalidArgument(
"The size of ends must be equal to the size of axes."));
// Step 2: Compute output
if (input_is_array) {
DealTensorArray(ctx, starts, ends, out_is_array);
return;
} else {
auto in = ctx.Input<framework::Tensor>("Input");
auto out = ctx.Output<framework::Tensor>("Out");
auto in_dims = in->dims();
auto out_dims = out->dims();
auto slice_dims = out_dims;
// 2.1 Infer output dims
for (size_t i = 0; i < axes.size(); ++i) {
// when start == -1 && end == start+1
if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) {
auto ret =
std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]);
if (ret != decrease_axis.end()) {
ends[i] = in_dims[axes[i]];
}
}
}
phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends);
slice_dims = phi::funcs::GetSliceDims<int>(in_dims, axes, starts, ends,
nullptr, nullptr);
out_dims = phi::funcs::GetDecreasedDims(slice_dims, decrease_axis);
out->Resize(out_dims);
// 2.2 Get output
size_t shape_size = in_dims.size();
// the slice XPU kernel require that the length of `start`, `end` must be
// equal
// to the dims size of input tensor, therefore, if shape_size >
// axes.size(), the `starts_extension` and `ends_extension` is necessary.
std::vector<int> starts_extension(shape_size, 0);
std::vector<int> ends_extension(shape_size, 0);
if (shape_size > axes.size()) {
for (size_t i = 0; i < shape_size; ++i) {
ends_extension[i] = in_dims[i];
}
for (size_t i = 0; i < axes.size(); ++i) {
starts_extension[axes[i]] = starts[i];
ends_extension[axes[i]] = ends[i];
}
} else {
starts_extension = std::move(starts);
ends_extension = std::move(ends);
}
// prepare shape on XPU
std::vector<int> shape(shape_size, 0);
for (size_t i = 0; i < shape_size; ++i) {
shape[i] = in_dims[i];
}
auto& dev_ctx = ctx.template device_context<DeviceContext>();
const XPUType* in_data = reinterpret_cast<const XPUType*>(in->data<T>());
XPUType* out_data =
reinterpret_cast<XPUType*>(out->mutable_data<T>(ctx.GetPlace()));
int r = xpu::slice<XPUType>(dev_ctx.x_context(), in_data, out_data, shape,
starts_extension, ends_extension);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "slice");
}
} }
}; };
...@@ -168,10 +257,7 @@ class SliceGradXPUKernel : public framework::OpKernel<T> { ...@@ -168,10 +257,7 @@ class SliceGradXPUKernel : public framework::OpKernel<T> {
reinterpret_cast<XPUType*>(dinput->mutable_data<T>(ctx.GetPlace())); reinterpret_cast<XPUType*>(dinput->mutable_data<T>(ctx.GetPlace()));
int r = xpu::pad<XPUType>(dev_ctx.x_context(), dout_data, din_data, int r = xpu::pad<XPUType>(dev_ctx.x_context(), dout_data, din_data,
out_dims, pad_left, pad_right, XPUType(0)); out_dims, pad_left, pad_right, XPUType(0));
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_XDNN_SUCCESS(r, "pad");
r, XPU_SUCCESS,
platform::errors::External("XPU pad kernel return wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
} }
}; };
} // namespace operators } // namespace operators
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册