From 2d96801a0d6968d138fc7690b42517ab72a63bfe Mon Sep 17 00:00:00 2001
From: Leo Guo <58431564+ZibinGuo@users.noreply.github.com>
Date: Sun, 12 Jun 2022 19:59:21 +0800
Subject: [PATCH] Fix the bug of slice op and optimize the code style of
 generate_proposals_v2 op for kunlun. *test=kunlun (#43380)

---
 .../detection/generate_proposals_v2_op_xpu.cc |  69 ++----
 paddle/fluid/operators/slice_op_xpu.cc        | 212 ++++++++++++------
 2 files changed, 165 insertions(+), 116 deletions(-)

diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op_xpu.cc b/paddle/fluid/operators/detection/generate_proposals_v2_op_xpu.cc
index 28c94668ba7..48a592f2b54 100644
--- a/paddle/fluid/operators/detection/generate_proposals_v2_op_xpu.cc
+++ b/paddle/fluid/operators/detection/generate_proposals_v2_op_xpu.cc
@@ -16,11 +16,13 @@ limitations under the License. */
 
 #include
 #include
+
 #include
 #include
-#include "paddle/fluid/framework/mixed_vector.h"
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memory.h"
+#include "paddle/fluid/platform/device/device_wrapper.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace paddle {
@@ -91,37 +93,25 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
                      index_sort.data<int>(), scores_sel.data<T>(),
                      {static_cast<int>(scores.numel()), 1},
                      index_sort.numel(), 0);
-  PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
-                    platform::errors::External("XPU API(gather) return "
-                                               "wrong value[%d %s]",
-                                               r, XPUAPIErrorMsg[r]));
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
 
   r = xpu::gather<T>(dev_ctx.x_context(), bbox_deltas.data<T>(),
                      index_sort.data<int>(), bbox_sel.data<T>(),
                      {static_cast<int>(bbox_deltas.numel()) / 4, 4},
                      index_sort.numel(), 0);
-  PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
-                    platform::errors::External("XPU API(gather) return "
-                                               "wrong value[%d %s]",
-                                               r, XPUAPIErrorMsg[r]));
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
 
   r = xpu::gather<T>(dev_ctx.x_context(), anchors.data<T>(),
                      index_sort.data<int>(), anchor_sel.data<T>(),
                      {static_cast<int>(anchors.numel()) / 4, 4},
                      index_sort.numel(), 0);
-  PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
-                    platform::errors::External("XPU API(gather) return "
-                                               "wrong value[%d %s]",
-                                               r, XPUAPIErrorMsg[r]));
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
 
   r = xpu::gather<T>(dev_ctx.x_context(), variances.data<T>(),
                      index_sort.data<int>(), var_sel.data<T>(),
                      {static_cast<int>(variances.numel()) / 4, 4},
                      index_sort.numel(), 0);
-  PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
-                    platform::errors::External("XPU API(gather) return "
-                                               "wrong value[%d %s]",
-                                               r, XPUAPIErrorMsg[r]));
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
 
   int num = scores.numel();
   int pre_nms_num = (pre_nms_top_n <= 0 || pre_nms_top_n > num)
                         ? scores.numel()
@@ -137,10 +127,7 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
                       var_sel.data<T>(), bbox_sel.data<T>(),
                       proposals.data<T>(), pre_nms_num, !pixel_offset, true,
                       im_shape.data<T>());
-  PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
-                    platform::errors::External("XPU API(box_decoder) return "
-                                               "wrong value[%d %s]",
-                                               r, XPUAPIErrorMsg[r]));
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "box_decoder");
 
   // 3. filter
   Tensor keep_index, keep_num_t;
@@ -151,10 +138,7 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
                             im_shape.data<T>(), keep_index.data<int>(),
                             keep_num_t.data<int>(), pre_nms_num, min_size,
                             false, pixel_offset);
-  PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
-                                        "XPU API(remove_small_boxes) return "
-                                        "wrong value[%d %s]",
-                                        r, XPUAPIErrorMsg[r]));
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "remove_small_boxes");
   int keep_num;
   const auto xpu_place = dev_ctx.GetPlace();
   memory::Copy(platform::CPUPlace(), &keep_num, xpu_place,
@@ -176,18 +160,12 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
   r = xpu::gather<T>(dev_ctx.x_context(), proposals.data<T>(),
                      keep_index.data<int>(), proposals_filter.data<T>(),
                      {pre_nms_num, 4}, keep_num, 0);
-  PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
-                    platform::errors::External("XPU API(gather) return "
-                                               "wrong value[%d %s]",
-                                               r, XPUAPIErrorMsg[r]));
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
 
   r = xpu::gather<T>(dev_ctx.x_context(), scores_sel.data<T>(),
                      keep_index.data<int>(), scores_filter.data<T>(),
                      {pre_nms_num, 1}, keep_num, 0);
-  PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
-                    platform::errors::External("XPU API(gather) return "
-                                               "wrong value[%d %s]",
-                                               r, XPUAPIErrorMsg[r]));
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
 
   if (nms_thresh <= 0) {
     if (dev_ctx.x_context()->xpu_stream) {
@@ -201,10 +179,7 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
   r = xpu::nms<T>(dev_ctx.x_context(), proposals_filter.data<T>(), nullptr,
                   keep_index.data<int>(), 1, 1, keep_num, -1, nms_thresh, -1,
                   0, &nms_keep_num, pixel_offset);
-  PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
-                    platform::errors::External("XPU API(nms) return the"
-                                               "wrong value[%d %s]",
-                                               r, XPUAPIErrorMsg[r]));
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "nms");
   if (post_nms_top_n > 0 && post_nms_top_n < nms_keep_num) {
     keep_index.Resize({post_nms_top_n});
   } else {
@@ -217,17 +192,11 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
   r = xpu::gather<T>(dev_ctx.x_context(), proposals_filter.data<T>(),
                      keep_index.data<int>(), proposals_nms.data<T>(),
                      {keep_num, 4}, keep_index.numel(), 0);
-  PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
-                    platform::errors::External("XPU API(gather) return "
-                                               "wrong value[%d %s]",
-                                               r, XPUAPIErrorMsg[r]));
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
   r = xpu::gather<T>(dev_ctx.x_context(), scores_filter.data<T>(),
                      keep_index.data<int>(), scores_nms.data<T>(),
                      {keep_num, 1}, keep_index.numel(), 0);
-  PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
-                    platform::errors::External("XPU API(gather) return "
-                                               "wrong value[%d %s]",
-                                               r, XPUAPIErrorMsg[r]));
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
   if (dev_ctx.x_context()->xpu_stream) {
     dev_ctx.Wait();
   }
@@ -286,17 +255,11 @@ class XPUGenerateProposalsV2Kernel : public framework::OpKernel<T> {
     int r = xpu::transpose<T>(dev_ctx.x_context(), bbox_deltas->data<T>(),
                               bbox_deltas_swap.data<T>(),
                               {num, c_bbox, h_bbox, w_bbox}, axis);
-    PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
-                      platform::errors::External("XPU API(transpose) return "
-                                                 "wrong value[%d %s]",
-                                                 r, XPUAPIErrorMsg[r]));
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
     r = xpu::transpose<T>(dev_ctx.x_context(), scores->data<T>(),
                           scores_swap.data<T>(),
                           {num, c_score, h_score, w_score}, axis);
-    PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
-                      platform::errors::External("XPU API(transpose) return "
-                                                 "wrong value[%d %s]",
-                                                 r, XPUAPIErrorMsg[r]));
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
 
     anchors.Resize({anchors.numel() / 4, 4});
     variances.Resize({variances.numel() / 4, 4});
diff --git a/paddle/fluid/operators/slice_op_xpu.cc b/paddle/fluid/operators/slice_op_xpu.cc
index 8f2dfd38d49..10179e80434 100644
--- a/paddle/fluid/operators/slice_op_xpu.cc
+++ b/paddle/fluid/operators/slice_op_xpu.cc
@@ -19,6 +19,8 @@ limitations under the License. */
 #include
 
 #include "paddle/fluid/operators/slice_op.h"
+#include "paddle/fluid/platform/device/device_wrapper.h"
+#include "paddle/phi/kernels/funcs/slice_utils.h"
 #include "xpu/refactor/math.h"
 
 namespace paddle {
@@ -26,76 +28,163 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
+inline void DealTensorArray(const framework::ExecutionContext& ctx,
+                            const std::vector<int>& starts,
+                            const std::vector<int>& ends, bool out_is_array) {
+  auto in_array = ctx.Input<LoDTensorArray>("Input");
+  // If the input is LoDTensorArray, the rank of input is 1.
+  int in_size = in_array->size();
+  int start = starts[0] < 0 ? (starts[0] + in_size) : starts[0];
+  int end = ends[0] < 0 ? (ends[0] + in_size) : ends[0];
+
+  start = std::max(start, static_cast<int>(0));
+  end = std::max(end, static_cast<int>(0));
+  end = std::min(end, in_size);
+
+  if (starts[0] == -1 && end == 0) {
+    end = start + 1;
+  }
+
+  PADDLE_ENFORCE_GT(end, start,
+                    platform::errors::InvalidArgument(
+                        "Attr(ends) should be greater than attr(starts) in "
+                        "slice op. But received end = %d, start = %d.",
+                        ends[0], starts[0]));
+  int out_size = end - start;
+
+  if (out_is_array) {
+    auto out_array = ctx.Output<LoDTensorArray>("Out");
+    out_array->resize(out_size);
+
+    for (int i = 0; i < out_size; ++i) {
+      auto* out_tensor = &out_array->at(i);
+      auto in_tensor = in_array->at(i + start);
+      out_tensor->set_lod(in_tensor.lod());
+      if (in_tensor.memory_size() > 0) {
+        paddle::framework::TensorCopy(in_tensor, ctx.GetPlace(), out_tensor);
+      } else {
+        VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so "
+                    "nothing has been written to output array["
+                 << i << "].";
+      }
+    }
+  } else {
+    auto out = ctx.Output<Tensor>("Out");
+    auto in_tensor = in_array->at(start);
+    paddle::framework::TensorCopy(in_tensor, ctx.GetPlace(), out);
+  }
+}
+
 template <typename DeviceContext, typename T>
 class SliceXPUKernel : public framework::OpKernel<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
 
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto in = ctx.Input<Tensor>("Input");
-    auto out = ctx.Output<Tensor>("Out");
-    auto axes = ctx.Attr<std::vector<int>>("axes");
-    auto starts = ctx.Attr<std::vector<int>>("starts");
-    auto ends = ctx.Attr<std::vector<int>>("ends");
-    auto in_dims = in->dims();
-
-    // prepare starts, ends on XPU
-    int dim_value = 0, start = 0, end = 0;
-    // If a negative value is passed for any of the start or end indices,
-    // it represents number of elements before the end of that dimension.
-    // If the value passed to start or end is larger than the n
-    // (the number of elements in this dimension), it represents n.
-    for (size_t i = 0; i < axes.size(); ++i) {
-      dim_value = in_dims[axes[i]];
-      start = starts[i];
-      end = ends[i];
-      start = start < 0 ? (start + dim_value) : start;
-      end = end < 0 ? (end + dim_value) : end;
-      start = std::max(start, 0);
-      end = std::max(end, 0);
-      end = std::min(end, dim_value);
-      PADDLE_ENFORCE_GT(
-          end, start,
-          platform::errors::InvalidArgument("end should greater than start"));
-      starts[i] = start;
-      ends[i] = end;
-    }
-    size_t shape_size = in_dims.size();
-    // the slice XPU kernel require that the length of `start`, `end` must be
-    // equal
-    // to the dims size of input tensor, therefore, if shape_size > axes.size(),
-    // the `starts_extension` and `ends_extension` is necessary.
-    std::vector<int> starts_extension(shape_size, 0);
-    std::vector<int> ends_extension(shape_size, 0);
-    if (shape_size > axes.size()) {
-      for (size_t i = 0; i < shape_size; ++i) {
-        ends_extension[i] = in_dims[i];
-      }
-      for (size_t i = 0; i < axes.size(); ++i) {
-        starts_extension[axes[i]] = starts[i];
-        ends_extension[axes[i]] = ends[i];
-      }
-    } else {
-      starts_extension = std::move(starts);
-      ends_extension = std::move(ends);
+    const Variable* input_var = ctx.InputVar("Input");
+    Variable* out_var = ctx.OutputVar("Out");
+    bool input_is_array = input_var->IsType<LoDTensorArray>();
+    bool out_is_array = out_var->IsType<LoDTensorArray>();
+
+    auto axes_int = ctx.Attr<std::vector<int>>("axes");
+    auto starts_int = ctx.Attr<std::vector<int>>("starts");
+    auto ends_int = ctx.Attr<std::vector<int>>("ends");
+    std::vector<int> axes(axes_int.begin(), axes_int.end());
+    std::vector<int> starts(starts_int.begin(), starts_int.end());
+    std::vector<int> ends(ends_int.begin(), ends_int.end());
+
+    auto decrease_axis = ctx.Attr<std::vector<int>>("decrease_axis");
+    auto infer_flags = ctx.Attr<std::vector<int>>("infer_flags");
+
+    // Step 1: Get the accurate attribute value of starts and ends
+    auto starts_tensor_list = ctx.MultiInput<Tensor>("StartsTensorList");
+    if (ctx.HasInput("StartsTensor")) {
+      starts = GetDataFromTensor<int>(ctx.Input<Tensor>("StartsTensor"));
+    } else if (starts_tensor_list.size() > 0) {
+      starts = GetDataFromTensorList<int>(starts_tensor_list);
     }
-    // prepare shape on XPU
-    std::vector<int> shape(shape_size, 0);
-    for (size_t i = 0; i < shape_size; ++i) {
-      shape[i] = in_dims[i];
+    auto ends_tensor_list = ctx.MultiInput<Tensor>("EndsTensorList");
+    if (ctx.HasInput("EndsTensor")) {
+      ends = GetDataFromTensor<int>(ctx.Input<Tensor>("EndsTensor"));
+    } else if (ends_tensor_list.size() > 0) {
+      ends = GetDataFromTensorList<int>(ends_tensor_list);
     }
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    const XPUType* in_data = reinterpret_cast<const XPUType*>(in->data<T>());
-    XPUType* out_data =
-        reinterpret_cast<XPUType*>(out->mutable_data<T>(ctx.GetPlace()));
-    int r = xpu::slice<XPUType>(dev_ctx.x_context(), in_data, out_data, shape,
-                                starts_extension, ends_extension);
     PADDLE_ENFORCE_EQ(
-        r, XPU_SUCCESS,
-        platform::errors::External("XPU slice kernel return wrong value[%d %s]",
-                                   r, XPUAPIErrorMsg[r]));
+        starts.size(), axes.size(),
+        platform::errors::InvalidArgument(
+            "The size of starts must be equal to the size of axes."));
+    PADDLE_ENFORCE_EQ(
+        ends.size(), axes.size(),
+        platform::errors::InvalidArgument(
+            "The size of ends must be equal to the size of axes."));
+
+    // Step 2: Compute output
+    if (input_is_array) {
+      DealTensorArray(ctx, starts, ends, out_is_array);
+      return;
+    } else {
+      auto in = ctx.Input<Tensor>("Input");
+      auto out = ctx.Output<Tensor>("Out");
+
+      auto in_dims = in->dims();
+      auto out_dims = out->dims();
+      auto slice_dims = out_dims;
+
+      // 2.1 Infer output dims
+      for (size_t i = 0; i < axes.size(); ++i) {
+        // when start == -1 && end == start+1
+        if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) {
+          auto ret =
+              std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]);
+          if (ret != decrease_axis.end()) {
+            ends[i] = in_dims[axes[i]];
+          }
+        }
+      }
+
+      phi::funcs::CheckAndUpdateSliceAttrs<int>(in_dims, axes, &starts, &ends);
+      slice_dims = phi::funcs::GetSliceDims<int>(in_dims, axes, starts, ends,
+                                                 nullptr, nullptr);
+      out_dims = phi::funcs::GetDecreasedDims<int>(slice_dims, decrease_axis);
+
+      out->Resize(out_dims);
+
+      // 2.2 Get output
+      size_t shape_size = in_dims.size();
+      // the slice XPU kernel require that the length of `start`, `end` must be
+      // equal
+      // to the dims size of input tensor, therefore, if shape_size >
+      // axes.size(), the `starts_extension` and `ends_extension` is necessary.
+      std::vector<int> starts_extension(shape_size, 0);
+      std::vector<int> ends_extension(shape_size, 0);
+      if (shape_size > axes.size()) {
+        for (size_t i = 0; i < shape_size; ++i) {
+          ends_extension[i] = in_dims[i];
+        }
+        for (size_t i = 0; i < axes.size(); ++i) {
+          starts_extension[axes[i]] = starts[i];
+          ends_extension[axes[i]] = ends[i];
+        }
+      } else {
+        starts_extension = std::move(starts);
+        ends_extension = std::move(ends);
+      }
+
+      // prepare shape on XPU
+      std::vector<int> shape(shape_size, 0);
+      for (size_t i = 0; i < shape_size; ++i) {
+        shape[i] = in_dims[i];
+      }
+
+      auto& dev_ctx = ctx.template device_context<DeviceContext>();
+      const XPUType* in_data = reinterpret_cast<const XPUType*>(in->data<T>());
+      XPUType* out_data =
+          reinterpret_cast<XPUType*>(out->mutable_data<T>(ctx.GetPlace()));
+      int r = xpu::slice<XPUType>(dev_ctx.x_context(), in_data, out_data,
+                                  shape, starts_extension, ends_extension);
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "slice");
+    }
   }
 };
@@ -168,10 +257,7 @@ class SliceGradXPUKernel : public framework::OpKernel<T> {
         reinterpret_cast<XPUType*>(dinput->mutable_data<T>(ctx.GetPlace()));
     int r = xpu::pad<XPUType>(dev_ctx.x_context(), dout_data, din_data,
                               out_dims, pad_left, pad_right, XPUType(0));
-    PADDLE_ENFORCE_EQ(
-        r, XPU_SUCCESS,
-        platform::errors::External("XPU pad kernel return wrong value[%d %s]",
-                                   r, XPUAPIErrorMsg[r]));
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "pad");
   }
 };
 }  // namespace operators
-- 
GitLab
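
Note (not part of the patch): the slice change above comes down to index bookkeeping before calling xpu::slice, which expects one (start, end) pair per input dimension. Negative starts/ends are wrapped by the dimension size, clamped into range, and then padded out to the full rank of the input. The standalone C++ sketch below mirrors that logic with plain std::vector so it can be compiled and checked outside Paddle; the helper name NormalizeAndExtendSlice and its signature are illustrative only and do not exist in the Paddle code base, and the decrease_axis and tensor-array handling from the patch is intentionally omitted.

#include <algorithm>
#include <cassert>
#include <vector>

// Hypothetical helper, not a Paddle API: wrap negative indices, clamp them to
// the dimension size, then expand to one (start, end) pair per input
// dimension, which is the layout the XPU slice kernel consumes.
void NormalizeAndExtendSlice(const std::vector<int>& in_dims,
                             const std::vector<int>& axes,
                             const std::vector<int>& starts,
                             const std::vector<int>& ends,
                             std::vector<int>* starts_ext,
                             std::vector<int>* ends_ext) {
  const size_t rank = in_dims.size();
  // Default: keep every dimension whole, i.e. slice [0, dim).
  starts_ext->assign(rank, 0);
  *ends_ext = in_dims;

  for (size_t i = 0; i < axes.size(); ++i) {
    const int dim = in_dims[axes[i]];
    int start = starts[i] < 0 ? starts[i] + dim : starts[i];
    int end = ends[i] < 0 ? ends[i] + dim : ends[i];
    start = std::max(start, 0);
    end = std::min(std::max(end, 0), dim);
    assert(end > start && "ends must be greater than starts");
    (*starts_ext)[axes[i]] = start;
    (*ends_ext)[axes[i]] = end;
  }
}

int main() {
  // Slice axis 1 of a [2, 5, 3] shaped input with start = -4, end = 5:
  // -4 wraps to 1, so the slice covers indices 1..4 on that axis, while the
  // untouched axes keep their full [0, dim) range.
  std::vector<int> starts_ext, ends_ext;
  NormalizeAndExtendSlice({2, 5, 3}, {1}, {-4}, {5}, &starts_ext, &ends_ext);
  // starts_ext == {0, 1, 0}, ends_ext == {2, 5, 3}
  return 0;
}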