Unverified commit fc44087d authored by W whs, committed by GitHub

Add roi perspective transform op. (#13176)

* Add roi perspective transform.

* Add roi_perspective_transform_op.

* Fix code style.

* Add python api and fix doc.

* Fix API.spec

* Fix python api.

* Fix API.spec

* Move src to detection.
Parent f66d08c2
@@ -296,6 +296,7 @@ paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', '
paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral'))
paddle.fluid.layers.rpn_target_assign ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True))
paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None))
paddle.fluid.layers.roi_perspective_transform ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,))
paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True))
paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None))
paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
@@ -31,5 +31,6 @@ polygon_box_transform_op.cu)
detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc)
detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu)
# Export local libraries to parent
set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
static constexpr int kROISize = 4;
template <typename T>
bool GT_E(T a, T b) {
return (a > b) || fabs(a - b) < 1e-4;
}
template <typename T>
bool LT_E(T a, T b) {
return (a < b) || fabs(a - b) < 1e-4;
}
template <typename T>
bool GT(T a, T b) {
return (a - b) > 1e-4;
}
/*
* Check if (x, y) is inside the boundary of the RoI quadrilateral.
*/
template <typename T>
bool in_quad(T x, T y, T roi_x[], T roi_y[]) {
for (int i = 0; i < 4; i++) {
T xs = roi_x[i];
T ys = roi_y[i];
T xe = roi_x[(i + 1) % 4];
T ye = roi_y[(i + 1) % 4];
if (fabs(ys - ye) < 1e-4) {
if (fabs(y - ys) < 1e-4 && fabs(y - ye) < 1e-4 &&
GT_E<T>(x, std::min(xs, xe)) && LT_E<T>(x, std::max(xs, xe))) {
return true;
}
} else {
T intersec_x = (y - ys) * (xe - xs) / (ye - ys) + xs;
if (fabs(intersec_x - x) < 1e-4 && GT_E<T>(y, std::min(ys, ye)) &&
LT_E<T>(y, std::max(ys, ye))) {
return true;
}
}
}
int n_cross = 0;
for (int i = 0; i < 4; i++) {
T xs = roi_x[i];
T ys = roi_y[i];
T xe = roi_x[(i + 1) % 4];
T ye = roi_y[(i + 1) % 4];
if (fabs(ys - ye) < 1e-4) {
continue;
}
if (LT_E<T>(y, std::min(ys, ye)) || GT<T>(y, std::max(ys, ye))) {
continue;
}
T intersec_x = (y - ys) * (xe - xs) / (ye - ys) + xs;
if (fabs(intersec_x - x) < 1e-4) {
return true;
}
if (GT<T>(intersec_x, x)) {
n_cross++;
}
}
return (n_cross % 2 == 1);
}
/**
* Get the matrix of perspective transform.
*
* dx1 = x1 - x2
* dx2 = x3 - x2
* dx3 = x0 - x1 + x2 - x3
* dy1 = y1 - y2
* dy2 = y3 - y2
* dy3 = y0 - y1 + y2 - y3
*
* a11 = (x1 - x0 + a31 * (w - 1) * x1) / (w - 1)
* a12 = (x3 - x0 + a32 * (h - 1) * x3) / (h - 1)
* a13 = x0
* a21 = (y1 - y0 + a31 * (w - 1) * y1) / (w - 1)
* a22 = (y3 - y0 + a32 * (h - 1) * y3) / (h - 1)
* a23 = y0
* a31 = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) / (w - 1)
* a32 = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) / (h - 1)
* a33 = 1
*
*/
template <typename T>
void get_transform_matrix(const int transformed_width,
const int transformed_height, T roi_x[], T roi_y[],
T matrix[]) {
T x0 = roi_x[0];
T x1 = roi_x[1];
T x2 = roi_x[2];
T x3 = roi_x[3];
T y0 = roi_y[0];
T y1 = roi_y[1];
T y2 = roi_y[2];
T y3 = roi_y[3];
// Estimate the height and width of RoI
T len1 = sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1));
T len2 = sqrt((x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2));
T len3 = sqrt((x2 - x3) * (x2 - x3) + (y2 - y3) * (y2 - y3));
T len4 = sqrt((x3 - x0) * (x3 - x0) + (y3 - y0) * (y3 - y0));
T estimated_height = (len2 + len4) / 2.0;
T estimated_width = (len1 + len3) / 2.0;
// Get the normalized height and normalized width
int normalized_height = transformed_height;
int normalized_width =
std::round(estimated_width * (normalized_height - 1) / estimated_height) +
1;
normalized_width = std::min(normalized_width, transformed_width);
T dx1 = x1 - x2;
T dx2 = x3 - x2;
T dx3 = x0 - x1 + x2 - x3;
T dy1 = y1 - y2;
T dy2 = y3 - y2;
T dy3 = y0 - y1 + y2 - y3;
matrix[6] = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) /
(normalized_width - 1);
matrix[7] = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) /
(normalized_height - 1);
matrix[8] = 1;
matrix[3] = (y1 - y0 + matrix[6] * (normalized_width - 1) * y1) /
(normalized_width - 1);
matrix[4] = (y3 - y0 + matrix[7] * (normalized_height - 1) * y3) /
(normalized_height - 1);
matrix[5] = y0;
matrix[0] = (x1 - x0 + matrix[6] * (normalized_width - 1) * x1) /
(normalized_width - 1);
matrix[1] = (x3 - x0 + matrix[7] * (normalized_height - 1) * x3) /
(normalized_height - 1);
matrix[2] = x0;
}
/**
* Get the source coordinates in the input feature map.
*
* (u, v, w)^T = matrix * (out_w, out_h, 1)^T
*
* in_w = u / w
* in_h = v / w
*
*/
template <typename T>
void get_source_coords(T matrix[], int out_w, int out_h, T* in_w, T* in_h) {
T u = matrix[0] * out_w + matrix[1] * out_h + matrix[2];
T v = matrix[3] * out_w + matrix[4] * out_h + matrix[5];
T w = matrix[6] * out_w + matrix[7] * out_h + matrix[8];
in_w[0] = u / w;
in_h[0] = v / w;
}
/**
* Perform bilinear interpolation in the input feature map.
*/
template <typename T>
void bilinear_interpolate(const T* in_data, const int channels, const int width,
const int height, int in_n, int in_c, T in_w, T in_h,
T* val) {
// Deal with cases where the source coords fall outside the feature map boundary
if (GT<T>(-0.5, in_w) || GT<T>(in_w, width - 0.5) || GT<T>(-0.5, in_h) ||
GT<T>(in_h, height - 0.5)) {
// empty
val[0] = 0.0;
return;
}
if (GT<T>(0, in_w)) {
in_w = 0;
}
if (GT<T>(0, in_h)) {
in_h = 0;
}
int in_w_floor = floor(in_w);
int in_h_floor = floor(in_h);
int in_w_ceil;
int in_h_ceil;
if (GT_E<T>(in_w_floor, width - 1)) {
in_w_ceil = in_w_floor = width - 1;
in_w = static_cast<T>(in_w_floor);
} else {
in_w_ceil = in_w_floor + 1;
}
if (GT_E<T>(in_h_floor, height - 1)) {
in_h_ceil = in_h_floor = height - 1;
in_h = static_cast<T>(in_h_floor);
} else {
in_h_ceil = in_h_floor + 1;
}
T w_floor = in_w - in_w_floor;
T h_floor = in_h - in_h_floor;
T w_ceil = 1 - w_floor;
T h_ceil = 1 - h_floor;
const T* data = in_data + (in_n * channels + in_c) * height * width;
// Do bilinear interpolation
T v1 = data[in_h_floor * width + in_w_floor];
T v2 = data[in_h_ceil * width + in_w_floor];
T v3 = data[in_h_ceil * width + in_w_ceil];
T v4 = data[in_h_floor * width + in_w_ceil];
T w1 = w_ceil * h_ceil;
T w2 = w_ceil * h_floor;
T w3 = w_floor * h_floor;
T w4 = w_floor * h_ceil;
val[0] = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
}
template <typename T>
class CPUROIPerspectiveTransformOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X");
auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
auto* out = ctx.Output<framework::Tensor>("Out");
auto transformed_height = ctx.Attr<int>("transformed_height");
auto transformed_width = ctx.Attr<int>("transformed_width");
auto spatial_scale = ctx.Attr<float>("spatial_scale");
auto in_dims = in->dims();
int channels = in_dims[1];
int in_height = in_dims[2];
int in_width = in_dims[3];
int rois_num = rois->dims()[0];
const T* input_data = in->data<T>();
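// Map each ROI to the image it came from via the LoD offsets:
// lod = {0, n_0, n_0 + n_1, ...}, so image i owns ROIs [lod[i], lod[i + 1]).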
framework::Tensor roi2image;
roi2image.Resize({rois_num});
int* roi2image_data = roi2image.mutable_data<int>(ctx.GetPlace());
auto lod = rois->lod().back();
for (int i = 0; i < lod.size() - 1; ++i) {
for (int j = lod[i]; j < lod[i + 1]; ++j) {
roi2image_data[j] = i;
}
}
T* output_data = out->mutable_data<T>(ctx.GetPlace());
const T* rois_data = rois->data<T>();
for (int n = 0; n < rois_num; ++n) {
const T* n_rois = rois_data + n * 8;
T roi_x[4];
T roi_y[4];
for (int k = 0; k < 4; ++k) {
roi_x[k] = n_rois[2 * k] * spatial_scale;
roi_y[k] = n_rois[2 * k + 1] * spatial_scale;
}
int image_id = roi2image_data[n];
// Get transform matrix
T transform_matrix[9];
get_transform_matrix<T>(transformed_width, transformed_height, roi_x,
roi_y, transform_matrix);
for (int c = 0; c < channels; ++c) {
for (int out_h = 0; out_h < transformed_height; ++out_h) {
for (int out_w = 0; out_w < transformed_width; ++out_w) {
int out_index =
n * channels * transformed_height * transformed_width +
c * transformed_height * transformed_width +
out_h * transformed_width + out_w;
T in_w, in_h;
get_source_coords<T>(transform_matrix, out_w, out_h, &in_w, &in_h);
if (in_quad<T>(in_w, in_h, roi_x, roi_y)) {
if (GT<T>(-0.5, in_w) ||
GT<T>(in_w, static_cast<T>(in_width - 0.5)) ||
GT<T>(-0.5, in_h) ||
GT<T>(in_h, static_cast<T>(in_height - 0.5))) {
output_data[out_index] = 0.0;
} else {
bilinear_interpolate(input_data, channels, in_width, in_height,
image_id, c, in_w, in_h,
output_data + out_index);
}
} else {
output_data[out_index] = 0.0;
}
}
}
}
}
}
};
template <typename T>
T get_feature_gradient(T xs, T ys, int w, int h, const int width,
const int height) {
if (GT<T>(-0.5, xs) || GT<T>(xs, width - 0.5) || GT<T>(-0.5, ys) ||
GT<T>(ys, height - 0.5)) {
return 0;
}
if (GT<T>(0, xs)) {
xs = 0;
}
if (GT<T>(0, ys)) {
ys = 0;
}
int xs_floor = floor(xs);
int ys_floor = floor(ys);
int xs_ceil;
int ys_ceil;
if (GT_E(xs_floor, width - 1)) {
xs_ceil = xs_floor = width - 1;
xs = static_cast<T>(xs_floor);
} else {
xs_ceil = xs_floor + 1;
}
if (GT_E(ys_floor, height - 1)) {
ys_ceil = ys_floor = height - 1;
ys = static_cast<T>(ys_floor);
} else {
ys_ceil = ys_floor + 1;
}
T weight = 0;
if (w == xs_floor) {
if (h == ys_floor) {
weight = (w + 1 - xs) * (h + 1 - ys);
} else if (h == ys_ceil) {
weight = (w + 1 - xs) * (ys + 1 - h);
}
} else if (w == xs_ceil) {
if (h == ys_floor) {
weight = (xs + 1 - w) * (h + 1 - ys);
} else if (h == ys_ceil) {
weight = (xs + 1 - w) * (ys + 1 - h);
}
}
return weight;
}
template <typename T>
class CPUROIPerspectiveTransformGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X");
auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
auto* out_grad =
ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
auto transformed_height = ctx.Attr<int>("transformed_height");
auto transformed_width = ctx.Attr<int>("transformed_width");
auto spatial_scale = ctx.Attr<float>("spatial_scale");
auto in_dims = in->dims();
int batch_size = in_dims[0];
int channels = in_dims[1];
int in_height = in_dims[2];
int in_width = in_dims[3];
int rois_num = rois->dims()[0];
T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
const T* out_grad_data = out_grad->data<T>();
const T* rois_data = rois->data<T>();
framework::Tensor roi2image;
roi2image.Resize({rois_num});
int* roi2image_data = roi2image.mutable_data<int>(ctx.GetPlace());
auto lod = rois->lod().back();
for (int i = 0; i < lod.size() - 1; ++i) {
for (int j = lod[i]; j < lod[i + 1]; ++j) {
roi2image_data[j] = i;
}
}
for (int n = 0; n < batch_size; ++n) {
for (int c = 0; c < channels; ++c) {
for (int in_h = 0; in_h < in_height; ++in_h) {
for (int in_w = 0; in_w < in_width; ++in_w) {
T gradient = 0.0;
for (int roi_idx = lod[n]; roi_idx < lod[n + 1]; ++roi_idx) {
const T* rois = rois_data + roi_idx * 8;
T roi_x[4];
T roi_y[4];
for (int k = 0; k < 4; ++k) {
roi_x[k] = rois[2 * k] * spatial_scale;
roi_y[k] = rois[2 * k + 1] * spatial_scale;
}
// Get transform matrix
T matrix[9];
get_transform_matrix<T>(transformed_width, transformed_height,
roi_x, roi_y, matrix);
const T* out_grad_ptr = out_grad_data +
(roi_idx * channels + c) *
transformed_height *
transformed_width;
for (int out_h = 0; out_h < transformed_height; ++out_h) {
for (int out_w = 0; out_w < transformed_width; ++out_w) {
T src_w;
T src_h;
get_source_coords<T>(matrix, out_w, out_h, &src_w, &src_h);
if (in_quad<T>(src_w, src_h, roi_x, roi_y)) {
if (GT<T>(-0.5, src_w) ||
GT<T>(src_w, static_cast<T>(in_width - 0.5)) ||
GT<T>(-0.5, src_h) ||
GT<T>(src_h, static_cast<T>(in_height - 0.5))) {
continue;
}
T weight = get_feature_gradient<T>(src_w, src_h, in_w, in_h,
in_width, in_height);
gradient +=
out_grad_ptr[out_h * transformed_width + out_w] *
weight;
}
}
}
}
int out_idx = (n * channels + c) * in_height * in_width +
in_h * in_width + in_w;
in_grad_data[out_idx] = gradient;
}
}
}
}
}
};
class ROIPerspectiveTransformOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of ROIPerspectiveTransformOp should not be null.");
PADDLE_ENFORCE(
ctx->HasInput("ROIs"),
"Input(ROIs) of ROIPerspectiveTransformOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("Out"),
"Output(Out) of ROIPerspectiveTransformOp should not be null.");
auto input_dims = ctx->GetInputDim("X");
auto rois_dims = ctx->GetInputDim("ROIs");
PADDLE_ENFORCE(input_dims.size() == 4,
"Input(X) must be a 4-D tensor in NCHW format.");
PADDLE_ENFORCE(rois_dims.size() == 2,
"ROIs should be a 2-D LoDTensor of shape (num_rois, 8), "
"given as [[x1, y1, x2, y2, x3, y3, x4, y4], ...].");
PADDLE_ENFORCE(rois_dims[1] == 8,
"ROIs should be a 2-D LoDTensor of shape (num_rois, 8), "
"given as [[x1, y1, x2, y2, x3, y3, x4, y4], ...].");
int transformed_height = ctx->Attrs().Get<int>("transformed_height");
int transformed_width = ctx->Attrs().Get<int>("transformed_width");
float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
PADDLE_ENFORCE_GT(transformed_height, 0,
"The transformed output height must be greater than 0.");
PADDLE_ENFORCE_GT(transformed_width, 0,
"The transformed output width must be greater than 0.");
PADDLE_ENFORCE_GT(spatial_scale, 0.0f,
"The spatial scale must be greater than 0.");
std::vector<int64_t> out_dims_v({rois_dims[0], // num_rois
input_dims[1], // channels
static_cast<int64_t>(transformed_height),
static_cast<int64_t>(transformed_width)});
auto out_dims = framework::make_ddim(out_dims_v);
ctx->SetOutputDim("Out", out_dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
ctx.device_context());
}
};
class ROIPerspectiveTransformGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"The gradient of Out should not be null.");
PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")),
"The gradient of X should not be null.");
ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X"));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
ctx.device_context());
}
};
class ROIPerspectiveTransformOpMaker
: public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensor), "
"the input of ROIPerspectiveTransformOp. "
"The format of input tensor is NCHW. Where N is batch size, "
"C is the number of input channels, "
"H is the height of the feature, and "
"W is the width of the feature.");
AddInput("ROIs",
"(LoDTensor), "
"ROIs (Regions of Interest) to be transformed. "
"should be a 2-D LoDTensor of shape (num_rois, 8)"
"given as [[x1, y1, x2, y2, x3, y3, x4, y4], ...]."
"(x1, y1) is the top left coordinates, and "
"(x2, y2) is the top right coordinates, and"
"(x3, y3) is the bottom right coordinates, and"
"(x4, y4) is the bottom left coordinates.");
AddOutput(
"Out",
"(Tensor), "
"The output of ROIPerspectiveTransformOp is a 4-D tensor with shape "
"(num_rois, channels, transformed_h, transformed_w).");
AddAttr<float>("spatial_scale",
"(float, default 1.0), "
"Spatial scale factor to scale ROI coords.")
.SetDefault(1.0);
AddAttr<int>("transformed_height",
"(int, default 1), "
"The height of transformed output.")
.SetDefault(1);
AddAttr<int>("transformed_width",
"(int, default 1), "
"The width of transformed output.")
.SetDefault(1);
AddComment(R"DOC(
**ROIPerspectiveTransform Operator**
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(roi_perspective_transform, ops::ROIPerspectiveTransformOp,
ops::ROIPerspectiveTransformOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(roi_perspective_transform_grad,
ops::ROIPerspectiveTransformGradOp);
REGISTER_OP_CPU_KERNEL(roi_perspective_transform,
ops::CPUROIPerspectiveTransformOpKernel<float>);
REGISTER_OP_CPU_KERNEL(roi_perspective_transform_grad,
ops::CPUROIPerspectiveTransformGradOpKernel<float>);
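A quick numeric check of the homography described in the comment block above; this is a minimal NumPy sketch, not the operator's code path (the op additionally normalizes the output width from the ROI's estimated aspect ratio before building the matrix), and all names in it are illustrative:

import numpy as np

def transform_matrix(w, h, xs, ys):
    # Corners ordered (x0, y0) top-left, (x1, y1) top-right,
    # (x2, y2) bottom-right, (x3, y3) bottom-left.
    x0, x1, x2, x3 = xs
    y0, y1, y2, y3 = ys
    dx1, dx2, dx3 = x1 - x2, x3 - x2, x0 - x1 + x2 - x3
    dy1, dy2, dy3 = y1 - y2, y3 - y2, y0 - y1 + y2 - y3
    det = dx1 * dy2 - dx2 * dy1
    a31 = (dx3 * dy2 - dx2 * dy3) / det / (w - 1)
    a32 = (dx1 * dy3 - dx3 * dy1) / det / (h - 1)
    return np.array(
        [[(x1 - x0 + a31 * (w - 1) * x1) / (w - 1),
          (x3 - x0 + a32 * (h - 1) * x3) / (h - 1), x0],
         [(y1 - y0 + a31 * (w - 1) * y1) / (w - 1),
          (y3 - y0 + a32 * (h - 1) * y3) / (h - 1), y0],
         [a31, a32, 1.0]])

# The four output corners must land exactly on the four ROI corners.
M = transform_matrix(3, 2, xs=[2.0, 6.0, 6.0, 2.0], ys=[1.0, 1.0, 4.0, 4.0])
for (ow, oh), corner in [((0, 0), (2, 1)), ((2, 0), (6, 1)),
                         ((2, 1), (6, 4)), ((0, 1), (2, 4))]:
    u, v, w_ = M.dot([ow, oh, 1.0])
    assert np.allclose([u / w_, v / w_], corner)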
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle {
namespace operators {
// CUDA: index helpers
#define idx4_4(index, d1, d2, d3, d4) (index % d4)
#define idx4_3(index, d1, d2, d3, d4) ((index / d4) % d3)
#define idx4_2(index, d1, d2, d3, d4) ((index / d4 / d3) % d2)
#define idx4_1(index, d1, d2, d3, d4) ((index / d4 / d3 / d2) % d1)
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
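// For reference: with (d1, d2, d3, d4) = (N, C, H, W) and
// index = ((n * C + c) * H + h) * W + w, idx4_4 recovers w, idx4_3
// recovers h, idx4_2 recovers c, and idx4_1 recovers n. E.g. for
// N=2, C=3, H=4, W=5 and (n, c, h, w) = (1, 2, 3, 4), index = 119:
// 119 % 5 = 4, (119 / 5) % 4 = 3, (119 / 5 / 4) % 3 = 2, (119 / 5 / 4 / 3) % 2 = 1.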
template <typename T>
__device__ bool GT_E(T a, T b) {
return (a > b) || fabs(a - b) < 1e-4;
}
template <typename T>
__device__ bool LT_E(T a, T b) {
return (a < b) || fabs(a - b) < 1e-4;
}
template <typename T>
__device__ bool GT(T a, T b) {
return (a - b) > 1e-4;
}
template <typename T>
__device__ T max(T a, T b) {
return a > b ? a : b;
}
template <typename T>
__device__ T min(T a, T b) {
return a < b ? a : b;
}
/*
* Check if (x, y) is inside the boundary of the RoI quadrilateral.
*/
template <typename T>
__device__ bool in_quad(T x, T y, T roi_x[], T roi_y[]) {
for (int i = 0; i < 4; i++) {
T start_w = roi_x[i];
T start_h = roi_y[i];
T end_w = roi_x[(i + 1) % 4];
T end_h = roi_y[(i + 1) % 4];
if (fabs(start_h - end_h) < 1e-4) {
if (fabs(y - start_h) < 1e-4 && fabs(y - end_h) < 1e-4 &&
GT_E<T>(x, min<T>(start_w, end_w)) &&
LT_E<T>(x, max<T>(start_w, end_w))) {
return true;
}
} else {
T intersec_x =
(y - start_h) * (end_w - start_w) / (end_h - start_h) + start_w;
if (fabs(intersec_x - x) < 1e-4 && GT_E(y, min<T>(start_h, end_h)) &&
LT_E<T>(y, max<T>(start_h, end_h))) {
return true;
}
}
}
int n_cross = 0;
for (int i = 0; i < 4; i++) {
T start_w = roi_x[i];
T start_h = roi_y[i];
T end_w = roi_x[(i + 1) % 4];
T end_h = roi_y[(i + 1) % 4];
if (fabs(start_h - end_h) < 1e-4) {
continue;
}
if (LT_E<T>(y, min<T>(start_h, end_h)) ||
GT<T>(y, max<T>(start_h, end_h))) {
continue;
}
T intersec_x =
(y - start_h) * (end_w - start_w) / (end_h - start_h) + start_w;
if (fabs(intersec_x - x) < 1e-4) {
return true;
}
if (GT<T>(intersec_x, x)) {
n_cross++;
}
}
return (n_cross % 2 == 1);
}
/**
* Perform bilinear interpolation in the input feature map.
*/
template <typename T>
__device__ void bilinear_interpolate(const T* in_data, const int channels,
const int width, const int height,
int in_n, int in_c, T in_w, T in_h,
T* val) {
// Deal with cases where the source coords fall outside the feature map boundary
if (GT<T>(-0.5, in_w) || GT<T>(in_w, width - 0.5) || GT<T>(-0.5, in_h) ||
GT<T>(in_h, height - 0.5)) {
val[0] = 0.0;
return;
}
if (GT<T>(0, in_w)) {
in_w = 0;
}
if (GT<T>(0, in_h)) {
in_h = 0;
}
int in_w_floor = floor(in_w);
int in_h_floor = floor(in_h);
int in_w_ceil;
int in_h_ceil;
if (GT_E<T>(in_w_floor, width - 1)) {
in_w_ceil = in_w_floor = width - 1;
in_w = static_cast<T>(in_w_floor);
} else {
in_w_ceil = in_w_floor + 1;
}
if (GT_E<T>(in_h_floor, height - 1)) {
in_h_ceil = in_h_floor = height - 1;
in_h = static_cast<T>(in_h_floor);
} else {
in_h_ceil = in_h_floor + 1;
}
T w_floor = in_w - in_w_floor;
T h_floor = in_h - in_h_floor;
T w_ceil = 1 - w_floor;
T h_ceil = 1 - h_floor;
const T* data = in_data + (in_n * channels + in_c) * height * width;
// Do bilinear interpolation
T v1 = data[in_h_floor * width + in_w_floor];
T v2 = data[in_h_ceil * width + in_w_floor];
T v3 = data[in_h_ceil * width + in_w_ceil];
T v4 = data[in_h_floor * width + in_w_ceil];
T w1 = w_ceil * h_ceil;
T w2 = w_ceil * h_floor;
T w3 = w_floor * h_floor;
T w4 = w_floor * h_ceil;
val[0] = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
}
/**
* Get the source coordinates in the input feature map.
*
* (u, v, w)^T = matrix * (out_w, out_h, 1)^T
*
* in_w = u / w
* in_h = v / w
*
*/
template <typename T>
__device__ void get_source_coords(T matrix[], int out_w, int out_h, T* in_w,
T* in_h) {
T u = matrix[0] * out_w + matrix[1] * out_h + matrix[2];
T v = matrix[3] * out_w + matrix[4] * out_h + matrix[5];
T w = matrix[6] * out_w + matrix[7] * out_h + matrix[8];
in_w[0] = u / w;
in_h[0] = v / w;
}
/**
* Get the matrix of perspective transform.
*
* dx1 = x1 - x2
* dx2 = x3 - x2
* dx3 = x0 - x1 + x2 - x3
* dy1 = y1 - y2
* dy2 = y3 - y2
* dy3 = y0 - y1 + y2 - y3
*
* a11 = (x1 - x0 + a31 * (w - 1) * x1) / (w - 1)
* a12 = (x3 - x0 + a32 * (h - 1) * x3) / (h - 1)
* a13 = x0
* a21 = (y1 - y0 + a31 * (w - 1) * y1) / (w - 1)
* a22 = (y3 - y0 + a32 * (h - 1) * y3) / (h - 1)
* a23 = y0
* a31 = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) / (w - 1)
* a32 = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) / (h - 1)
* a33 = 1
*
*/
template <typename T>
__device__ void get_transform_matrix(const int transformed_width,
const int transformed_height, T roi_x[],
T roi_y[], T matrix[]) {
T x0 = roi_x[0];
T x1 = roi_x[1];
T x2 = roi_x[2];
T x3 = roi_x[3];
T y0 = roi_y[0];
T y1 = roi_y[1];
T y2 = roi_y[2];
T y3 = roi_y[3];
// Estimate the height and width of RoI
T len1 = sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1));
T len2 = sqrt((x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2));
T len3 = sqrt((x2 - x3) * (x2 - x3) + (y2 - y3) * (y2 - y3));
T len4 = sqrt((x3 - x0) * (x3 - x0) + (y3 - y0) * (y3 - y0));
T estimated_height = (len2 + len4) / 2.0;
T estimated_width = (len1 + len3) / 2.0;
// Get the normalized height and normalized width
int normalized_height = transformed_height;
int normalized_width =
round(estimated_width * (normalized_height - 1) / estimated_height) + 1;
normalized_width = min(normalized_width, transformed_width);
T dx1 = x1 - x2;
T dx2 = x3 - x2;
T dx3 = x0 - x1 + x2 - x3;
T dy1 = y1 - y2;
T dy2 = y3 - y2;
T dy3 = y0 - y1 + y2 - y3;
matrix[6] = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) /
(normalized_width - 1);
matrix[7] = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) /
(normalized_height - 1);
matrix[8] = 1;
matrix[3] = (y1 - y0 + matrix[6] * (normalized_width - 1) * y1) /
(normalized_width - 1);
matrix[4] = (y3 - y0 + matrix[7] * (normalized_height - 1) * y3) /
(normalized_height - 1);
matrix[5] = y0;
matrix[0] = (x1 - x0 + matrix[6] * (normalized_width - 1) * x1) /
(normalized_width - 1);
matrix[1] = (x3 - x0 + matrix[7] * (normalized_height - 1) * x3) /
(normalized_height - 1);
matrix[2] = x0;
}
template <typename T>
__global__ void RoiTransformKernel(const float* input_data,
const float* rois_data,
const int* roi2image_data, int num_rois,
int in_height, int in_width, int channels,
int transformed_height,
int transformed_width, float spatial_scale,
T* output_data) {
int output_size =
num_rois * transformed_height * transformed_width * channels;
CUDA_1D_KERNEL_LOOP(index, output_size) {
// (n, c, out_h, out_w) is an element in the transformed output
int out_w = idx4_4(index, num_rois, channels, transformed_height,
transformed_width);
int out_h = idx4_3(index, num_rois, channels, transformed_height,
transformed_width);
int c = idx4_2(index, num_rois, channels, transformed_height,
transformed_width);
int n = idx4_1(index, num_rois, channels, transformed_height,
transformed_width);
auto bottom_rois = rois_data + n * 8;
int roi_batch_ind = bottom_rois[0];
T roi_x[4];
T roi_y[4];
for (int k = 0; k < 4; ++k) {
roi_x[k] = bottom_rois[2 * k] * spatial_scale;
roi_y[k] = bottom_rois[2 * k + 1] * spatial_scale;
}
// Get transform matrix
T matrix[9];
get_transform_matrix<T>(transformed_width, transformed_height, roi_x, roi_y,
matrix);
// Get source coords
T in_w;
T in_h;
get_source_coords<T>(matrix, out_w, out_h, &in_w, &in_h);
if (in_quad<T>(in_w, in_h, roi_x, roi_y)) {
if (GT<T>(-0.5, in_w) || GT<T>(in_w, static_cast<T>(in_width - 0.5)) ||
GT<T>(-0.5, in_h) || GT<T>(in_h, static_cast<T>(in_height - 0.5))) {
// Set output to zero if the source coords fall outside the input image
output_data[index] = 0.0;
} else {
// Perform bilinear interpolation
int in_n = roi2image_data[n];
bilinear_interpolate<T>(input_data, channels, in_width, in_height, in_n,
c, in_w, in_h, output_data + index);
}
} else {
// Set output to zero if the source coords fall outside the quad
output_data[index] = 0.0;
}
}
}
template <typename T>
class CUDAROIPerspectiveTransformOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X");
auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
auto* out = ctx.Output<framework::Tensor>("Out");
auto transformed_height = ctx.Attr<int>("transformed_height");
auto transformed_width = ctx.Attr<int>("transformed_width");
auto spatial_scale = ctx.Attr<float>("spatial_scale");
auto in_dims = in->dims();
int batch_size = in_dims[0];
int channels = in_dims[1];
int in_height = in_dims[2];
int in_width = in_dims[3];
int rois_num = rois->dims()[0];
const T* input_data = in->data<T>();
T* output_data = out->mutable_data<T>(ctx.GetPlace());
const T* rois_data = rois->data<T>();
framework::Tensor roi2image;
framework::Tensor roi2image_dev;
roi2image.Resize({rois_num});
int* roi2image_data = roi2image.mutable_data<int>(platform::CPUPlace());
auto lod = rois->lod().back();
for (int i = 0; i < lod.size() - 1; ++i) {
for (int j = lod[i]; j < lod[i + 1]; ++j) {
roi2image_data[j] = i;
}
}
TensorCopySync(roi2image, ctx.GetPlace(), &roi2image_dev);
int out_size = rois_num * transformed_height * transformed_width * channels;
auto stream = ctx.cuda_device_context().stream();
int block = 512;
int grid = (out_size + block - 1) / block;
RoiTransformKernel<T><<<grid, block, 0, stream>>>(
input_data, rois_data, roi2image_dev.data<int>(), rois_num, in_height,
in_width, channels, transformed_height, transformed_width,
spatial_scale, output_data);
}
};
template <typename T>
__device__ T get_feature_gradient(T xs, T ys, int w, int h, const int width,
const int height) {
if (GT<T>(-0.5, xs) || GT<T>(xs, width - 0.5) || GT<T>(-0.5, ys) ||
GT<T>(ys, height - 0.5)) {
return 0;
}
if (GT<T>(0, xs)) {
xs = 0;
}
if (GT<T>(0, ys)) {
ys = 0;
}
int xs_floor = floor(xs);
int ys_floor = floor(ys);
int xs_ceil;
int ys_ceil;
if (GT_E<T>(xs_floor, width - 1)) {
xs_ceil = xs_floor = width - 1;
xs = static_cast<T>(xs_floor);
} else {
xs_ceil = xs_floor + 1;
}
if (GT_E(ys_floor, height - 1)) {
ys_ceil = ys_floor = height - 1;
ys = static_cast<T>(ys_floor);
} else {
ys_ceil = ys_floor + 1;
}
T weight = 0;
if (w == xs_floor) {
if (h == ys_floor) {
weight = (w + 1 - xs) * (h + 1 - ys);
} else if (h == ys_ceil) {
weight = (w + 1 - xs) * (ys + 1 - h);
}
} else if (w == xs_ceil) {
if (h == ys_floor) {
weight = (xs + 1 - w) * (h + 1 - ys);
} else if (h == ys_ceil) {
weight = (xs + 1 - w) * (ys + 1 - h);
}
}
return weight;
}
template <typename T>
__global__ void RoiTransformGradKernel(
const size_t* lod, const T* rois_data, int batch_size, int num_rois,
int in_height, int in_width, int channels, int transformed_height,
int transformed_width, float spatial_scale, const T* out_grad_data,
T* in_grad_data) {
int input_size = batch_size * in_height * in_width * channels;
CUDA_1D_KERNEL_LOOP(index, input_size) {
// (n, c, h, w) coords in input
int in_w = idx4_4(index, batch_size, channels, in_height, in_width);
int in_h = idx4_3(index, batch_size, channels, in_height, in_width);
int c = idx4_2(index, batch_size, channels, in_height, in_width);
int n = idx4_1(index, batch_size, channels, in_height, in_width);
T gradient = 0.0;
// Accumulate gradient over all RoIs that interpolated this element
for (int roi_idx = lod[n]; roi_idx < lod[n + 1]; ++roi_idx) {
const T* rois = rois_data + roi_idx * 8;
T roi_x[4];
T roi_y[4];
for (int k = 0; k < 4; ++k) {
roi_x[k] = rois[2 * k] * spatial_scale;
roi_y[k] = rois[2 * k + 1] * spatial_scale;
}
// Get transform matrix
T matrix[9];
get_transform_matrix<T>(transformed_width, transformed_height, roi_x,
roi_y, matrix);
const T* out_grad_ptr =
out_grad_data +
(roi_idx * channels + c) * transformed_height * transformed_width;
for (int out_h = 0; out_h < transformed_height; ++out_h) {
for (int out_w = 0; out_w < transformed_width; ++out_w) {
T src_w;
T src_h;
get_source_coords<T>(matrix, out_w, out_h, &src_w, &src_h);
if (in_quad<T>(src_w, src_h, roi_x, roi_y)) {
if (GT<T>(-0.5, src_w) ||
GT<T>(src_w, static_cast<T>(in_width - 0.5)) ||
GT<T>(-0.5, src_h) ||
GT<T>(src_h, static_cast<T>(in_height - 0.5))) {
continue;
}
T weight = get_feature_gradient<T>(src_w, src_h, in_w, in_h,
in_width, in_height);
gradient +=
out_grad_ptr[out_h * transformed_width + out_w] * weight;
}
}
}
}
in_grad_data[index] = gradient;
}
}
template <typename T>
class CUDAROIPerspectiveTransformGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X");
auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
auto* out_grad =
ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
auto transformed_height = ctx.Attr<int>("transformed_height");
auto transformed_width = ctx.Attr<int>("transformed_width");
auto spatial_scale = ctx.Attr<float>("spatial_scale");
auto in_dims = in->dims();
int batch_size = in_dims[0];
int channels = in_dims[1];
int in_height = in_dims[2];
int in_width = in_dims[3];
int rois_num = rois->dims()[0];
T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
const T* out_grad_data = out_grad->data<T>();
const T* rois_data = rois->data<T>();
auto lod = rois->lod().back();
auto lod_data = lod.CUDAData(ctx.GetPlace());
int in_size = in->numel();
auto stream = ctx.cuda_device_context().stream();
int block = 512;
int grid = (in_size + block - 1) / block;
RoiTransformGradKernel<T><<<grid, block, 0, stream>>>(
lod_data, rois_data, batch_size, rois_num, in_height, in_width,
channels, transformed_height, transformed_width, spatial_scale,
out_grad_data, in_grad_data);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(roi_perspective_transform,
ops::CUDAROIPerspectiveTransformOpKernel<float>);
REGISTER_OP_CUDA_KERNEL(roi_perspective_transform_grad,
ops::CUDAROIPerspectiveTransformGradOpKernel<float>);
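The backward kernels reuse the forward interpolation weights: bilinear sampling is linear in the four neighboring pixel values, so the gradient of a sampled value with respect to input pixel (w, h) is simply that pixel's interpolation weight. Below is a small illustrative NumPy check of get_feature_gradient's case analysis for interior points (no edge clamping); grad_weight is a hypothetical stand-in for the device function:

import numpy as np

def grad_weight(xs, ys, w, h):
    # Same case analysis as get_feature_gradient, for interior points.
    xf, yf = int(np.floor(xs)), int(np.floor(ys))
    if w == xf and h == yf:
        return (w + 1 - xs) * (h + 1 - ys)
    if w == xf and h == yf + 1:
        return (w + 1 - xs) * (ys + 1 - h)
    if w == xf + 1 and h == yf:
        return (xs + 1 - w) * (h + 1 - ys)
    if w == xf + 1 and h == yf + 1:
        return (xs + 1 - w) * (ys + 1 - h)
    return 0.0

data = np.random.rand(5, 5)
xs, ys = 2.3, 1.7  # sample point; neighbors are w in {2, 3}, h in {1, 2}
total = 0.0
val = 0.0
for w in (2, 3):
    for h in (1, 2):
        wt = grad_weight(xs, ys, w, h)
        total += wt
        val += wt * data[h, w]
# The weights sum to 1, and weight * pixel summed over the neighbors
# reproduces the forward bilinear sample.
assert np.isclose(total, 1.0)
wx, wy = xs - 2, ys - 1
expected = ((1 - wx) * (1 - wy) * data[1, 2] + wx * (1 - wy) * data[1, 3] +
            (1 - wx) * wy * data[2, 2] + wx * wy * data[2, 3])
assert np.isclose(val, expected)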
@@ -39,6 +39,7 @@ __all__ = [
'detection_map',
'rpn_target_assign',
'anchor_generator',
'roi_perspective_transform',
'generate_proposal_labels',
'generate_proposals',
]
@@ -1262,6 +1263,54 @@ def anchor_generator(input,
return anchor, var
def roi_perspective_transform(input,
rois,
transformed_height,
transformed_width,
spatial_scale=1.0):
"""
ROI perspective transform op.
Args:
input (Variable): The input of ROIPerspectiveTransformOp. The format of
input tensor is NCHW. Where N is batch size, C is the
number of input channels, H is the height of the feature,
and W is the width of the feature.
rois (Variable): ROIs (Regions of Interest) to be transformed. It should be
a 2-D LoDTensor of shape (num_rois, 8). Given as
[[x1, y1, x2, y2, x3, y3, x4, y4], ...], (x1, y1) is the
top-left coordinate, (x2, y2) the top-right, (x3, y3) the
bottom-right, and (x4, y4) the bottom-left coordinate.
transformed_height (integer): The height of transformed output.
transformed_width (integer): The width of transformed output.
spatial_scale (float): Spatial scale factor to scale ROI coords. Default: 1.0
Returns:
Variable: The output of ROIPerspectiveTransformOp which is a 4-D tensor with shape
(num_rois, channels, transformed_h, transformed_w).
Examples:
.. code-block:: python
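# Shapes below are illustrative: a 256-channel feature map and
# quadrilateral ROIs fed as a 2-D LoDTensor of shape (num_rois, 8).
x = fluid.layers.data(name='x', shape=[256, 28, 28], dtype='float32')
rois = fluid.layers.data(name='rois', shape=[8], dtype='float32',
lod_level=1)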
out = fluid.layers.roi_perspective_transform(x, rois, 7, 7, 1.0)
"""
helper = LayerHelper('roi_perspective_transform', **locals())
dtype = helper.input_dtype()
out = helper.create_tmp_variable(dtype)
helper.append_op(
type="roi_perspective_transform",
inputs={"X": input,
"ROIs": rois},
outputs={"Out": out},
attrs={
"transformed_height": transformed_height,
"transformed_width": transformed_width,
"spatial_scale": spatial_scale
})
return out
def generate_proposal_labels(rpn_rois,
gt_classes,
is_crowd,
@@ -573,6 +573,16 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(out)
print(str(program))
def test_roi_perspective_transform(self):
program = Program()
with program_guard(program):
x = layers.data(name="x", shape=[256, 30, 30], dtype="float32")
rois = layers.data(
name="rois", shape=[8], dtype="float32", lod_level=1)
output = layers.roi_perspective_transform(x, rois, 7, 7, 0.6)
self.assertIsNotNone(output)
print(str(program))
def test_sequence_enumerate(self):
program = Program()
with program_guard(program):
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import math
import sys
import paddle.compat as cpt
from op_test import OpTest
from math import sqrt
from math import floor
def gt_e(a, b):
return a > b or abs(a - b) < 1e-4
def gt(a, b):
return (a - b) > 1e-4
def lt_e(a, b):
return a < b or abs(a - b) < 1e-4
def in_quad(x, y, roi_x, roi_y):
# check if (x, y) is in the boundary of roi
for i in range(4):
xs = roi_x[i]
ys = roi_y[i]
xe = roi_x[(i + 1) % 4]
ye = roi_y[(i + 1) % 4]
if abs(ys - ye) < 1e-4:
if abs(y - ys) < 1e-4 and abs(y - ye) < 1e-4 and gt_e(
x, min(xs, xe)) and lt_e(x, max(xs, xe)):
return True
else:
intersec_x = (y - ys) * (xe - xs) / (ye - ys) + xs
if abs(intersec_x - x) < 1e-4 and gt_e(y, min(ys, ye)) and lt_e(
y, max(ys, ye)):
return True
n_cross = 0
for i in range(4):
xs = roi_x[i]
ys = roi_y[i]
xe = roi_x[(i + 1) % 4]
ye = roi_y[(i + 1) % 4]
if abs(ys - ye) < 1e-4:
continue
if lt_e(y, min(ys, ye)) or gt(y, max(ys, ye)):
continue
intersec_x = (y - ys) * (xe - xs) / (ye - ys) + xs
if abs(intersec_x - x) < 1e-4:
return True
if gt(intersec_x, x):
n_cross += 1
return (n_cross % 2 == 1)
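# Sanity check of the crossing-number test above (illustrative): casting a
# ray toward +x from an interior point of the unit square crosses exactly
# one edge; from an exterior point it crosses zero or two.
assert in_quad(0.5, 0.5, [0, 1, 1, 0], [0, 0, 1, 1])
assert not in_quad(1.5, 0.5, [0, 1, 1, 0], [0, 0, 1, 1])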
def get_transform_matrix(transformed_width, transformed_height, roi_x, roi_y):
x0 = roi_x[0]
x1 = roi_x[1]
x2 = roi_x[2]
x3 = roi_x[3]
y0 = roi_y[0]
y1 = roi_y[1]
y2 = roi_y[2]
y3 = roi_y[3]
len1 = sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1))
len2 = sqrt((x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2))
len3 = sqrt((x2 - x3) * (x2 - x3) + (y2 - y3) * (y2 - y3))
len4 = sqrt((x3 - x0) * (x3 - x0) + (y3 - y0) * (y3 - y0))
estimated_height = (len2 + len4) / 2.0
estimated_width = (len1 + len3) / 2.0
normalized_height = transformed_height
normalized_width = round(estimated_width *
(normalized_height - 1) / estimated_height) + 1
normalized_width = min(normalized_width, transformed_width)
dx1 = x1 - x2
dx2 = x3 - x2
dx3 = x0 - x1 + x2 - x3
dy1 = y1 - y2
dy2 = y3 - y2
dy3 = y0 - y1 + y2 - y3
matrix = np.zeros([9])
matrix[6] = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) / (
normalized_width - 1)
matrix[7] = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) / (
normalized_height - 1)
matrix[8] = 1
matrix[3] = (y1 - y0 + matrix[6] *
(normalized_width - 1) * y1) / (normalized_width - 1)
matrix[4] = (y3 - y0 + matrix[7] *
(normalized_height - 1) * y3) / (normalized_height - 1)
matrix[5] = y0
matrix[0] = (x1 - x0 + matrix[6] *
(normalized_width - 1) * x1) / (normalized_width - 1)
matrix[1] = (x3 - x0 + matrix[7] *
(normalized_height - 1) * x3) / (normalized_height - 1)
matrix[2] = x0
return matrix
def get_source_coords(matrix, out_w, out_h):
u = matrix[0] * out_w + matrix[1] * out_h + matrix[2]
v = matrix[3] * out_w + matrix[4] * out_h + matrix[5]
w = matrix[6] * out_w + matrix[7] * out_h + matrix[8]
in_w = u / w
in_h = v / w
return in_w, in_h
def bilinear_interpolate(in_data, in_n, in_c, in_w, in_h):
batch_size = in_data.shape[0]
channels = in_data.shape[1]
height = in_data.shape[2]
width = in_data.shape[3]
if gt(-0.5, in_w) or gt(in_w, width - 0.5) or gt(-0.5, in_h) or gt(
in_h, height - 0.5):
return 0.0
if gt(0, in_w):
in_w = 0
if gt(0, in_h):
in_h = 0
in_w_floor = floor(in_w)
in_h_floor = floor(in_h)
if gt_e(in_w_floor, width - 1):
in_w_ceil = width - 1
in_w_floor = width - 1
in_w = in_w_floor
else:
in_w_ceil = in_w_floor + 1
if gt_e(in_h_floor, height - 1):
in_h_ceil = height - 1
in_h_floor = height - 1
in_h = in_h_floor
else:
in_h_ceil = in_h_floor + 1
w_floor = in_w - in_w_floor
h_floor = in_h - in_h_floor
w_ceil = 1 - w_floor
h_ceil = 1 - h_floor
v1 = in_data[in_n][in_c][int(in_h_floor)][int(in_w_floor)]
v2 = in_data[in_n][in_c][int(in_h_ceil)][int(in_w_floor)]
v3 = in_data[in_n][in_c][int(in_h_ceil)][int(in_w_ceil)]
v4 = in_data[in_n][in_c][int(in_h_floor)][int(in_w_ceil)]
w1 = w_ceil * h_ceil
w2 = w_ceil * h_floor
w3 = w_floor * h_floor
w4 = w_floor * h_ceil
val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4
return val
def lod_convert(lod):
ret = [0]
for count in lod:
ret.append(ret[-1] + count)
return ret
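# lod_convert turns per-image ROI counts into offsets, e.g. the LoD
# [[1, 2]] built by make_rois below becomes [0, 1, 3]: image 0 owns
# rois[0:1] and image 1 owns rois[1:3].
assert lod_convert([1, 2]) == [0, 1, 3]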
def roi_transform(in_data, rois, rois_lod, transformed_height,
transformed_width, spatial_scale):
channels = in_data.shape[1]
in_height = in_data.shape[2]
in_width = in_data.shape[3]
rois_num = rois.shape[0]
roi2image = [0] * rois_num
rois_lod = lod_convert(rois_lod[0])
for i in range(len(rois_lod) - 1):
for j in range(rois_lod[i], rois_lod[i + 1]):
roi2image[j] = i
out = np.zeros([rois_num, channels, transformed_height, transformed_width])
for n in range(rois_num):
roi_x = []
roi_y = []
for k in range(4):
roi_x.append(rois[n][2 * k] * spatial_scale)
roi_y.append(rois[n][2 * k + 1] * spatial_scale)
image_id = roi2image[n]
transform_matrix = get_transform_matrix(
transformed_width, transformed_height, roi_x, roi_y)
for c in range(channels):
for out_h in range(transformed_height):
for out_w in range(transformed_width):
in_w, in_h = get_source_coords(transform_matrix, out_w,
out_h)
if in_quad(in_w, in_h, roi_x, roi_y) and gt_e(
in_w, -0.5) and lt_e(in_w, in_width - 0.5) and gt_e(
in_h, -0.5) and lt_e(in_h, in_height - 0.5):
out[n][c][out_h][out_w] = bilinear_interpolate(
in_data, image_id, c, in_w, in_h)
else:
out[n][c][out_h][out_w] = 0.0
return out.astype("float32")
class TestROIPerspectiveTransformOp(OpTest):
def set_data(self):
self.init_test_case()
self.make_rois()
self.inputs = {'X': self.x, 'ROIs': (self.rois, self.rois_lod)}
self.attrs = {
'spatial_scale': self.spatial_scale,
'transformed_height': self.transformed_height,
'transformed_width': self.transformed_width
}
out = roi_transform(self.x, self.rois, self.rois_lod,
self.transformed_height, self.transformed_width,
self.spatial_scale)
self.outputs = {'Out': out}
def init_test_case(self):
self.batch_size = 2
self.channels = 2
self.height = 8
self.width = 8
# n, c, h, w
self.x_dim = (self.batch_size, self.channels, self.height, self.width)
self.spatial_scale = 1.0 / 2.0
self.transformed_height = 2
self.transformed_width = 3
self.x = np.random.random(self.x_dim).astype('float32')
def make_rois(self):
rois = []
self.rois_lod = [[]]
for bno in range(self.batch_size):
self.rois_lod[0].append(bno + 1)
for i in range(bno + 1):
x1 = np.random.randint(
0,
self.width // self.spatial_scale - self.transformed_width)
y1 = np.random.randint(
0,
self.height // self.spatial_scale - self.transformed_height)
x2 = np.random.randint(x1 + self.transformed_width,
self.width // self.spatial_scale)
y2 = np.random.randint(
0,
self.height // self.spatial_scale - self.transformed_height)
x3 = np.random.randint(x1 + self.transformed_width,
self.width // self.spatial_scale)
y3 = np.random.randint(y1 + self.transformed_height,
self.height // self.spatial_scale)
x4 = np.random.randint(
0,
self.width // self.spatial_scale - self.transformed_width)
y4 = np.random.randint(y1 + self.transformed_height,
self.height // self.spatial_scale)
roi = [x1, y1, x2, y2, x3, y3, x4, y4]
rois.append(roi)
self.rois_num = len(rois)
self.rois = np.array(rois).astype("float32")
def setUp(self):
self.op_type = "roi_perspective_transform"
self.set_data()
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out')
if __name__ == '__main__':
unittest.main()