未验证 提交 1773fbba 编写于 作者: D duanyanhui 提交者: GitHub

enhance grid_sampler to support 3d input (#45015)

* enhance grid_sampler to support 3d input
上级 1cb883da
......@@ -54,13 +54,19 @@ class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker {
void Make() override {
AddInput("X",
"(Tensor) The input data of GridSampleOp, "
"This is a 4-D tensor with shape of [N, C, H, W]");
"This is a 4-D tensor with shape of [N, C, H, W] or"
" a 5-D tensot with shape of [N, C, D, H, W]");
AddInput(
"Grid",
"(Tensor) The input grid of GridSampleOp generated by AffineGridOp, "
"This is a 4-D tensor with shape of [N, H, W, 2] is the concatenation "
"of x and y coordinates with shape [N, H, W] in last dimension");
AddOutput("Output", "(Tensor) Output tensor with shape [N, C, H, W]");
"of x and y coordinates with shape [N, H, W] in last dimension or "
"a 5-D tensor with shape of [N, D, H, W, 3] is the concatenation "
"of depth, x and y coordinates with shape [N, D, H, W] in last "
"dimension ");
AddOutput("Output",
"(Tensor) Output tensor with shape [N, C, H, W] or shape [N,C, "
"D, H ,W]");
AddAttr<bool>(
"use_cudnn",
"(bool, default true) Only used in cudnn kernel, need install cudnn")
......
......@@ -1288,19 +1288,31 @@ void GridSampleBaseInferMeta(const MetaTensor& x,
MetaConfig config) {
auto x_dims = x.dims();
auto grid_dims = grid.dims();
PADDLE_ENFORCE_EQ(x_dims.size(),
PADDLE_ENFORCE_GE(x_dims.size(),
4,
phi::errors::InvalidArgument(
"Input(X) of GridSampleOp should be 4-D Tensor, but "
"received X dimension size(%d)",
x_dims.size()));
PADDLE_ENFORCE_EQ(grid_dims.size(),
PADDLE_ENFORCE_LE(x_dims.size(),
5,
phi::errors::InvalidArgument(
"Input(X) of GridSampleOp should be 4-D Tensor, but "
"received X dimension size(%d)",
x_dims.size()));
PADDLE_ENFORCE_GE(grid_dims.size(),
4,
phi::errors::InvalidArgument(
"Input(Grid) of GridSampleOp should be 4-D Tensor, "
"but received X dimension size(%d)",
grid_dims.size()));
if (config.is_runtime || grid_dims[3] > 0) {
PADDLE_ENFORCE_LE(grid_dims.size(),
5,
phi::errors::InvalidArgument(
"Input(Grid) of GridSampleOp should be 4-D Tensor, "
"but received X dimension size(%d)",
grid_dims.size()));
if (grid_dims.size() == 4 && (config.is_runtime || grid_dims[3] > 0)) {
PADDLE_ENFORCE_EQ(
grid_dims[3],
2,
......@@ -1308,6 +1320,14 @@ void GridSampleBaseInferMeta(const MetaTensor& x,
"Input(Grid) dimension[3] should be 2, but received %d",
grid_dims[3]));
}
if (grid_dims.size() == 5 && (config.is_runtime || grid_dims[4] > 0)) {
PADDLE_ENFORCE_EQ(
grid_dims[4],
3,
phi::errors::InvalidArgument(
"Input(Grid) dimension[4] should be 3, but received %d",
grid_dims[4]));
}
if (config.is_runtime) {
PADDLE_ENFORCE_EQ(
grid_dims[0],
......@@ -1318,8 +1338,12 @@ void GridSampleBaseInferMeta(const MetaTensor& x,
x_dims[0],
grid_dims[0]));
}
out->set_dims({x_dims[0], x_dims[1], grid_dims[1], grid_dims[2]});
if (grid_dims.size() == 4) {
out->set_dims({x_dims[0], x_dims[1], grid_dims[1], grid_dims[2]});
} else {
out->set_dims(
{x_dims[0], x_dims[1], grid_dims[1], grid_dims[2], grid_dims[3]});
}
out->set_dtype(x.dtype());
out->share_lod(x);
}
......
......@@ -32,6 +32,23 @@ static __forceinline__ __device__ void AtomicAdd(
}
}
// Bounds-checked atomic accumulation into a 3-D (D, H, W) strided buffer.
// Out-of-range coordinates are silently ignored, which lets callers scatter
// gradients for sample points that fall outside the input volume.
template <typename T>
static __forceinline__ __device__ void AtomicAdd3D(T* data,
                                                   int64_t d,
                                                   int64_t h,
                                                   int64_t w,
                                                   int64_t sD,
                                                   int64_t sH,
                                                   int64_t sW,
                                                   int64_t D,
                                                   int64_t H,
                                                   int64_t W,
                                                   T delta) {
  // Guard clause: skip the add entirely when (d, h, w) is out of bounds.
  if (!InBounds3D(d, h, w, D, H, W)) {
    return;
  }
  const int64_t offset = d * sD + h * sH + w * sW;
  atomicAdd(data + offset, delta);
}
template <typename T>
static __forceinline__ __device__ T
UnnormalizeWithMask(T coord, int size, bool align_corners, T* grad_in) {
......@@ -249,6 +266,305 @@ __global__ void GridSamplerCudaBackwardKernel(const int nthreads,
}
}
// Backward kernel for 5-D (volumetric) grid_sample.
// Scatters gradients from grad_output (NCDHW) into grad_input (NCDHW) and,
// when grad_grid is non-null, writes per-point gradients for the
// [N, D, H, W, 3] sampling grid. Each loop iteration handles one output
// location (n, d, h, w) across all channels.
// NOTE(review): assumes all tensors are contiguous in the layouts above —
// strides are derived from the extents, not passed in.
template <typename T>
__global__ void GridSampler3DCudaBackwardKernel(const int nthreads,
                                                const T* grad_output,
                                                const T* input,
                                                const T* grid,
                                                int out_c,
                                                int out_d,
                                                int out_h,
                                                int out_w,
                                                int in_d,
                                                int in_h,
                                                int in_w,
                                                T* grad_input,
                                                T* grad_grid,
                                                const Mode mode,
                                                const PaddingMode padding_mode,
                                                bool align_corners) {
  // Element strides of the contiguous NCDHW input / grad_input tensors.
  int inp_sW = 1;
  int inp_sH = in_w;
  int inp_sD = in_h * in_w;
  int inp_sC = in_d * inp_sD;
  int inp_sN = out_c * inp_sC;
  // Element strides of the contiguous [N, D, H, W, 3] grid tensor.
  int grid_sCoor = 1;
  int grid_sW = 3;
  int grid_sH = out_w * grid_sW;
  int grid_sD = out_h * grid_sH;
  int grid_sN = out_d * grid_sD;
  // Element strides of the contiguous NCDHW grad_output tensor.
  int gOut_sW = 1;
  int gOut_sH = out_w;
  int gOut_sD = out_h * out_w;
  int gOut_sC = out_d * gOut_sD;
  int gOut_sN = out_c * gOut_sC;
  CUDA_KERNEL_LOOP_TYPE(index, nthreads, int) {
    // Decompose the linear index into (n, d, h, w) of the output volume.
    const int w = index % out_w;
    const int h = (index / out_w) % out_h;
    const int d = (index / (out_h * out_w)) % out_d;
    const int n = index / (out_d * out_h * out_w);
    const auto grid_offset =
        n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW;
    // get the corresponding input x, y, z co-ordinates from grid
    T ix = grid[grid_offset];
    T iy = grid[grid_offset + grid_sCoor];
    T iz = grid[grid_offset + 2 * grid_sCoor];
    // multipliers for gradients on ix, iy, and iz
    T gix_mult, giy_mult, giz_mult;
    // Map normalized coordinates to input pixel space, also capturing the
    // chain-rule factor of the unnormalize/padding transform per axis.
    ix = ComputePositionsWithMask(
        ix, in_w, padding_mode, align_corners, &gix_mult);
    iy = ComputePositionsWithMask(
        iy, in_h, padding_mode, align_corners, &giy_mult);
    iz = ComputePositionsWithMask(
        iz, in_d, padding_mode, align_corners, &giz_mult);
    if (mode == Mode::bilinear) {
      // get corner pixel values from (x, y, z)
      // for 4d, we used north-east-south-west
      // for 5d, we add top-bottom
      // Naming: t/b = top/bottom (depth), n/s = north/south (height),
      // w/e = west/east (width). tnw is the floor corner of the cube.
      int ix_tnw = static_cast<int>(std::floor(ix));
      int iy_tnw = static_cast<int>(std::floor(iy));
      int iz_tnw = static_cast<int>(std::floor(iz));
      int ix_tne = ix_tnw + 1;
      int iy_tne = iy_tnw;
      int iz_tne = iz_tnw;
      int ix_tsw = ix_tnw;
      int iy_tsw = iy_tnw + 1;
      int iz_tsw = iz_tnw;
      int ix_tse = ix_tnw + 1;
      int iy_tse = iy_tnw + 1;
      int iz_tse = iz_tnw;
      int ix_bnw = ix_tnw;
      int iy_bnw = iy_tnw;
      int iz_bnw = iz_tnw + 1;
      int ix_bne = ix_tnw + 1;
      int iy_bne = iy_tnw;
      int iz_bne = iz_tnw + 1;
      int ix_bsw = ix_tnw;
      int iy_bsw = iy_tnw + 1;
      int iz_bsw = iz_tnw + 1;
      int ix_bse = ix_tnw + 1;
      int iy_bse = iy_tnw + 1;
      int iz_bse = iz_tnw + 1;
      // get surfaces to each neighbor:
      // Trilinear weight of each corner = volume of the sub-box opposite it.
      T tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz);
      T tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz);
      T tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz);
      T tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz);
      T bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse);
      T bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw);
      T bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne);
      T bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw);
      // Accumulators for the gradient w.r.t. the (unnormalized) grid coords.
      T gix = static_cast<T>(0), giy = static_cast<T>(0),
        giz = static_cast<T>(0);
      int gOut_offset = n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW;
      int inp_offset_NC = n * inp_sN;
      T* gInp_ptr_NC = grad_input + n * inp_sN;
      // Walk every channel, scattering the weighted output gradient into the
      // eight corner cells of grad_input (AtomicAdd3D drops out-of-bounds).
      for (int c = 0; c < out_c; ++c,
               gOut_offset += gOut_sC,
               gInp_ptr_NC += inp_sC,
               inp_offset_NC += inp_sC) {
        T gOut = grad_output[gOut_offset];
        AtomicAdd3D(gInp_ptr_NC, iz_tnw, iy_tnw, ix_tnw, inp_sD, inp_sH,
                    inp_sW, in_d, in_h, in_w, tnw * gOut);
        AtomicAdd3D(gInp_ptr_NC, iz_tne, iy_tne, ix_tne, inp_sD, inp_sH,
                    inp_sW, in_d, in_h, in_w, tne * gOut);
        AtomicAdd3D(gInp_ptr_NC, iz_tsw, iy_tsw, ix_tsw, inp_sD, inp_sH,
                    inp_sW, in_d, in_h, in_w, tsw * gOut);
        AtomicAdd3D(gInp_ptr_NC, iz_tse, iy_tse, ix_tse, inp_sD, inp_sH,
                    inp_sW, in_d, in_h, in_w, tse * gOut);
        AtomicAdd3D(gInp_ptr_NC, iz_bnw, iy_bnw, ix_bnw, inp_sD, inp_sH,
                    inp_sW, in_d, in_h, in_w, bnw * gOut);
        AtomicAdd3D(gInp_ptr_NC, iz_bne, iy_bne, ix_bne, inp_sD, inp_sH,
                    inp_sW, in_d, in_h, in_w, bne * gOut);
        AtomicAdd3D(gInp_ptr_NC, iz_bsw, iy_bsw, ix_bsw, inp_sD, inp_sH,
                    inp_sW, in_d, in_h, in_w, bsw * gOut);
        AtomicAdd3D(gInp_ptr_NC, iz_bse, iy_bse, ix_bse, inp_sD, inp_sH,
                    inp_sW, in_d, in_h, in_w, bse * gOut);
        // calculate grad_grid
        // d(output)/d(ix|iy|iz) contribution from each in-bounds corner:
        // derivative of the trilinear weight times the corner's input value.
        if (InBounds3D(iz_tnw, iy_tnw, ix_tnw, in_d, in_h, in_w)) {
          T tnw_val = input[inp_offset_NC + iz_tnw * inp_sD + iy_tnw * inp_sH +
                            ix_tnw * inp_sW];
          gix -= tnw_val * (iy_bse - iy) * (iz_bse - iz) * gOut;
          giy -= tnw_val * (ix_bse - ix) * (iz_bse - iz) * gOut;
          giz -= tnw_val * (ix_bse - ix) * (iy_bse - iy) * gOut;
        }
        if (InBounds3D(iz_tne, iy_tne, ix_tne, in_d, in_h, in_w)) {
          T tne_val = input[inp_offset_NC + iz_tne * inp_sD + iy_tne * inp_sH +
                            ix_tne * inp_sW];
          gix += tne_val * (iy_bsw - iy) * (iz_bsw - iz) * gOut;
          giy -= tne_val * (ix - ix_bsw) * (iz_bsw - iz) * gOut;
          giz -= tne_val * (ix - ix_bsw) * (iy_bsw - iy) * gOut;
        }
        if (InBounds3D(iz_tsw, iy_tsw, ix_tsw, in_d, in_h, in_w)) {
          T tsw_val = input[inp_offset_NC + iz_tsw * inp_sD + iy_tsw * inp_sH +
                            ix_tsw * inp_sW];
          gix -= tsw_val * (iy - iy_bne) * (iz_bne - iz) * gOut;
          giy += tsw_val * (ix_bne - ix) * (iz_bne - iz) * gOut;
          giz -= tsw_val * (ix_bne - ix) * (iy - iy_bne) * gOut;
        }
        if (InBounds3D(iz_tse, iy_tse, ix_tse, in_d, in_h, in_w)) {
          T tse_val = input[inp_offset_NC + iz_tse * inp_sD + iy_tse * inp_sH +
                            ix_tse * inp_sW];
          gix += tse_val * (iy - iy_bnw) * (iz_bnw - iz) * gOut;
          giy += tse_val * (ix - ix_bnw) * (iz_bnw - iz) * gOut;
          giz -= tse_val * (ix - ix_bnw) * (iy - iy_bnw) * gOut;
        }
        if (InBounds3D(iz_bnw, iy_bnw, ix_bnw, in_d, in_h, in_w)) {
          T bnw_val = input[inp_offset_NC + iz_bnw * inp_sD + iy_bnw * inp_sH +
                            ix_bnw * inp_sW];
          gix -= bnw_val * (iy_tse - iy) * (iz - iz_tse) * gOut;
          giy -= bnw_val * (ix_tse - ix) * (iz - iz_tse) * gOut;
          giz += bnw_val * (ix_tse - ix) * (iy_tse - iy) * gOut;
        }
        if (InBounds3D(iz_bne, iy_bne, ix_bne, in_d, in_h, in_w)) {
          T bne_val = input[inp_offset_NC + iz_bne * inp_sD + iy_bne * inp_sH +
                            ix_bne * inp_sW];
          gix += bne_val * (iy_tsw - iy) * (iz - iz_tsw) * gOut;
          giy -= bne_val * (ix - ix_tsw) * (iz - iz_tsw) * gOut;
          giz += bne_val * (ix - ix_tsw) * (iy_tsw - iy) * gOut;
        }
        if (InBounds3D(iz_bsw, iy_bsw, ix_bsw, in_d, in_h, in_w)) {
          T bsw_val = input[inp_offset_NC + iz_bsw * inp_sD + iy_bsw * inp_sH +
                            ix_bsw * inp_sW];
          gix -= bsw_val * (iy - iy_tne) * (iz - iz_tne) * gOut;
          giy += bsw_val * (ix_tne - ix) * (iz - iz_tne) * gOut;
          giz += bsw_val * (ix_tne - ix) * (iy - iy_tne) * gOut;
        }
        if (InBounds3D(iz_bse, iy_bse, ix_bse, in_d, in_h, in_w)) {
          T bse_val = input[inp_offset_NC + iz_bse * inp_sD + iy_bse * inp_sH +
                            ix_bse * inp_sW];
          gix += bse_val * (iy - iy_tnw) * (iz - iz_tnw) * gOut;
          giy += bse_val * (ix - ix_tnw) * (iz - iz_tnw) * gOut;
          giz += bse_val * (ix - ix_tnw) * (iy - iy_tnw) * gOut;
        }
      }
      if (grad_grid != nullptr) {
        // grid_sW == 3, so index * grid_sW addresses this point's (x, y, z)
        // slot; multiply by the unnormalize chain-rule factors.
        T* gGrid_ptr_NDHW = grad_grid + index * grid_sW;
        gGrid_ptr_NDHW[0] = gix_mult * gix;
        gGrid_ptr_NDHW[1] = giy_mult * giy;
        gGrid_ptr_NDHW[2] = giz_mult * giz;
      }
    } else if (mode == Mode::nearest) {
      auto ix_nearest = static_cast<int>(std::round(ix));
      auto iy_nearest = static_cast<int>(std::round(iy));
      auto iz_nearest = static_cast<int>(std::round(iz));
      // assign nearest neighor pixel value to output pixel
      int gOut_offset = n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW;
      T* gInp_ptr_NC = grad_input + n * inp_sN;
      for (int c = 0; c < out_c;
           ++c, gOut_offset += gOut_sC, gInp_ptr_NC += inp_sC) {
        AtomicAdd3D(gInp_ptr_NC,
                    iz_nearest,
                    iy_nearest,
                    ix_nearest,
                    inp_sD,
                    inp_sH,
                    inp_sW,
                    in_d,
                    in_h,
                    in_w,
                    grad_output[gOut_offset]);
      }
      if (grad_grid != nullptr) {
        // Nearest sampling is piecewise-constant in the coordinates, so the
        // grid gradient is zero (almost everywhere).
        T* gGrid_ptr_NDHW = grad_grid + index * grid_sW;
        gGrid_ptr_NDHW[0] = static_cast<T>(0);
        gGrid_ptr_NDHW[1] = static_cast<T>(0);
        gGrid_ptr_NDHW[2] = static_cast<T>(0);
      }
    }
  }
}
template <typename T, typename Context>
void GridSampleGradKernel(const Context& dev_ctx,
const DenseTensor& x,
......@@ -275,42 +591,84 @@ void GridSampleGradKernel(const Context& dev_ctx,
enum_mode = Mode::bilinear;
}
const int n = grid.dims()[0];
const int out_h = grid.dims()[1];
const int out_w = grid.dims()[2];
const int c = x.dims()[1];
const int in_h = x.dims()[2];
const int in_w = x.dims()[3];
if (x.dims().size() == 4) {
const int n = grid.dims()[0];
const int out_h = grid.dims()[1];
const int out_w = grid.dims()[2];
const int c = x.dims()[1];
const int in_h = x.dims()[2];
const int in_w = x.dims()[3];
dev_ctx.template Alloc<T>(x_grad);
phi::funcs::SetConstant<Context, T>()(dev_ctx, x_grad, static_cast<T>(0));
dev_ctx.template Alloc<T>(x_grad);
phi::funcs::SetConstant<Context, T>()(dev_ctx, x_grad, static_cast<T>(0));
T* grid_grad_data = nullptr;
if (grid_grad != nullptr) {
grid_grad_data = dev_ctx.template Alloc<T>(grid_grad);
}
T* grid_grad_data = nullptr;
if (grid_grad != nullptr) {
grid_grad_data = dev_ctx.template Alloc<T>(grid_grad);
}
int count = static_cast<int>(n * out_h * out_w);
auto cu_stream = dev_ctx.stream();
backends::gpu::GpuLaunchConfig config =
backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count);
GridSamplerCudaBackwardKernel<T>
<<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
count,
out_grad.data<T>(),
x.data<T>(),
grid.data<T>(),
n,
c,
out_h,
out_w,
in_h,
in_w,
x_grad->data<T>(),
grid_grad_data,
enum_mode,
enum_padding_mode,
align_corners);
} else {
const int out_d = grid.dims()[1];
const int out_h = grid.dims()[2];
const int out_w = grid.dims()[3];
const int n = x.dims()[0];
const int c = x.dims()[1];
const int in_d = x.dims()[2];
const int in_h = x.dims()[3];
const int in_w = x.dims()[4];
int count = static_cast<int>(n * out_h * out_w);
auto cu_stream = dev_ctx.stream();
backends::gpu::GpuLaunchConfig config =
backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count);
GridSamplerCudaBackwardKernel<T>
<<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
count,
out_grad.data<T>(),
x.data<T>(),
grid.data<T>(),
n,
c,
out_h,
out_w,
in_h,
in_w,
x_grad->data<T>(),
grid_grad_data,
enum_mode,
enum_padding_mode,
align_corners);
dev_ctx.template Alloc<T>(x_grad);
phi::funcs::SetConstant<Context, T>()(dev_ctx, x_grad, static_cast<T>(0));
T* grid_grad_data = nullptr;
if (grid_grad != nullptr) {
grid_grad_data = dev_ctx.template Alloc<T>(grid_grad);
}
int count = static_cast<int>(n * out_d * out_h * out_w);
auto cu_stream = dev_ctx.stream();
backends::gpu::GpuLaunchConfig config =
backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count);
GridSampler3DCudaBackwardKernel<T>
<<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
count,
out_grad.data<T>(),
x.data<T>(),
grid.data<T>(),
c,
out_d,
out_h,
out_w,
in_d,
in_h,
in_w,
x_grad->data<T>(),
grid_grad_data,
enum_mode,
enum_padding_mode,
align_corners);
}
}
} // namespace phi
......
......@@ -169,6 +169,169 @@ __global__ void GridSampleCudaKernel(const int nthreads,
}
}
// Forward kernel for 5-D (volumetric) grid_sample.
// For every output location (n, d, h, w) it reads a normalized (x, y, z)
// coordinate from the [N, D, H, W, 3] grid and samples the NCDHW input,
// either trilinearly (Mode::bilinear) or by nearest neighbor. Out-of-bounds
// corners contribute zero.
// NOTE(review): assumes all tensors are contiguous in the layouts above —
// strides are derived from the extents, not passed in.
template <typename T>
__global__ void GridSample3DCudaKernel(const int nthreads,
                                       int out_c,
                                       int out_d,
                                       int out_h,
                                       int out_w,
                                       int in_d,
                                       int in_h,
                                       int in_w,
                                       const T* input,
                                       const T* grid,
                                       T* output,
                                       const Mode interpolation_mode,
                                       const PaddingMode padding_mode,
                                       bool align_corners) {
  // Element strides of the contiguous NCDHW input tensor.
  int inp_sW = 1;
  int inp_sH = in_w;
  int inp_sD = in_h * in_w;
  int inp_sC = in_d * inp_sD;
  int inp_sN = out_c * inp_sC;
  // Element strides of the contiguous [N, D, H, W, 3] grid tensor.
  int grid_sCoor = 1;
  int grid_sW = 3;
  int grid_sH = out_w * grid_sW;
  int grid_sD = out_h * grid_sH;
  int grid_sN = out_d * grid_sD;
  // Element strides of the contiguous NCDHW output tensor.
  int out_sW = 1;
  int out_sH = out_w;
  int out_sD = out_h * out_w;
  int out_sC = out_d * out_sD;
  int out_sN = out_c * out_sC;
  CUDA_KERNEL_LOOP_TYPE(index, nthreads, int) {
    // Decompose the linear index into (n, d, h, w) of the output volume.
    const int w = index % out_w;
    const int h = (index / out_w) % out_h;
    const int d = (index / (out_h * out_w)) % out_d;
    const int n = index / (out_d * out_h * out_w);
    const int grid_offset =
        n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW;
    // get the corresponding input x, y, z co-ordinates from grid
    T ix = grid[grid_offset];
    T iy = grid[grid_offset + grid_sCoor];
    T iz = grid[grid_offset + 2 * grid_sCoor];
    // Map normalized [-1, 1] coordinates into input pixel space, applying
    // the configured padding mode.
    ix = ComputePositions(ix, in_w, padding_mode, align_corners);
    iy = ComputePositions(iy, in_h, padding_mode, align_corners);
    iz = ComputePositions(iz, in_d, padding_mode, align_corners);
    if (interpolation_mode == Mode::bilinear) {
      // get corner pixel values from (x, y, z)
      // for 4d, we used north-east-south-west
      // for 5d, we add top-bottom
      // Naming: t/b = top/bottom (depth), n/s = north/south (height),
      // w/e = west/east (width). tnw is the floor corner of the cube.
      int ix_tnw = static_cast<int>(std::floor(ix));
      int iy_tnw = static_cast<int>(std::floor(iy));
      int iz_tnw = static_cast<int>(std::floor(iz));
      int ix_tne = ix_tnw + 1;
      int iy_tne = iy_tnw;
      int iz_tne = iz_tnw;
      int ix_tsw = ix_tnw;
      int iy_tsw = iy_tnw + 1;
      int iz_tsw = iz_tnw;
      int ix_tse = ix_tnw + 1;
      int iy_tse = iy_tnw + 1;
      int iz_tse = iz_tnw;
      int ix_bnw = ix_tnw;
      int iy_bnw = iy_tnw;
      int iz_bnw = iz_tnw + 1;
      int ix_bne = ix_tnw + 1;
      int iy_bne = iy_tnw;
      int iz_bne = iz_tnw + 1;
      int ix_bsw = ix_tnw;
      int iy_bsw = iy_tnw + 1;
      int iz_bsw = iz_tnw + 1;
      int ix_bse = ix_tnw + 1;
      int iy_bse = iy_tnw + 1;
      int iz_bse = iz_tnw + 1;
      // get surfaces to each neighbor:
      // Trilinear weight of each corner = volume of the sub-box opposite it.
      T tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz);
      T tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz);
      T tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz);
      T tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz);
      T bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse);
      T bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw);
      T bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne);
      T bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw);
      auto inp_ptr_NC = input + n * inp_sN;
      auto out_ptr_NCDHW =
          output + n * out_sN + d * out_sD + h * out_sH + w * out_sW;
      // Accumulate the weighted sum of all in-bounds corners per channel;
      // corners outside the input contribute zero.
      for (int c = 0; c < out_c;
           ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) {
        *out_ptr_NCDHW = static_cast<T>(0);
        if (InBounds3D(iz_tnw, iy_tnw, ix_tnw, in_d, in_h, in_w)) {
          *out_ptr_NCDHW +=
              inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW] *
              tnw;
        }
        if (InBounds3D(iz_tne, iy_tne, ix_tne, in_d, in_h, in_w)) {
          *out_ptr_NCDHW +=
              inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW] *
              tne;
        }
        if (InBounds3D(iz_tsw, iy_tsw, ix_tsw, in_d, in_h, in_w)) {
          *out_ptr_NCDHW +=
              inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW] *
              tsw;
        }
        if (InBounds3D(iz_tse, iy_tse, ix_tse, in_d, in_h, in_w)) {
          *out_ptr_NCDHW +=
              inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW] *
              tse;
        }
        if (InBounds3D(iz_bnw, iy_bnw, ix_bnw, in_d, in_h, in_w)) {
          *out_ptr_NCDHW +=
              inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW] *
              bnw;
        }
        if (InBounds3D(iz_bne, iy_bne, ix_bne, in_d, in_h, in_w)) {
          *out_ptr_NCDHW +=
              inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW] *
              bne;
        }
        if (InBounds3D(iz_bsw, iy_bsw, ix_bsw, in_d, in_h, in_w)) {
          *out_ptr_NCDHW +=
              inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW] *
              bsw;
        }
        if (InBounds3D(iz_bse, iy_bse, ix_bse, in_d, in_h, in_w)) {
          *out_ptr_NCDHW +=
              inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW] *
              bse;
        }
      }
    } else if (interpolation_mode == Mode::nearest) {
      int ix_nearest = static_cast<int>(std::round(ix));
      int iy_nearest = static_cast<int>(std::round(iy));
      int iz_nearest = static_cast<int>(std::round(iz));
      // assign nearest neighor pixel value to output pixel
      auto inp_ptr_NC = input + n * inp_sN;
      auto out_ptr_NCDHW =
          output + n * out_sN + d * out_sD + h * out_sH + w * out_sW;
      for (int c = 0; c < out_c;
           ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) {
        if (InBounds3D(iz_nearest, iy_nearest, ix_nearest, in_d, in_h, in_w)) {
          *out_ptr_NCDHW =
              inp_ptr_NC[iz_nearest * inp_sD + iy_nearest * inp_sH +
                         ix_nearest * inp_sW];
        } else {
          // Nearest point fell outside the input volume: emit zero.
          *out_ptr_NCDHW = static_cast<T>(0);
        }
      }
    }
  }
}
template <typename T, typename Context>
void GridSampleKernel(const Context& dev_ctx,
const DenseTensor& x,
......@@ -193,38 +356,78 @@ void GridSampleKernel(const Context& dev_ctx,
enum_mode = Mode::bilinear;
}
const int n = grid.dims()[0];
const int out_h = grid.dims()[1];
const int out_w = grid.dims()[2];
const int c = x.dims()[1];
const int in_h = x.dims()[2];
const int in_w = x.dims()[3];
VLOG(3) << "n: " << n << "; c: " << c << "; out_h: " << out_h
<< "; out_w: " << out_w;
auto* output_data = dev_ctx.template Alloc<T>(out);
VLOG(3) << "out dims: " << out->dims()[0] << "; " << out->dims()[1] << "; "
<< out->dims()[2] << "; " << out->dims()[3];
int count = static_cast<int>(n * out_h * out_w);
auto cu_stream = dev_ctx.stream();
backends::gpu::GpuLaunchConfig config =
backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count);
GridSampleCudaKernel<T>
<<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
count,
n,
c,
out_h,
out_w,
in_h,
in_w,
x.data<T>(),
grid.data<T>(),
output_data,
enum_mode,
enum_padding_mode,
align_corners);
if (x.dims().size() == 4) {
const int n = grid.dims()[0];
const int out_h = grid.dims()[1];
const int out_w = grid.dims()[2];
const int c = x.dims()[1];
const int in_h = x.dims()[2];
const int in_w = x.dims()[3];
VLOG(3) << "n: " << n << "; c: " << c << "; out_h: " << out_h
<< "; out_w: " << out_w;
auto* output_data = dev_ctx.template Alloc<T>(out);
VLOG(3) << "out dims: " << out->dims()[0] << "; " << out->dims()[1] << "; "
<< out->dims()[2] << "; " << out->dims()[3];
int count = static_cast<int>(n * out_h * out_w);
auto cu_stream = dev_ctx.stream();
backends::gpu::GpuLaunchConfig config =
backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count);
GridSampleCudaKernel<T>
<<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
count,
n,
c,
out_h,
out_w,
in_h,
in_w,
x.data<T>(),
grid.data<T>(),
output_data,
enum_mode,
enum_padding_mode,
align_corners);
} else {
const int n = grid.dims()[0];
const int out_d = grid.dims()[1];
const int out_h = grid.dims()[2];
const int out_w = grid.dims()[3];
const int c = x.dims()[1];
const int in_d = x.dims()[2];
const int in_h = x.dims()[3];
const int in_w = x.dims()[4];
VLOG(3) << "n: " << n << "; c: " << c << "; out_d: " << out_d
<< "; out_h: " << out_h << "; out_w: " << out_w;
auto* output_data = dev_ctx.template Alloc<T>(out);
VLOG(3) << "out dims: " << out->dims()[0] << "; " << out->dims()[1] << "; "
<< out->dims()[2] << "; " << out->dims()[3] << "; "
<< out->dims()[4];
int count = static_cast<int>(n * out_d * out_h * out_w);
auto cu_stream = dev_ctx.stream();
backends::gpu::GpuLaunchConfig config =
backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count);
GridSample3DCudaKernel<T>
<<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
count,
c,
out_d,
out_h,
out_w,
in_d,
in_h,
in_w,
x.data<T>(),
grid.data<T>(),
output_data,
enum_mode,
enum_padding_mode,
align_corners);
}
}
} // namespace phi
......
......@@ -27,4 +27,9 @@ static __forceinline__ __device__ bool InBounds(int h, int w, int H, int W) {
return h >= 0 && h < H && w >= 0 && w < W;
}
// True when (d, h, w) indexes a valid cell of a D x H x W volume.
static __forceinline__ __device__ bool InBounds3D(
    int d, int h, int w, int D, int H, int W) {
  const bool depth_ok = (d >= 0) && (d < D);
  const bool height_ok = (h >= 0) && (h < H);
  const bool width_ok = (w >= 0) && (w < W);
  return depth_ok && height_ok && width_ok;
}
} // namespace phi
......@@ -20,6 +20,15 @@ from op_test import OpTest, skip_check_grad_ci
paddle.enable_static()
from white_list import (
op_accuracy_white_list,
check_shape_white_list,
compile_vs_runtime_white_list,
no_check_set_white_list,
op_threshold_white_list,
no_grad_set_white_list,
)
def AffineGrid(theta, grid_shape):
n = grid_shape[0]
......@@ -64,6 +73,68 @@ def getGridPointValue(data, x, y):
return out
def AffineGrid3D(theta, grid_shape):
    # Build a normalized homogeneous 3-D sampling grid and transform it by
    # the per-sample affine matrices `theta` (shape [n, 3, 4]).
    # Returns an [n, d, h, w, 3] float64 grid of (x, y, z) coordinates.
    n = grid_shape[0]
    d = grid_shape[1]
    h = grid_shape[2]
    w = grid_shape[3]
    # Normalized coordinates in [-1, 1] along each spatial axis, broadcast
    # to the full (d, h, w) volume with a trailing singleton channel.
    d_idx = np.broadcast_to(
        np.linspace(-1, 1, d)[:, np.newaxis, np.newaxis],
        (d, h, w))[:, :, :, np.newaxis]
    h_idx = np.broadcast_to(
        np.linspace(-1, 1, h)[np.newaxis, :, np.newaxis],
        (d, h, w))[:, :, :, np.newaxis]
    w_idx = np.broadcast_to(
        np.linspace(-1, 1, w)[np.newaxis, np.newaxis, :],
        (d, h, w))[:, :, :, np.newaxis]
    # Homogeneous coordinate order is (x, y, z, 1) -> (w, h, d, 1).
    grid = np.concatenate(
        [w_idx, h_idx, d_idx, np.ones([d, h, w, 1])], axis=3)  # d * h * w * 4
    grid = np.repeat(grid[np.newaxis, :], n, axis=0)  # n * d * h * w * 4
    ret = np.zeros([n, d * h * w, 3])
    # Apply theta per sample: (d*h*w, 4) @ (4, 3) -> (d*h*w, 3).
    theta = theta.transpose([0, 2, 1])
    for i in range(len(theta)):
        ret[i] = np.dot(grid[i].reshape([d * h * w, 4]), theta[i])
    return ret.reshape([n, d, h, w, 3]).astype("float64")
def getGridPointValue3D(data, x, y, z):
    # Gather data[n, :, z, y, x] for every output location, returning an
    # [N, C, out_D, out_H, out_W] float64 array. Points whose (x, y, z)
    # index falls outside the input volume yield 0 for all channels.
    #
    # data: [N, C, in_D, in_H, in_W]; x, y, z: integer index arrays of
    # shape [N, out_D, out_H, out_W].
    #
    # Vectorized with fancy indexing instead of the original quadruple
    # Python loop (same results, much faster on the test sizes).
    data_shape = data.shape
    N = data_shape[0]
    in_D = data_shape[2]
    in_H = data_shape[3]
    in_W = data_shape[4]
    # Mark samples that the original loop would zero out.
    out_of_bounds = ((x < 0) | (x > in_W - 1)
                     | (y < 0) | (y > in_H - 1)
                     | (z < 0) | (z > in_D - 1))
    # Clamp so the gather itself never indexes out of range; clamped
    # values are overwritten with 0 below.
    x_safe = np.clip(x, 0, in_W - 1).astype('int64')
    y_safe = np.clip(y, 0, in_H - 1).astype('int64')
    z_safe = np.clip(z, 0, in_D - 1).astype('int64')
    n_idx = np.arange(N).reshape(N, 1, 1, 1)
    # Advanced indices separated by ':' put the broadcast dims first:
    # result is (N, out_D, out_H, out_W, C); move C back to axis 1.
    gathered = data[n_idx, :, z_safe, y_safe, x_safe]
    out = np.moveaxis(gathered, -1, 1).astype('float64')
    return np.where(out_of_bounds[:, np.newaxis], 0.0, out)
def clip(x, min_n, max_n):
    # Elementwise clamp of x into [min_n, max_n]: cap from above first,
    # then floor from below (so min_n wins if the bounds are inverted).
    capped = np.minimum(x, max_n)
    return np.maximum(capped, min_n)
......@@ -138,6 +209,80 @@ def GridSampler(data,
return out
def GridSampler3D(data,
                  grid,
                  align_corners=True,
                  mode="bilinear",
                  padding_mode="zeros"):
    # NumPy reference implementation of 5-D grid_sample used to generate
    # expected outputs for the operator tests.
    #
    # data: [N, C, in_D, in_H, in_W] input volume.
    # grid: [N, out_D, out_H, out_W, 3] normalized (x, y, z) coordinates.
    # Returns: [N, C, out_D, out_H, out_W] float64 samples.
    # Relies on the sibling helpers unnormalizeAndClip / getGridPointValue3D.
    dims = data.shape
    N = dims[0]
    in_C = dims[1]
    in_D = dims[2]
    in_H = dims[3]
    in_W = dims[4]
    out_D = grid.shape[1]
    out_H = grid.shape[2]
    out_W = grid.shape[3]
    # Split the last grid axis into per-axis coordinate volumes.
    x = grid[:, :, :, :, 0]
    y = grid[:, :, :, :, 1]
    z = grid[:, :, :, :, 2]
    z_max = in_D - 1
    y_max = in_H - 1
    x_max = in_W - 1
    # Map normalized [-1, 1] coordinates to pixel space, applying the
    # padding mode (zeros / border / reflection).
    x = unnormalizeAndClip(x, x_max, align_corners, padding_mode)
    y = unnormalizeAndClip(y, y_max, align_corners, padding_mode)
    z = unnormalizeAndClip(z, z_max, align_corners, padding_mode)
    if mode == "bilinear":
        # Floor/ceil corner indices of the surrounding unit cube.
        x0 = np.floor(x).astype('int32')
        x1 = x0 + 1
        y0 = np.floor(y).astype('int32')
        y1 = y0 + 1
        z0 = np.floor(z).astype('int32')
        z1 = z0 + 1
        # Trilinear weight of each corner = volume of the opposite sub-box,
        # tiled across the channel axis.
        w_tnw = np.tile(((x1 - x) * (y1 - y) * (z1 - z)).reshape(
            (N, 1, out_D, out_H, out_W)), (1, in_C, 1, 1, 1))
        w_tne = np.tile(((x - x0) * (y1 - y) * (z1 - z)).reshape(
            (N, 1, out_D, out_H, out_W)), (1, in_C, 1, 1, 1))
        w_tsw = np.tile(((x1 - x) * (y - y0) * (z1 - z)).reshape(
            (N, 1, out_D, out_H, out_W)), (1, in_C, 1, 1, 1))
        w_tse = np.tile(((x - x0) * (y - y0) * (z1 - z)).reshape(
            (N, 1, out_D, out_H, out_W)), (1, in_C, 1, 1, 1))
        w_bnw = np.tile(((x1 - x) * (y1 - y) * (z - z0)).reshape(
            (N, 1, out_D, out_H, out_W)), (1, in_C, 1, 1, 1))
        w_bne = np.tile(((x - x0) * (y1 - y) * (z - z0)).reshape(
            (N, 1, out_D, out_H, out_W)), (1, in_C, 1, 1, 1))
        w_bsw = np.tile(((x1 - x) * (y - y0) * (z - z0)).reshape(
            (N, 1, out_D, out_H, out_W)), (1, in_C, 1, 1, 1))
        w_bse = np.tile(((x - x0) * (y - y0) * (z - z0)).reshape(
            (N, 1, out_D, out_H, out_W)), (1, in_C, 1, 1, 1))
        # Gather the eight corner values (out-of-bounds corners read as 0).
        v_tnw = getGridPointValue3D(data, x0, y0, z0)
        v_tne = getGridPointValue3D(data, x1, y0, z0)
        v_tsw = getGridPointValue3D(data, x0, y1, z0)
        v_tse = getGridPointValue3D(data, x1, y1, z0)
        v_bnw = getGridPointValue3D(data, x0, y0, z1)
        v_bne = getGridPointValue3D(data, x1, y0, z1)
        v_bsw = getGridPointValue3D(data, x0, y1, z1)
        v_bse = getGridPointValue3D(data, x1, y1, z1)
        # Weighted sum over the eight corners.
        out = (w_tnw * v_tnw + w_tne * v_tne + w_tsw * v_tsw + w_tse * v_tse +
               w_bnw * v_bnw + w_bne * v_bne + w_bsw * v_bsw +
               w_bse * v_bse).astype('float64')
    elif mode == "nearest":
        # Round to the nearest cell and gather directly.
        x = np.round(x).astype('int32')
        y = np.round(y).astype('int32')
        z = np.round(z).astype('int32')
        out = getGridPointValue3D(data, x, y, z)
    return out
class TestGridSamplerOp(OpTest):
def setUp(self):
......@@ -150,36 +295,90 @@ class TestGridSamplerOp(OpTest):
self.mode = "bilinear"
self.initTestCase()
x = np.random.randint(0, 255, self.x_shape).astype('float64')
theta = np.zeros(self.theta_shape).astype('float64')
for i in range(self.theta_shape[0]):
for j in range(2):
for k in range(3):
theta[i, j, k] = np.random.rand(1)[0]
grid = AffineGrid(theta, self.grid_shape)
self.inputs = {'X': x, 'Grid': grid}
self.attrs = {
'use_cudnn': self.use_cudnn,
"align_corners": self.align_corners,
"padding_mode": self.padding_mode,
"mode": self.mode
}
self.outputs = {
'Output':
GridSampler(x, grid, self.align_corners, self.mode,
self.padding_mode)
}
if len(self.grid_shape) == 4:
for i in range(self.theta_shape[0]):
for j in range(2):
for k in range(3):
theta[i, j, k] = np.random.rand(1)[0]
grid = AffineGrid(theta, self.grid_shape)
self.inputs = {'X': x, 'Grid': grid}
self.attrs = {
'use_cudnn': self.use_cudnn,
"align_corners": self.align_corners,
"padding_mode": self.padding_mode,
"mode": self.mode
}
self.outputs = {
'Output':
GridSampler(x, grid, self.align_corners, self.mode,
self.padding_mode)
}
else:
for i in range(self.theta_shape[0]):
for j in range(3):
for k in range(4):
theta[i, j, k] = np.random.rand(1)[0]
grid = AffineGrid3D(theta, self.grid_shape)
self.inputs = {'X': x, 'Grid': grid}
self.attrs = {
'use_cudnn': self.use_cudnn,
"align_corners": self.align_corners,
"padding_mode": self.padding_mode,
"mode": self.mode
}
self.outputs = {
'Output':
GridSampler3D(x, grid, self.align_corners, self.mode,
self.padding_mode)
}
def get_places(self):
    # Restrict this test to CUDA devices; an empty list skips execution
    # on CPU-only builds.
    if not core.is_compiled_with_cuda():
        return []
    return [core.CUDAPlace(0)]
def test_check_output(self):
self.check_output(check_eager=True)
if len(self.grid_shape) == 4:
self.check_output(check_eager=True)
else:
check_eager_flag = True
check_dygraph_flag = False
for place in self.get_places():
res = self.check_output_with_place(
place,
atol=1e-5,
check_dygraph=check_dygraph_flag,
check_eager=check_eager_flag)
if check_eager_flag:
assert check_dygraph_flag == False
outs, eager_dygraph_outs, fetch_list = res
elif check_dygraph_flag:
uts, dygraph_outs, fetch_list = res
else:
outs, fetch_list = res
if self.op_type not in compile_vs_runtime_white_list.COMPILE_RUN_OP_WHITE_LIST:
self.check_compile_vs_runtime(fetch_list, outs)
def test_check_grad_normal(self):
self.check_grad(['X', 'Grid'],
'Output',
max_relative_error=0.01,
numeric_grad_delta=self.numeric_grad_delta,
check_eager=True)
if len(self.grid_shape) == 4:
self.check_grad(['X', 'Grid'],
'Output',
max_relative_error=0.01,
numeric_grad_delta=self.numeric_grad_delta,
check_eager=True)
else:
self._check_grad_helper()
for place in self.get_places():
self.check_grad_with_place(
place, ['X'],
'Output',
numeric_grad_delta=self.numeric_grad_delta,
max_relative_error=0.01,
check_eager=True,
check_dygraph=False)
def initTestCase(self):
self.x_shape = (2, 3, 8, 8)
......@@ -285,5 +484,102 @@ class Case5(LargeInputCase):
self.use_cudnn = False if core.is_compiled_with_rocm() else True
class Case6(TestGridSamplerOp):
    # 5-D (volumetric) input: bilinear sampling, zero padding,
    # align_corners disabled.

    def initTestCase(self):
        self.x_shape = (2, 3, 5, 6, 7)      # NCDHW input
        self.grid_shape = (2, 8, 9, 10, 3)  # [N, D, H, W, 3] sampling grid
        self.theta_shape = (2, 3, 4)        # affine matrix per sample
        self.align_corners = False
        self.padding_mode = "zeros"
        self.mode = "bilinear"
class Case6_(TestGridSamplerOp):
    # 5-D input with border padding; overrides get_places to run on
    # CUDA only.

    def get_places(self):
        # CUDA-only: an empty list skips the test on CPU builds.
        places = []
        if core.is_compiled_with_cuda():
            places.append(core.CUDAPlace(0))
        return places

    def initTestCase(self):
        self.x_shape = (2, 3, 5, 6, 7)      # NCDHW input
        self.grid_shape = (2, 8, 9, 10, 3)  # [N, D, H, W, 3] sampling grid
        self.theta_shape = (2, 3, 4)
        self.align_corners = False
        self.padding_mode = "border"
        self.mode = "bilinear"
class Case7(TestGridSamplerOp):
    # 5-D input: bilinear sampling, reflection padding,
    # align_corners disabled.

    def initTestCase(self):
        self.x_shape = (2, 3, 5, 6, 7)      # NCDHW input
        self.grid_shape = (2, 8, 9, 10, 3)  # [N, D, H, W, 3] sampling grid
        self.theta_shape = (2, 3, 4)
        self.align_corners = False
        self.padding_mode = "reflection"
        self.mode = "bilinear"
class Case8(TestGridSamplerOp):
    # 5-D input: bilinear sampling, reflection padding,
    # align_corners enabled.

    def initTestCase(self):
        self.x_shape = (2, 3, 5, 6, 7)      # NCDHW input
        self.grid_shape = (2, 8, 9, 10, 3)  # [N, D, H, W, 3] sampling grid
        self.theta_shape = (2, 3, 4)
        self.align_corners = True
        self.padding_mode = "reflection"
        self.mode = "bilinear"
class Case9(TestGridSamplerOp):
    # 5-D input: nearest-neighbor sampling, reflection padding; uses a
    # smaller numeric gradient delta for the finite-difference check.

    def initTestCase(self):
        self.x_shape = (2, 3, 5, 6, 7)      # NCDHW input
        self.grid_shape = (2, 8, 9, 10, 3)  # [N, D, H, W, 3] sampling grid
        self.theta_shape = (2, 3, 4)
        self.align_corners = False
        self.padding_mode = "reflection"
        self.mode = "nearest"
        self.numeric_grad_delta = 0.0001
@skip_check_grad_ci(reason="'check_grad' on large inputs is too slow, " +
                    "however it is desirable to cover the forward pass")
class LargeInput3DCase(TestGridSamplerOp):
    # Forward-only coverage for a large 5-D input; the gradient check is
    # skipped (see decorator) and test_check_grad_normal is a no-op.

    def initTestCase(self):
        self.no_need_check_grad = True
        self.x_shape = (2, 3, 24, 24, 12)    # large NCDHW input
        self.grid_shape = (2, 25, 25, 12, 3)  # [N, D, H, W, 3] sampling grid
        self.theta_shape = (2, 3, 4)
        self.align_corners = False
        self.padding_mode = "reflection"
        self.mode = "bilinear"
        self.use_cudnn = False
        self.__class__.op_type = 'grid_sampler'

    def test_check_grad_normal(self):
        # Intentionally empty: gradient checking is skipped for large inputs.
        pass
@skip_check_grad_ci(reason="'check_grad' on large inputs is too slow, " +
                    "however it is desirable to cover the forward pass")
class Case10(LargeInput3DCase):
    # Large 5-D input variant: zero padding with align_corners enabled;
    # inherits the skipped gradient check from LargeInput3DCase.

    def initTestCase(self):
        self.no_need_check_grad = True
        self.x_shape = (2, 3, 24, 24, 12)    # large NCDHW input
        self.grid_shape = (2, 25, 25, 12, 3)  # [N, D, H, W, 3] sampling grid
        self.theta_shape = (2, 3, 4)
        self.align_corners = True
        self.padding_mode = "zeros"
        self.mode = "bilinear"
        self.use_cudnn = False
        self.__class__.op_type = 'grid_sampler'
if __name__ == "__main__":
    # Run the grid_sampler unit tests when this file is executed directly.
    unittest.main()
......@@ -127,12 +127,21 @@ def grid_sample(x,
"""
This operation samples input X by using bilinear interpolation or
nearest interpolation based on flow field grid, which is usually
generated by :code:`affine_grid` . The grid of shape [N, H, W, 2]
is the concatenation of (x, y) coordinates with shape [N, H, W] each,
where x is indexing the 4th dimension (in width dimension) of input
data x and y is indexing the 3rd dimension (in height dimension),
finally results is the bilinear interpolation or nearest value of 4 nearest corner
points. The output tensor shape will be [N, C, H, W].
generated by :code:`affine_grid` . When the input X is 4-D Tensor,
the grid of shape [N, H, W, 2] is the concatenation of (x, y)
coordinates with shape [N, H, W] each, where x is indexing the 4th
dimension (in width dimension) of input data x and y is indexing
the 3rd dimension (in height dimension), finally results is the
bilinear interpolation or nearest value of 4 nearest corner
points. The output tensor shape will be [N, C, H, W]. When the input X
is a 5-D Tensor, the grid of shape [N, D, H, W, 3] is the concatenation
of (x, y, z) coordinates with shape [N, D, H, W] each, where x is
indexing the 5th dimension (in width dimension) of input data x, y is
indexing the 4th dimension (in height dimension) and z is indexing the
3rd dimension (in depth dimension); finally, the result is the bilinear
interpolation or nearest value of the 8 nearest corner points. The output
tensor shape will be [N, C, D, H, W].
Step 1:
......@@ -181,11 +190,13 @@ def grid_sample(x,
Args:
x(Tensor): The input tensor, which is a 4-d tensor with shape
[N, C, H, W], N is the batch size, C is the channel
number, H and W is the feature height and width.
[N, C, H, W] or a 5-d tensor with shape [N, C, D, H, W],
N is the batch size, C is the channel number,
D, H and W is the feature depth, height and width.
The data type is float32 or float64.
grid(Tensor): Input grid tensor of shape [N, grid_H, grid_W, 2]. The
data type is float32 or float64.
grid(Tensor): Input grid tensor, which is a 4-d tensor with shape [N, grid_H,
grid_W, 2] or a 5-d tensor with shape [N, grid_D, grid_H,
grid_W, 3]. The data type is float32 or float64.
mode(str, optional): The interpolation method which can be 'bilinear' or 'nearest'.
Default: 'bilinear'.
padding_mode(str, optional): The padding method used when source index
......@@ -199,7 +210,8 @@ def grid_sample(x,
None by default.
Returns:
Tensor, The shape of output is [N, C, grid_H, grid_W] in which `grid_H` is the height of grid and `grid_W` is the width of grid. The data type is same as input tensor.
Tensor, The shape of output is [N, C, grid_H, grid_W] or [N, C, grid_D, grid_H, grid_W] in which `grid_D` is the depth of grid,
`grid_H` is the height of grid and `grid_W` is the width of grid. The data type is same as input tensor.
Examples:
......@@ -207,31 +219,24 @@ def grid_sample(x,
import paddle
import paddle.nn.functional as F
import numpy as np
# shape=[1, 1, 3, 3]
x = np.array([[[[-0.6, 0.8, -0.5],
[-0.5, 0.2, 1.2],
[ 1.4, 0.3, -0.2]]]]).astype("float64")
# x shape=[1, 1, 3, 3]
x = paddle.to_tensor([[[[-0.6, 0.8, -0.5],
[-0.5, 0.2, 1.2],
[ 1.4, 0.3, -0.2]]]],dtype='float64')
# grid shape = [1, 3, 4, 2]
grid = np.array(
[[[[ 0.2, 0.3],
[-0.4, -0.3],
[-0.9, 0.3],
[-0.9, -0.6]],
[[ 0.4, 0.1],
[ 0.9, -0.8],
[ 0.4, 0.5],
[ 0.5, -0.2]],
[[ 0.1, -0.8],
[-0.3, -1. ],
[ 0.7, 0.4],
[ 0.2, 0.8]]]]).astype("float64")
x = paddle.to_tensor(x)
grid = paddle.to_tensor(grid)
grid = paddle.to_tensor([[[[ 0.2, 0.3],
[-0.4, -0.3],
[-0.9, 0.3],
[-0.9, -0.6]],
[[ 0.4, 0.1],
[ 0.9, -0.8],
[ 0.4, 0.5],
[ 0.5, -0.2]],
[[ 0.1, -0.8],
[-0.3, -1. ],
[ 0.7, 0.4],
[ 0.2, 0.8]]]],dtype='float64')
y_t = F.grid_sample(
x,
grid,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册