test=release/1.5 (#18134)

cherry pick for deform roi pooling

test=release/1.5 (#18134)
cherry pick for deform roi pooling
c50fb58c · cjt222 · GitHub · 1810bfb4 · c50fb58c · c50fb58c
7 changed files
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -238,6 +238,7 @@ paddle.fluid.layers.continuous_value_model (ArgSpec(args=['input', 'cvm', 'use_c
 paddle.fluid.layers.where (ArgSpec(args=['condition'], varargs=None, keywords=None, defaults=None), ('document', '3126e3039e752ce26077f1efaca355c6'))
 paddle.fluid.layers.sign (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', 'ccf6bb7912afd2818d24bc45461e807a'))
 paddle.fluid.layers.deformable_conv (ArgSpec(args=['input', 'offset', 'mask', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'deformable_groups', 'im2col_step', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, None, None, None)), ('document', 'c896b66265a60bd3c5510f66e6e02919'))
+paddle.fluid.layers.deformable_roi_pooling (ArgSpec(args=['input', 'rois', 'trans', 'no_trans', 'spatial_scale', 'group_size', 'pooled_height', 'pooled_width', 'part_size', 'sample_per_part', 'trans_std', 'position_sensitive', 'name'], varargs=None, keywords=None, defaults=(False, 1.0, [1, 1], 1, 1, None, 1, 0.1, False, None)), ('document', '65b8dbe13e00c4dc8224652f6ff89540'))
 paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '9e87163ba32003f21d2c9d8c6a605ada'))
 paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'dce69a78638da8f7ad80b1fc00ed2029'))
 paddle.fluid.layers.read_file (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', '32181f6037e387fb6e68a5beaafe33b6'))

--- a/paddle/fluid/operators/deformable_psroi_pooling_op.cc
+++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/deformable_psroi_pooling_op.h"
+#include <iostream>
+#include <memory>
+#include <vector>
+#include "paddle/fluid/operators/math/blas.h"
+
+namespace paddle {
+namespace operators {
+// Declares the inputs, attributes, outputs and user-facing documentation of
+// the deformable_psroi_pooling operator. The order of the Add* calls is kept
+// as-is; only the description texts were corrected (typos, missing separators
+// between concatenated literals, and the contradictory output_dim wording).
+class DeformablePSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Input",
+             "(Tensor), "
+             "the input of Deformable PSROIPooling. "
+             "The shape of input tensor is [N,C,H,W]. Where N is batch size, "
+             "C is number of input channels, "
+             "H is height of the feature, and "
+             "W is the width of the feature.");
+    AddInput("ROIs",
+             "(LoDTensor), "
+             "ROIs (Regions of Interest) to pool over. "
+             "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) "
+             "given as [[x1, y1, x2, y2], ...]. "
+             "(x1, y1) is the top left coordinates, and "
+             "(x2, y2) is the bottom right coordinates.");
+    AddInput("Trans",
+             "(Tensor), "
+             "offset of features on ROIs while pooling. "
+             "The format is NCHW, where N is number of ROIs, "
+             "C is number of channels, which indicate the offset distance "
+             "in the x and y directions, "
+             "H is pooled height, and "
+             "W is pooled width.");
+    AddAttr<bool>("no_trans",
+                  "(bool), "
+                  "whether add offset to get new value or not while roi "
+                  "pooling, which value is True or False");
+    AddAttr<float>("spatial_scale",
+                   "(float), "
+                   "ratio of input feature map height (or width) to "
+                   "raw image height (or width). Equals the reciprocal "
+                   "of total stride in convolutional layers.");
+    // InferShape enforces input_channels >= output_dim, so the description
+    // must not claim that the output has more channels than the input.
+    AddAttr<int>("output_dim",
+                 "(int), "
+                 "the number of output channels, which should not be greater "
+                 "than input channels. Deformable roi_pooling requires "
+                 "output_channels = input_channels, while deformable "
+                 "psroi_pooling requires input_channels = output_channels "
+                 "* pooled_height * pooled_width.");
+    AddAttr<std::vector<int>>(
+        "group_size",
+        "(vector<int>), "
+        "the number of groups which input channels are divided into "
+        "(eg. number of input channels is k1*k2*(C+1), where k1 and k2 "
+        "are group width and height and C+1 is number of output "
+        "channels), eg. (4, 6), where 4 is height of group and 6 is "
+        "width of group.");
+    AddAttr<int>("pooled_height",
+                 "(int), "
+                 "the pooled output height.");
+    AddAttr<int>("pooled_width",
+                 "(int), "
+                 "the pooled output width.");
+    AddAttr<std::vector<int>>(
+        "part_size",
+        "(vector<int>), "
+        "the height and width of offset, eg. (4, 6), where height is 4 "
+        "and width is 6.");
+    AddAttr<int>("sample_per_part",
+                 "(int), "
+                 "the number of samples in each bin");
+    AddAttr<float>("trans_std",
+                   "(float), "
+                   "Coefficient of offset");
+    AddOutput("TopCount",
+              "(Tensor), "
+              "record the number of pixels used in average pooling in each "
+              "bin. "
+              "The format is NCHW, where N is the number of ROIs, "
+              "C is the number of output channels, "
+              "H is the height of output, and "
+              "W is the width of output.");
+    AddOutput("Output",
+              "(Tensor), "
+              "the output of Deformable PSROIPooling. "
+              "The format is NCHW, where N is the number of ROIs, "
+              "C is the number of output channels, "
+              "H is the height of output, and "
+              "W is the width of output.");
+    AddComment(R"DOC(
+**DeformablePSROIPooling Operator**
+DeformablePSROIPooling is a new method based on Region of interest pooling
+(also known as RoI pooling).
+The operator has four steps:
+
+1. Dividing each region proposal into equal-sized sections with
+   the pooled_width and pooled_height.
+
+2. Add offset to pixel in ROI to get new location and the new value which are
+   computed directly through bilinear interpolation with four nearest pixel.
+
+3. Sample several points to get average values in each bin.
+
+4. Copying these average values to the output buffer.
+
+DeformablePSROIPooling is part of Deformable Convolutional Networks,
+please refer to https://arxiv.org/abs/1703.06211 for more details.
+    )DOC");
+  }
+};
+
+// Forward operator definition for deformable_psroi_pooling.
+//
+// InferShape validates that all inputs/outputs exist, checks the ranks of the
+// inputs and the ranges of the attributes, and sets the shape of both Output
+// and TopCount to (num_rois, output_dim, pooled_height, pooled_width).
+// GetExpectedKernelType selects the kernel from the data type of Input.
+class DeformablePSROIPoolOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(Input) of DeformablePSROIPoolOp "
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("ROIs"),
+                   "Input(ROIs) of DeformablePSROIPoolOp "
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Trans"),
+                   "Input(Trans) of DeformablePSROIPoolOp "
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Output"),
+                   "Output(Output) of DeformablePSROIPoolOp "
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("TopCount"),
+                   "Output(TopCount) of DeformablePSROIPoolOp "
+                   "should not be null.");
+    auto input_dims = ctx->GetInputDim("Input");
+    auto rois_dims = ctx->GetInputDim("ROIs");
+    auto trans_dims = ctx->GetInputDim("Trans");
+    // The output shape below is written through indices 0..3 of a copy of
+    // input_dims, so the input rank has to be verified first.
+    PADDLE_ENFORCE(input_dims.size() == 4,
+                   "The format of Input(Input) is (N, C, H, W).");
+    PADDLE_ENFORCE(rois_dims.size() == 2,
+                   "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) "
+                   "given as [[ x1, y1, x2, y2], ...].");
+    PADDLE_ENFORCE(trans_dims.size() == 4,
+                   "The format of Input Trans is (N, 2, H, W).");
+    auto pooled_height = ctx->Attrs().Get<int>("pooled_height");
+    auto pooled_width = ctx->Attrs().Get<int>("pooled_width");
+    auto spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
+    auto output_channels = ctx->Attrs().Get<int>("output_dim");
+    auto sample_per_part = ctx->Attrs().Get<int>("sample_per_part");
+    auto trans_std = ctx->Attrs().Get<float>("trans_std");
+
+    // Check the vector lengths BEFORE reading elements 0 and 1; indexing a
+    // too-short attribute vector would be an out-of-bounds read.
+    auto group_size = ctx->Attrs().Get<std::vector<int>>("group_size");
+    PADDLE_ENFORCE_EQ(group_size.size(), 2,
+                      "The size of group_size should be 2.");
+    auto group_height = group_size[0];
+    auto group_width = group_size[1];
+    auto part_size = ctx->Attrs().Get<std::vector<int>>("part_size");
+    PADDLE_ENFORCE_EQ(part_size.size(), 2,
+                      "The size of part_size should be 2.");
+    auto part_height = part_size[0];
+    auto part_width = part_size[1];
+
+    PADDLE_ENFORCE(trans_std >= 0.0f,
+                   "trans_std must not be less than 0.0");
+    PADDLE_ENFORCE(input_dims[1] >= output_channels,
+                   "input channels must not be less than output channels");
+    PADDLE_ENFORCE_GT(pooled_height, 0,
+                      "The pooled height must be greater than 0");
+    PADDLE_ENFORCE_GT(pooled_width, 0,
+                      "The pooled width must be greater than 0");
+    PADDLE_ENFORCE_GT(spatial_scale, 0.0f,
+                      "The spatial scale must be greater than 0");
+    PADDLE_ENFORCE_GT(group_height, 0,
+                      "The group_height in group_size must be greater than 0");
+    PADDLE_ENFORCE_GT(group_width, 0,
+                      "The group_width in group_size must be greater than 0");
+    PADDLE_ENFORCE_GT(part_height, 0,
+                      "The part_height in part_size must be greater than 0");
+    PADDLE_ENFORCE_GT(part_width, 0,
+                      "The part_width in part_size must be greater than 0");
+    PADDLE_ENFORCE(part_height <= trans_dims[2],
+                   "The height of trans must not be less than part_height");
+    PADDLE_ENFORCE(part_width <= trans_dims[3],
+                   "The width of trans must not be less than part_width");
+    PADDLE_ENFORCE_GT(sample_per_part, 0,
+                      "The sample_per_part must be greater than 0");
+
+    // Output and TopCount share the shape
+    // (num_rois, output_dim, pooled_height, pooled_width).
+    auto out_dims = input_dims;
+    out_dims[0] = rois_dims[0];
+    out_dims[1] = output_channels;
+    out_dims[2] = pooled_height;
+    out_dims[3] = pooled_width;
+    ctx->SetOutputDim("Output", out_dims);
+    ctx->SetOutputDim("TopCount", out_dims);
+  }
+
+ protected:
+  // The kernel data type follows the data type of Input.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
+                                   ctx.device_context());
+  }
+};
+
+// Builds the description of the deformable_psroi_pooling_grad op from the
+// forward op: it consumes the forward inputs, the TopCount output and the
+// gradient of Output, and produces the gradients of Input and Trans.
+class DeformablePSROIPoolGradOpDescMaker
+    : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> grad_desc(new framework::OpDesc());
+    grad_desc->SetType("deformable_psroi_pooling_grad");
+
+    // Values forwarded unchanged from the forward op.
+    grad_desc->SetInput("Input", Input("Input"));
+    grad_desc->SetInput("Trans", Input("Trans"));
+    grad_desc->SetInput("ROIs", Input("ROIs"));
+    grad_desc->SetInput("TopCount", Output("TopCount"));
+
+    // Incoming gradient.
+    grad_desc->SetInput(framework::GradVarName("Output"),
+                        OutputGrad("Output"));
+
+    // Gradients produced by the backward op.
+    grad_desc->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
+    grad_desc->SetOutput(framework::GradVarName("Trans"), InputGrad("Trans"));
+
+    // The backward op reuses every attribute of the forward op.
+    grad_desc->SetAttrMap(Attrs());
+    return grad_desc;
+  }
+};
+
+// Backward operator definition for deformable_psroi_pooling.
+//
+// InferShape requires the gradient of Output to be present and gives each
+// requested gradient (of Input and of Trans) the shape of the corresponding
+// forward input. GetExpectedKernelType selects the kernel from the data type
+// of Trans.
+class DeformablePSROIPoolGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    const auto out_grad_name = framework::GradVarName("Output");
+    PADDLE_ENFORCE(ctx->HasInput(out_grad_name),
+                   "The gradient of Output should not be null.");
+
+    // A gradient output is optional; set its shape only when it is present.
+    const auto input_grad_name = framework::GradVarName("Input");
+    if (ctx->HasOutput(input_grad_name)) {
+      ctx->SetOutputDim(input_grad_name, ctx->GetInputDim("Input"));
+    }
+
+    const auto trans_grad_name = framework::GradVarName("Trans");
+    if (ctx->HasOutput(trans_grad_name)) {
+      ctx->SetOutputDim(trans_grad_name, ctx->GetInputDim("Trans"));
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    const auto data_type = ctx.Input<Tensor>("Trans")->type();
+    return framework::OpKernelType(data_type, ctx.device_context());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+// Short aliases used by the registration macros below.
+namespace ops = paddle::operators;
+using CPU = paddle::platform::CPUDeviceContext;
+// Register the forward op together with its proto maker and the maker that
+// generates the description of its gradient op.
+REGISTER_OPERATOR(deformable_psroi_pooling, ops::DeformablePSROIPoolOp,
+                  ops::DeformablePSROIPoolOpMaker,
+                  ops::DeformablePSROIPoolGradOpDescMaker);
+// Register the gradient op itself (no maker is passed for it here).
+REGISTER_OPERATOR(deformable_psroi_pooling_grad,
+                  ops::DeformablePSROIPoolGradOp);
+// CPU kernels for float and double. The kernel class templates are not
+// defined in this file; presumably they come from
+// deformable_psroi_pooling_op.h, which is included at the top.
+REGISTER_OP_CPU_KERNEL(deformable_psroi_pooling,
+                       ops::DeformablePSROIPoolCPUKernel<CPU, float>,
+                       ops::DeformablePSROIPoolCPUKernel<CPU, double>);
+REGISTER_OP_CPU_KERNEL(deformable_psroi_pooling_grad,
+                       ops::DeformablePSROIPoolGradCPUKernel<CPU, float>,
+                       ops::DeformablePSROIPoolGradCPUKernel<CPU, double>);
--- a/paddle/fluid/operators/deformable_psroi_pooling_op.cu
+++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <stdio.h>
+#include <algorithm>
+#include <iostream>
+#include <limits>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/deformable_psroi_pooling_op.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+
+namespace paddle {
+namespace operators {
+
+// Local aliases for the framework tensor types used in this file.
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+// Grid-stride loop: each thread starts at its global thread index and
+// advances by the total number of threads in the grid until `n` is reached,
+// so the loop covers [0, n) for any grid size. The loop variable is an int.
+#define CUDA_KERNEL_LOOP(i, n)                                 \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+// Number of threads per block used when launching the kernels in this file.
+const int CUDA_NUM_THREADS = 1024;
+// Returns ceil(N / CUDA_NUM_THREADS): the number of blocks needed so that
+// there is at least one thread per element. Returns 0 when N is 0.
+static inline int GET_BLOCKS(const int N) {
+  return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
+}
+
+// Bilinearly interpolates the row-major 2-D map `data` (row stride `width`)
+// at the fractional position (x, y) from the four surrounding grid points
+// (floor/ceil of each coordinate).
+//
+// No bounds checking is done here: the caller has to pass coordinates whose
+// floor and ceil are valid indices. `height` is accepted but not used.
+template <typename T>
+__device__ T bilinear_interpolation(const T* data, const T x, const T y,
+                                    const int width, const int height) {
+  const int left = floor(x);
+  const int right = ceil(x);
+  const int top = floor(y);
+  const int bottom = ceil(y);
+
+  // Fractional distance from the lower grid coordinate in each direction.
+  const T frac_x = static_cast<T>(x - left);
+  const T frac_y = static_cast<T>(y - top);
+
+  const T* top_row = data + top * width;
+  const T* bottom_row = data + bottom * width;
+  const T top_left = top_row[left];
+  const T bottom_left = bottom_row[left];
+  const T top_right = top_row[right];
+  const T bottom_right = bottom_row[right];
+
+  return (1 - frac_x) * (1 - frac_y) * top_left +
+         (1 - frac_x) * frac_y * bottom_left +
+         frac_x * (1 - frac_y) * top_right + frac_x * frac_y * bottom_right;
+}
+
+// Forward kernel of deformable PSROI pooling.
+//
+// Each loop iteration handles one output element, identified by the flat
+// index `index` in (n, ctop, ph, pw) order, where `count` is the total number
+// of output elements. For that element the kernel:
+//   1. maps the ROI `n` onto the feature map (round, scale, shift by 0.5),
+//   2. shifts the start of bin (ph, pw) by the learned offset read from
+//      `bottom_trans` (scaled by `trans_std` and the ROI size) unless
+//      `no_trans` is set,
+//   3. takes sample_per_part x sample_per_part samples in the bin, skipping
+//      samples that fall outside the feature map, and bilinearly
+//      interpolates the remaining ones,
+//   4. writes the average to `top_data[index]` and the number of samples
+//      actually used to `top_count[index]`.
+// `roi_batch_id_data[n]` gives the batch image that ROI `n` belongs to.
+template <typename T>
+__global__ void DeformablePSROIPoolForwardKernel(
+    const int count, const T* bottom_data, const T spatial_scale,
+    const int channels, const int height, const int width,
+    const int pooled_height, const int pooled_width, const T* bottom_rois,
+    const T* bottom_trans, const bool no_trans, const T trans_std,
+    const int sample_per_part, const int output_dim, const int group_height,
+    const int group_width, const int part_height, const int part_width,
+    const int num_classes, const int channels_each_class, T* top_data,
+    T* top_count, int* roi_batch_id_data) {
+  CUDA_KERNEL_LOOP(index, count) {
+    // The output is in order (n, ctop, ph, pw)
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int ctop = (index / pooled_width / pooled_height) % output_dim;
+    int n = index / pooled_width / pooled_height / output_dim;
+    // Each ROI occupies 4 consecutive values: x1, y1, x2, y2.
+    const T* offset_bottom_rois = bottom_rois + n * 4;
+    int roi_batch_ind = roi_batch_id_data[n];
+
+    // location of roi on feature map
+    T roi_start_w =
+        static_cast<T>(round(offset_bottom_rois[0])) * spatial_scale - 0.5;
+    T roi_start_h =
+        static_cast<T>(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
+    T roi_end_w =
+        static_cast<T>(round(offset_bottom_rois[2]) + 1.) * spatial_scale - 0.5;
+    T roi_end_h =
+        static_cast<T>(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
+
+    // width and height of roi
+    T roi_width = max(roi_end_w - roi_start_w, 0.1);  // avoid 0
+    T roi_height = max(roi_end_h - roi_start_h, 0.1);
+
+    // width and height of each bin
+    T bin_size_h = roi_height / static_cast<T>(pooled_height);
+    T bin_size_w = roi_width / static_cast<T>(pooled_width);
+
+    // sampling interval in each bin
+    T sub_bin_size_h = bin_size_h / static_cast<T>(sample_per_part);
+    T sub_bin_size_w = bin_size_w / static_cast<T>(sample_per_part);
+
+    // obtain offset of roi
+    // (ph, pw) is mapped to a cell (part_h, part_w) of the offset map; the
+    // offset map is indexed as (n, class_id * 2 + {0: x, 1: y}, part_h,
+    // part_w).
+    int part_h = floor(static_cast<T>(ph) / pooled_height * part_height);
+    int part_w = floor(static_cast<T>(pw) / pooled_width * part_width);
+    int class_id = ctop / channels_each_class;
+
+    T trans_x =
+        no_trans
+            ? static_cast<T>(0)
+            : bottom_trans[(((n * num_classes + class_id) * 2) * part_height +
+                            part_h) *
+                               part_width +
+                           part_w] *
+                  static_cast<T>(trans_std);
+    T trans_y = no_trans
+                    ? static_cast<T>(0)
+                    : bottom_trans[(((n * num_classes + class_id) * 2 + 1) *
+                                        part_height +
+                                    part_h) *
+                                       part_width +
+                                   part_w] *
+                          static_cast<T>(trans_std);
+
+    // location of start after adding offset
+    T wstart = static_cast<T>(pw) * bin_size_w + roi_start_w;
+    wstart += trans_x * roi_width;
+    T hstart = static_cast<T>(ph) * bin_size_h + roi_start_h;
+    hstart += trans_y * roi_height;
+    T sum = 0;
+    // NOTE(review): this local shadows the kernel parameter `count` inside
+    // the loop body; here it is the number of samples that were accumulated.
+    int count = 0;
+    // Group cell of this bin, clamped to the valid group range; it selects
+    // which input channel is read for output channel `ctop`.
+    int gw = floor(static_cast<T>(pw) * group_width / pooled_width);
+    int gh = floor(static_cast<T>(ph) * group_height / pooled_height);
+    gw = min(max(gw, 0), group_width - 1);
+    gh = min(max(gh, 0), group_height - 1);
+    // Start of the feature maps of the image this ROI belongs to.
+    const T* offset_bottom_data =
+        bottom_data + (roi_batch_ind * channels) * height * width;
+
+    // sampling in each bin
+    for (int ih = 0; ih < sample_per_part; ih++) {
+      for (int iw = 0; iw < sample_per_part; iw++) {
+        T w = wstart + iw * sub_bin_size_w;
+        T h = hstart + ih * sub_bin_size_h;
+        // Samples more than half a pixel outside the feature map are ignored
+        // and do not contribute to the average.
+        if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) {
+          continue;
+        }
+        // Clamp so that floor/ceil of the coordinates are valid indices.
+        w = min(max(w, 0.), width - 1.);
+        h = min(max(h, 0.), height - 1.);
+        int c = (ctop * group_height + gh) * group_width + gw;
+        // bilinear interpolation
+        T val = bilinear_interpolation(offset_bottom_data + c * height * width,
+                                       w, h, width, height);
+        sum += val;
+        count++;
+      }
+    }
+    // Average over the samples that were inside the map; 0 if there were none.
+    top_data[index] = count == 0 ? static_cast<T>(0) : sum / count;
+    top_count[index] = count;
+  }
+}
+
+// CUDA forward OpKernel of deformable_psroi_pooling.
+//
+// Reads Input, ROIs (a LoDTensor whose last LoD level assigns ROIs to batch
+// images) and Trans, builds the ROI -> batch-index table on the host, copies
+// it to the device and launches DeformablePSROIPoolForwardKernel with one
+// thread per output element. Writes Output and TopCount.
+//
+// Compared with the previous revision this version
+//   * fails with a clear message when ROIs carries no LoD instead of calling
+//     back() on an empty container,
+//   * verifies that Trans provides at least one (x, y) channel pair before
+//     dividing by the derived class count,
+//   * allocates each output buffer once.
+template <typename DeviceContext, typename T>
+class DeformablePSROIPoolCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const Tensor* input = ctx.Input<Tensor>("Input");
+    const LoDTensor* rois = ctx.Input<LoDTensor>("ROIs");
+    const Tensor* trans = ctx.Input<Tensor>("Trans");
+    Tensor* out = ctx.Output<Tensor>("Output");
+    Tensor* top_count = ctx.Output<Tensor>("TopCount");
+    T* top_data = out->mutable_data<T>(ctx.GetPlace());
+    T* top_count_data = top_count->mutable_data<T>(ctx.GetPlace());
+
+    auto no_trans = ctx.Attr<bool>("no_trans");
+    auto spatial_scale = ctx.Attr<float>("spatial_scale");
+    auto output_dim = ctx.Attr<int>("output_dim");
+    auto group_size = ctx.Attr<std::vector<int>>("group_size");
+    auto group_height = group_size[0];
+    auto group_width = group_size[1];
+    auto pooled_height = ctx.Attr<int>("pooled_height");
+    auto pooled_width = ctx.Attr<int>("pooled_width");
+    auto part_size = ctx.Attr<std::vector<int>>("part_size");
+    auto part_height = part_size[0];
+    auto part_width = part_size[1];
+    auto sample_per_part = ctx.Attr<int>("sample_per_part");
+    auto trans_std = ctx.Attr<float>("trans_std");
+
+    const int batch = static_cast<int>(input->dims()[0]);
+    const int channels = static_cast<int>(input->dims()[1]);
+    const int height = static_cast<int>(input->dims()[2]);
+    const int width = static_cast<int>(input->dims()[3]);
+    const int channels_trans = no_trans ? 2 : trans->dims()[1];
+    const int num_rois = rois->dims()[0];
+    PADDLE_ENFORCE_EQ(num_rois, out->dims()[0],
+                      "number of rois should be same with number of output");
+    const int count = num_rois * output_dim * pooled_height * pooled_width;
+    // Every class owns two offset channels (x and y). With fewer than two
+    // channels the class count would be 0 and the division below would be a
+    // division by zero, so reject that case first.
+    const int num_classes = no_trans ? 1 : channels_trans / 2;
+    PADDLE_ENFORCE(num_classes >= 1,
+                   "Input(Trans) must have at least 2 channels "
+                   "when no_trans is false.");
+    const int channels_each_class =
+        no_trans ? output_dim : output_dim / num_classes;
+    PADDLE_ENFORCE(channels_each_class >= 1,
+                   "channels_each_class must not be less than 1");
+
+    const T* bottom_data = input->data<T>();
+    const T* bottom_rois = rois->data<T>();
+    const T* bottom_trans = no_trans ? nullptr : trans->data<T>();
+
+    // Host-side table mapping every ROI to the index of its batch image,
+    // derived from the last LoD level of ROIs.
+    framework::Tensor roi_batch_id_list;
+    roi_batch_id_list.Resize({num_rois});
+    auto cplace = platform::CPUPlace();
+    int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(cplace);
+    PADDLE_ENFORCE(!rois->lod().empty(),
+                   "Input(ROIs) must carry LoD information that assigns "
+                   "each ROI to an image of the batch.");
+    auto rois_lod = rois->lod().back();
+    int rois_batch_size = rois_lod.size() - 1;
+    PADDLE_ENFORCE_EQ(
+        rois_batch_size, batch,
+        "The rois_batch_size and imgs batch_size must be the same.");
+    int rois_num_with_lod = rois_lod[rois_batch_size];
+    PADDLE_ENFORCE_EQ(num_rois, rois_num_with_lod,
+                      "The rois_num from input and lod must be the same.");
+    for (int n = 0; n < rois_batch_size; ++n) {
+      for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+        roi_batch_id_data[i] = n;
+      }
+    }
+
+    // Copy the table into temporary device memory on the op's stream.
+    auto& dev_ctx = ctx.cuda_device_context();
+    auto& allocator =
+        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
+    int bytes = roi_batch_id_list.numel() * sizeof(int);
+    auto roi_ptr = allocator.Allocate(bytes);
+    int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
+    const auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+    memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes,
+                 dev_ctx.stream());
+
+    DeformablePSROIPoolForwardKernel<<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0,
+                                       dev_ctx.stream()>>>(
+        count, bottom_data, static_cast<T>(spatial_scale), channels, height,
+        width, pooled_height, pooled_width, bottom_rois, bottom_trans,
+        no_trans, static_cast<T>(trans_std), sample_per_part, output_dim,
+        group_height, group_width, part_height, part_width, num_classes,
+        channels_each_class, top_data, top_count_data, roi_id_data);
+  }
+};
+
+// Backward kernel of deformable PSROI pooling.
+//
+// Each loop iteration handles one output element `index` in
+// (n, ctop, ph, pw) order and recomputes the same sampling positions as the
+// forward kernel. The incoming gradient `top_diff[index]` is divided by the
+// number of samples recorded in `top_count[index]` and, for every sample
+// inside the feature map, accumulated
+//   * into `bottom_data_diff` (gradient of the input) with the bilinear
+//     weights of the four neighbouring pixels, if that pointer is non-null;
+//   * into `bottom_trans_diff` (gradient of the offsets), unless `no_trans`
+//     is set or that pointer is null.
+// All accumulation goes through platform::CudaAtomicAdd; presumably this is
+// because different output elements can map to the same destination element.
+// The destination buffers are only added to, never initialised, in this
+// kernel.
+template <typename T>
+__global__ void DeformablePSROIPoolBackwardAccKernel(
+    const int count, const T* top_diff, const T* top_count, const int num_rois,
+    const T spatial_scale, const int channels, const int height,
+    const int width, const int pooled_height, const int pooled_width,
+    const int output_dim, T* bottom_data_diff, T* bottom_trans_diff,
+    const T* bottom_data, const T* bottom_rois, const T* bottom_trans,
+    const bool no_trans, const T trans_std, const int sample_per_part,
+    const int group_height, const int group_width, const int part_height,
+    const int part_width, const int num_classes, const int channels_each_class,
+    int* roi_batch_id_data) {
+  CUDA_KERNEL_LOOP(index, count) {
+    // The output is in order (n, ctop, ph, pw)
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int ctop = (index / pooled_width / pooled_height) % output_dim;
+    int n = index / pooled_width / pooled_height / output_dim;
+    // NOTE(review): num_box is computed but not used anywhere in this kernel.
+    int num_box = count / pooled_height / pooled_width / output_dim;
+    // Each ROI occupies 4 consecutive values: x1, y1, x2, y2.
+    const T* offset_bottom_rois = bottom_rois + n * 4;
+    int roi_batch_ind = roi_batch_id_data[n];
+
+    // location of roi on feature map
+    T roi_start_w =
+        static_cast<T>(round(offset_bottom_rois[0])) * spatial_scale - 0.5;
+    T roi_start_h =
+        static_cast<T>(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
+    T roi_end_w =
+        static_cast<T>(round(offset_bottom_rois[2]) + 1.) * spatial_scale - 0.5;
+    T roi_end_h =
+        static_cast<T>(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
+
+    // width and height of roi
+    // (lower bound of 0.1 keeps the sizes strictly positive)
+    T roi_width = max(roi_end_w - roi_start_w, 0.1);
+    T roi_height = max(roi_end_h - roi_start_h, 0.1);
+
+    // width and height of each bin
+    T bin_size_h = roi_height / static_cast<T>(pooled_height);
+    T bin_size_w = roi_width / static_cast<T>(pooled_width);
+
+    // sampling interval in each bin
+    T sub_bin_size_h = bin_size_h / static_cast<T>(sample_per_part);
+    T sub_bin_size_w = bin_size_w / static_cast<T>(sample_per_part);
+
+    // obtain offset of roi
+    // (ph, pw) is mapped to a cell (part_h, part_w) of the offset map; the
+    // offset map is indexed as (n, class_id * 2 + {0: x, 1: y}, part_h,
+    // part_w).
+    int part_h = floor(static_cast<T>(ph) / pooled_height * part_height);
+    int part_w = floor(static_cast<T>(pw) / pooled_width * part_width);
+    int class_id = ctop / channels_each_class;
+
+    T trans_x =
+        no_trans
+            ? static_cast<T>(0)
+            : bottom_trans[(((n * num_classes + class_id) * 2) * part_height +
+                            part_h) *
+                               part_width +
+                           part_w] *
+                  static_cast<T>(trans_std);
+    T trans_y = no_trans
+                    ? static_cast<T>(0)
+                    : bottom_trans[(((n * num_classes + class_id) * 2 + 1) *
+                                        part_height +
+                                    part_h) *
+                                       part_width +
+                                   part_w] *
+                          static_cast<T>(trans_std);
+    // location of start after adding offset
+    T wstart = static_cast<T>(pw) * bin_size_w + roi_start_w;
+    wstart += trans_x * roi_width;
+    T hstart = static_cast<T>(ph) * bin_size_h + roi_start_h;
+    hstart += trans_y * roi_height;
+
+    // No sample contributed to this output element, so there is nothing to
+    // propagate (and the division below would be by zero).
+    if (top_count[index] <= 0) {
+      continue;
+    }
+
+    // Gradient share of each sample that was averaged in the forward pass.
+    T diff_val = top_diff[index] / top_count[index];
+    // Start of the feature maps of the image this ROI belongs to.
+    const T* offset_bottom_data =
+        bottom_data + roi_batch_ind * channels * height * width;
+    // Group cell of this bin, clamped to the valid group range.
+    int gw = floor(static_cast<T>(pw) * group_width / pooled_width);
+    int gh = floor(static_cast<T>(ph) * group_height / pooled_height);
+    gw = min(max(gw, 0), group_width - 1);
+    gh = min(max(gh, 0), group_height - 1);
+
+    // sampling in each bin
+    for (int ih = 0; ih < sample_per_part; ih++) {
+      for (int iw = 0; iw < sample_per_part; iw++) {
+        T w = wstart + iw * sub_bin_size_w;
+        T h = hstart + ih * sub_bin_size_h;
+        // Samples more than half a pixel outside the feature map are skipped.
+        if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) {
+          continue;
+        }
+        // Clamp so that floor/ceil of the coordinates are valid indices.
+        w = min(max(w, 0.), width - 1.);
+        h = min(max(h, 0.), height - 1.);
+        int c = (ctop * group_height + gh) * group_width + gw;
+        int x0 = floor(w);
+        int x1 = ceil(w);
+        int y0 = floor(h);
+        int y1 = ceil(h);
+
+        // compute coefficient of gradient
+        // (bilinear weights of the four neighbours; qXY belongs to the pixel
+        // at column xX and row yY)
+        T dist_x = w - x0, dist_y = h - y0;
+        T q00 = (1 - dist_x) * (1 - dist_y);
+        T q01 = (1 - dist_x) * dist_y;
+        T q10 = dist_x * (1 - dist_y);
+        T q11 = dist_x * dist_y;
+        int bottom_index_base = c * height * width;
+
+        // compute gradient of input
+        if (bottom_data_diff) {
+          platform::CudaAtomicAdd(
+              bottom_data_diff + roi_batch_ind * channels * height * width +
+                  bottom_index_base + y0 * width + x0,
+              q00 * diff_val);
+          platform::CudaAtomicAdd(
+              bottom_data_diff + roi_batch_ind * channels * height * width +
+                  bottom_index_base + y1 * width + x0,
+              q01 * diff_val);
+          platform::CudaAtomicAdd(
+              bottom_data_diff + roi_batch_ind * channels * height * width +
+                  bottom_index_base + y0 * width + x1,
+              q10 * diff_val);
+          platform::CudaAtomicAdd(
+              bottom_data_diff + roi_batch_ind * channels * height * width +
+                  bottom_index_base + y1 * width + x1,
+              q11 * diff_val);
+        }
+
+        // compute gradient of trans
+        if (no_trans || bottom_trans_diff == NULL) {
+          continue;
+        }
+
+        // Input values at the four neighbours; uXY is the value at column xX
+        // and row yY.
+        T u00 = offset_bottom_data[bottom_index_base + y0 * width + x0];
+        T u01 = offset_bottom_data[bottom_index_base + y1 * width + x0];
+        T u10 = offset_bottom_data[bottom_index_base + y0 * width + x1];
+        T u11 = offset_bottom_data[bottom_index_base + y1 * width + x1];
+        // Derivative of the interpolated value with respect to the sampling
+        // position, multiplied by trans_std and the ROI size (the factors the
+        // offset was multiplied with when the start position was computed).
+        T diff_x = (u11 * dist_y + u10 * (1 - dist_y) - u01 * dist_y -
+                    u00 * (1 - dist_y)) *
+                   trans_std * diff_val;
+        diff_x *= roi_width;
+        T diff_y = (u11 * dist_x + u01 * (1 - dist_x) - u10 * dist_x -
+                    u00 * (1 - dist_x)) *
+                   trans_std * diff_val;
+        diff_y *= roi_height;
+        // Accumulate into the x and y offset channels of the same offset
+        // cell that was read above.
+        platform::CudaAtomicAdd(
+            bottom_trans_diff +
+                (((n * num_classes + class_id) * 2) * part_height + part_h) *
+                    part_width +
+                part_w,
+            diff_x);
+        platform::CudaAtomicAdd(
+            bottom_trans_diff +
+                (((n * num_classes + class_id) * 2 + 1) * part_height +
+                 part_h) *
+                    part_width +
+                part_w,
+            diff_y);
+      }
+    }
+  }
+}
+
+// CUDA gradient kernel wrapper for the deformable_psroi_pooling op.
+//
+// Zero-fills whichever of Input@GRAD / Trans@GRAD are requested, builds a
+// per-ROI image-index table on the host from the last LoD level of ROIs,
+// copies that table to the device and launches
+// DeformablePSROIPoolBackwardAccKernel on the device context's stream.
+template <typename DeviceContext, typename T>
+class DeformablePSROIPoolGradCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* feat = ctx.Input<Tensor>("Input");
+    const auto* roi_tensor = ctx.Input<LoDTensor>("ROIs");
+    const auto* offset_tensor = ctx.Input<Tensor>("Trans");
+    const auto* sample_count = ctx.Input<Tensor>("TopCount");
+    const auto* out_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
+    auto* feat_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
+    auto* offset_grad = ctx.Output<Tensor>(framework::GradVarName("Trans"));
+
+    // The device kernel adds into the gradient buffers, so clear them first.
+    math::SetConstant<DeviceContext, T> fill;
+    auto& dev_ctx = ctx.cuda_device_context();
+    if (feat_grad != nullptr) {
+      feat_grad->mutable_data<T>(ctx.GetPlace());
+      fill(dev_ctx, feat_grad, static_cast<T>(0));
+    }
+    if (offset_grad != nullptr) {
+      offset_grad->mutable_data<T>(ctx.GetPlace());
+      fill(dev_ctx, offset_grad, static_cast<T>(0));
+    }
+
+    // Operator attributes.
+    const auto no_trans = ctx.Attr<bool>("no_trans");
+    const auto spatial_scale = ctx.Attr<float>("spatial_scale");
+    const auto output_dim = ctx.Attr<int>("output_dim");
+    const auto group_size = ctx.Attr<std::vector<int>>("group_size");
+    const auto group_height = group_size[0];
+    const auto group_width = group_size[1];
+    const auto pooled_height = ctx.Attr<int>("pooled_height");
+    const auto pooled_width = ctx.Attr<int>("pooled_width");
+    const auto part_size = ctx.Attr<std::vector<int>>("part_size");
+    const auto part_height = part_size[0];
+    const auto part_width = part_size[1];
+    const auto sample_per_part = ctx.Attr<int>("sample_per_part");
+    const auto trans_std = ctx.Attr<float>("trans_std");
+
+    // Shapes: Input is indexed as [batch, channels, height, width].
+    const auto& feat_dims = feat->dims();
+    const int batch = static_cast<int>(feat_dims[0]);
+    const int channels = static_cast<int>(feat_dims[1]);
+    const int height = static_cast<int>(feat_dims[2]);
+    const int width = static_cast<int>(feat_dims[3]);
+    const int channels_trans = no_trans ? 2 : offset_tensor->dims()[1];
+    const int num_rois = roi_tensor->dims()[0];
+    // One device work item per element of the pooled output.
+    const int count = num_rois * output_dim * pooled_height * pooled_width;
+    const int num_classes = no_trans ? 1 : channels_trans / 2;
+    const int channels_each_class =
+        no_trans ? output_dim : output_dim / num_classes;
+
+    // Raw device pointers handed to the kernel.
+    const T* top_diff = out_grad->data<T>();
+    const T* bottom_data = feat->data<T>();
+    const T* bottom_rois = roi_tensor->data<T>();
+    const T* bottom_trans = no_trans ? nullptr : offset_tensor->data<T>();
+
+    T* bottom_data_diff = nullptr;
+    T* bottom_trans_diff = nullptr;
+    if (feat_grad != nullptr) {
+      bottom_data_diff = feat_grad->mutable_data<T>(ctx.GetPlace());
+    }
+    // The offset gradient is only produced when offsets are actually used.
+    if (offset_grad != nullptr && !no_trans) {
+      bottom_trans_diff = offset_grad->mutable_data<T>(ctx.GetPlace());
+    }
+
+    const T* top_count_data = sample_count->data<T>();
+
+    // Host-side table mapping each ROI to the image it belongs to.
+    framework::Tensor roi_batch_id_list;
+    roi_batch_id_list.Resize({num_rois});
+    auto cplace = platform::CPUPlace();
+    int* host_roi_ids = roi_batch_id_list.mutable_data<int>(cplace);
+    auto rois_lod = roi_tensor->lod().back();
+    int rois_batch_size = rois_lod.size() - 1;
+    PADDLE_ENFORCE_EQ(
+        rois_batch_size, batch,
+        "The rois_batch_size and imgs batch_size must be the same.");
+
+    int rois_num_with_lod = rois_lod[rois_batch_size];
+    PADDLE_ENFORCE_EQ(num_rois, rois_num_with_lod,
+                      "The rois_num from input and lod must be the same.");
+
+    for (int img = 0; img < rois_batch_size; ++img) {
+      const size_t last = rois_lod[img + 1];
+      for (size_t r = rois_lod[img]; r < last; ++r) {
+        host_roi_ids[r] = img;
+      }
+    }
+
+    // Stage the table in temporary device memory on the kernel's stream.
+    auto& allocator =
+        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
+    int bytes = roi_batch_id_list.numel() * sizeof(int);
+    auto roi_ptr = allocator.Allocate(bytes);
+    int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
+    const auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+    memory::Copy(gplace, roi_id_data, cplace, host_roi_ids, bytes,
+                 dev_ctx.stream());
+
+    DeformablePSROIPoolBackwardAccKernel<<<GET_BLOCKS(count), CUDA_NUM_THREADS,
+                                           0, dev_ctx.stream()>>>(
+        count, top_diff, top_count_data, num_rois,
+        static_cast<T>(spatial_scale), channels, height, width, pooled_height,
+        pooled_width, output_dim, bottom_data_diff, bottom_trans_diff,
+        bottom_data, bottom_rois, bottom_trans, no_trans,
+        static_cast<T>(trans_std), sample_per_part, group_height, group_width,
+        part_height, part_width, num_classes, channels_each_class,
+        roi_id_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+// Shorthand used by the registration macros below.
+namespace ops = paddle::operators;
+using CUDA = paddle::platform::CUDADeviceContext;
+// Forward op: CUDA kernels instantiated for float and double.
+REGISTER_OP_CUDA_KERNEL(deformable_psroi_pooling,
+                        ops::DeformablePSROIPoolCUDAKernel<CUDA, float>,
+                        ops::DeformablePSROIPoolCUDAKernel<CUDA, double>);
+// Gradient op: CUDA kernels instantiated for float and double.
+REGISTER_OP_CUDA_KERNEL(deformable_psroi_pooling_grad,
+                        ops::DeformablePSROIPoolGradCUDAKernel<CUDA, float>,
+                        ops::DeformablePSROIPoolGradCUDAKernel<CUDA, double>);
--- a/paddle/fluid/operators/deformable_psroi_pooling_op.h
+++ b/paddle/fluid/operators/deformable_psroi_pooling_op.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <algorithm>
+#include <iostream>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+// Short aliases for the framework tensor types used by the kernels below.
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+// Bilinearly interpolates a row-major single-channel plane at (x, y).
+//
+// `data` points at the plane and `width` is its row stride. The four values
+// at the floor/ceil corners of (x, y) are read without any bounds check, so
+// the caller must pass coordinates that already lie inside the plane.
+// `height` is not read by this function.
+template <typename T>
+T bilinear_interp(const T* data, const T x, const T y, const int width,
+                  const int height) {
+  const int left = floor(x);
+  const int right = ceil(x);
+  const int top = floor(y);
+  const int bottom = ceil(y);
+
+  // Fractional position inside the cell spanned by the four corners.
+  const T frac_x = static_cast<T>(x - left);
+  const T frac_y = static_cast<T>(y - top);
+
+  const T* top_row = data + top * width;
+  const T* bottom_row = data + bottom * width;
+
+  // Accumulate the four weighted corners in a fixed order.
+  T result = (1 - frac_x) * (1 - frac_y) * top_row[left];
+  result += (1 - frac_x) * frac_y * bottom_row[left];
+  result += frac_x * (1 - frac_y) * top_row[right];
+  result += frac_x * frac_y * bottom_row[right];
+  return result;
+}
+
+// CPU forward pass of deformable PSROI pooling.
+//
+// Iterates over `count` flat output indices; each index is decomposed as
+// (roi n, output channel ctop, bin row ph, bin column pw). For every bin the
+// function samples a sample_per_part x sample_per_part grid of points, reads
+// each in-range point with bilinear_interp, and writes
+//   top_data[ix]  = mean of the sampled values (0 when nothing was sampled)
+//   top_count[ix] = number of points that were sampled.
+//
+// bottom_rois is read as 4 values per ROI, used in the order
+// (start_w, start_h, end_w, end_h). bottom_trans is only dereferenced when
+// no_trans is false. roi_batch_id_data[n] selects which image of bottom_data
+// ROI n reads from. batch_size and rois are not read in this function.
+template <typename T>
+void DeformablePSROIPoolForwardCPUKernel(
+    const int count, const T* bottom_data, const T spatial_scale,
+    const int channels, const int height, const int width,
+    const int pooled_height, const int pooled_width, const T* bottom_rois,
+    const T* bottom_trans, const bool no_trans, const float trans_std,
+    const int sample_per_part, const int output_dim, const int group_height,
+    const int group_width, const int part_height, const int part_width,
+    const int num_classes, const int channels_each_class, T* top_data,
+    T* top_count, const int batch_size, int* roi_batch_id_data,
+    const LoDTensor* rois) {
+  for (int ix = 0; ix < count; ix++) {
+    // Unflatten ix into (n, ctop, ph, pw), with pw varying fastest.
+    int pw = ix % pooled_width;
+    int ph = (ix / pooled_width) % pooled_height;
+    int ctop = (ix / pooled_width / pooled_height) % output_dim;
+    int n = ix / pooled_width / pooled_height / output_dim;
+    const T* offset_bottom_rois = bottom_rois + n * 4;
+
+    // ROI corners are rounded, scaled onto the feature map and shifted by
+    // half a pixel; the end corner is made inclusive by the +1.
+    int roi_batch_ind = roi_batch_id_data[n];
+    T roi_start_w =
+        static_cast<T>(round(offset_bottom_rois[0])) * spatial_scale - 0.5;
+    T roi_start_h =
+        static_cast<T>(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
+    T roi_end_w =
+        static_cast<T>(round(offset_bottom_rois[2]) + 1.) * spatial_scale - 0.5;
+    T roi_end_h =
+        static_cast<T>(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
+
+    //  width and height of roi (clamped below at 0.1)
+    T roi_width = std::max(roi_end_w - roi_start_w, T(0.1));
+    T roi_height = std::max(roi_end_h - roi_start_h, T(0.1));
+
+    //  width and height of each bin
+    T bin_size_h = roi_height / static_cast<T>(pooled_height);
+    T bin_size_w = roi_width / static_cast<T>(pooled_width);
+
+    //  sampling interval in each bin
+    T sub_bin_size_h = bin_size_h / static_cast<T>(sample_per_part);
+    T sub_bin_size_w = bin_size_w / static_cast<T>(sample_per_part);
+
+    //  obtain offset of roi: map the bin onto the part grid of bottom_trans
+    int part_h = floor(static_cast<T>(ph) / pooled_height * part_height);
+    int part_w = floor(static_cast<T>(pw) / pooled_width * part_width);
+    int class_id = ctop / channels_each_class;
+
+    // bottom_trans is indexed as [n][class_id][x=0 | y=1][part_h][part_w];
+    // the stored value is scaled by trans_std.
+    T trans_x =
+        no_trans
+            ? static_cast<T>(0)
+            : bottom_trans[(((n * num_classes + class_id) * 2) * part_height +
+                            part_h) *
+                               part_width +
+                           part_w] *
+                  static_cast<T>(trans_std);
+    T trans_y = no_trans
+                    ? static_cast<T>(0)
+                    : bottom_trans[(((n * num_classes + class_id) * 2 + 1) *
+                                        part_height +
+                                    part_h) *
+                                       part_width +
+                                   part_w] *
+                          static_cast<T>(trans_std);
+
+    //  location of start after adding offset (offset is relative to roi size)
+    T wstart = static_cast<T>(pw) * bin_size_w + roi_start_w;
+    wstart += trans_x * roi_width;
+    T hstart = static_cast<T>(ph) * bin_size_h + roi_start_h;
+    hstart += trans_y * roi_height;
+    T sum = 0;
+    int num_sample = 0;
+    // Group cell of this bin, clamped into [0, group_* - 1]; it selects the
+    // input channel below.
+    int gw = floor(static_cast<T>(pw) * group_width / pooled_width);
+    int gh = floor(static_cast<T>(ph) * group_height / pooled_height);
+    gw = std::min(std::max(gw, 0), group_width - 1);
+    gh = std::min(std::max(gh, 0), group_height - 1);
+    const T* offset_bottom_data =
+        bottom_data + (roi_batch_ind * channels) * height * width;
+
+    //  sampling in each bin
+    for (int ih = 0; ih < sample_per_part; ih++) {
+      for (int iw = 0; iw < sample_per_part; iw++) {
+        T w = wstart + iw * sub_bin_size_w;
+        T h = hstart + ih * sub_bin_size_h;
+        // Points more than half a pixel outside the feature map are skipped
+        // and do not count towards num_sample.
+        if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) {
+          continue;
+        }
+        // Remaining points are clamped onto valid pixel coordinates.
+        w = std::min(std::max(w, T(0.)), T(width - 1.));
+        h = std::min(std::max(h, T(0.)), height - T(1.));
+        int c = (ctop * group_height + gh) * group_width + gw;
+        // bilinear interpolation to get value
+        T val = bilinear_interp(offset_bottom_data + c * height * width, w, h,
+                                width, height);
+        sum += val;
+        num_sample++;
+      }
+    }
+    // Average over the points that were actually sampled.
+    top_data[ix] = num_sample == 0 ? static_cast<T>(0) : sum / num_sample;
+    top_count[ix] = num_sample;
+  }
+}
+
+// CPU forward kernel wrapper for the deformable_psroi_pooling op.
+//
+// Allocates and zero-fills Output and TopCount, validates that the ROIs' LoD
+// agrees with the input batch size and ROI count, builds the per-ROI image
+// index table and then runs DeformablePSROIPoolForwardCPUKernel.
+template <typename DeviceContext, typename T>
+class DeformablePSROIPoolCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const Tensor* feat = ctx.Input<Tensor>("Input");
+    const LoDTensor* roi_tensor = ctx.Input<LoDTensor>("ROIs");
+    const Tensor* offset_tensor = ctx.Input<Tensor>("Trans");
+    Tensor* pooled = ctx.Output<Tensor>("Output");
+    pooled->mutable_data<T>(ctx.GetPlace());
+    Tensor* sample_count = ctx.Output<Tensor>("TopCount");
+    sample_count->mutable_data<T>(ctx.GetPlace());
+
+    // Both outputs start at zero before the pooling loop fills them in.
+    math::SetConstant<DeviceContext, T> fill;
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    fill(dev_ctx, pooled, static_cast<T>(0));
+    fill(dev_ctx, sample_count, static_cast<T>(0));
+
+    const int num_rois = roi_tensor->dims()[0];
+    PADDLE_ENFORCE_EQ(num_rois, pooled->dims()[0],
+                      "number of rois should be same with number of output");
+
+    // Table mapping each ROI to the image it belongs to.
+    framework::Tensor roi_batch_id_list;
+    roi_batch_id_list.Resize({num_rois});
+    int* roi_to_image = roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
+
+    // Operator attributes.
+    const auto no_trans = ctx.Attr<bool>("no_trans");
+    const auto spatial_scale = ctx.Attr<float>("spatial_scale");
+    const auto output_dim = ctx.Attr<int>("output_dim");
+    const auto group_size = ctx.Attr<std::vector<int>>("group_size");
+    const auto group_height = group_size[0];
+    const auto group_width = group_size[1];
+    const auto pooled_height = ctx.Attr<int>("pooled_height");
+    const auto pooled_width = ctx.Attr<int>("pooled_width");
+    const auto part_size = ctx.Attr<std::vector<int>>("part_size");
+    const auto part_height = part_size[0];
+    const auto part_width = part_size[1];
+    const auto sample_per_part = ctx.Attr<int>("sample_per_part");
+    const auto trans_std = ctx.Attr<float>("trans_std");
+
+    // Shapes: Input is indexed as [batch, channels, height, width].
+    const auto& feat_dims = feat->dims();
+    const int batch = static_cast<int>(feat_dims[0]);
+    const int channels = static_cast<int>(feat_dims[1]);
+    const int height = static_cast<int>(feat_dims[2]);
+    const int width = static_cast<int>(feat_dims[3]);
+    const int channels_trans = no_trans ? 2 : offset_tensor->dims()[1];
+    const int count = num_rois * output_dim * pooled_height * pooled_width;
+    const int num_classes = no_trans ? 1 : channels_trans / 2;
+    const int channels_each_class =
+        no_trans ? output_dim : output_dim / num_classes;
+    PADDLE_ENFORCE(channels_each_class >= 1,
+                   "channels_each must greater than 1");
+
+    const T* bottom_data = feat->data<T>();
+    const T* bottom_rois = roi_tensor->data<T>();
+    const T* bottom_trans = no_trans ? nullptr : offset_tensor->data<T>();
+
+    T* top_data = pooled->mutable_data<T>(ctx.GetPlace());
+    T* top_count_data = sample_count->mutable_data<T>(ctx.GetPlace());
+
+    // The last LoD level partitions the ROIs by image.
+    auto rois_lod = roi_tensor->lod().back();
+    int rois_batch_size = rois_lod.size() - 1;
+    PADDLE_ENFORCE_EQ(rois_batch_size, batch,
+                      "The rois_batch_size must equal to batch_size of img.");
+    int rois_num_with_lod = rois_lod[rois_batch_size];
+    PADDLE_ENFORCE_EQ(num_rois, rois_num_with_lod,
+                      "The rois_num from input and lod must be the same.");
+    for (int img = 0; img < rois_batch_size; ++img) {
+      const size_t last = rois_lod[img + 1];
+      for (size_t r = rois_lod[img]; r < last; ++r) {
+        roi_to_image[r] = img;
+      }
+    }
+
+    DeformablePSROIPoolForwardCPUKernel(
+        count, bottom_data, static_cast<T>(spatial_scale), channels, height,
+        width, pooled_height, pooled_width, bottom_rois, bottom_trans,
+        no_trans, trans_std, sample_per_part, output_dim, group_height,
+        group_width, part_height, part_width, num_classes,
+        channels_each_class, top_data, top_count_data, batch, roi_to_image,
+        roi_tensor);
+  }
+};
+
+// CPU backward pass of deformable PSROI pooling.
+//
+// Iterates over `count` flat output indices, decomposed as
+// (roi n, output channel ctop, bin row ph, bin column pw), and re-generates
+// the same sampling points as the forward pass. For every sampled point it
+// accumulates
+//   - into bottom_data_diff (if non-NULL): top_diff / top_count spread over
+//     the four neighbouring pixels with bilinear weights;
+//   - into bottom_trans_diff (if non-NULL and !no_trans): the derivative of
+//     the interpolated value with respect to the x / y offset.
+// Both gradient buffers are added to, not overwritten, so the caller must
+// zero them beforehand. Bins whose top_count is <= 0 contribute nothing.
+//
+// bottom_rois is read as 4 values per ROI, used in the order
+// (start_w, start_h, end_w, end_h). roi_batch_id_data[n] selects the image
+// ROI n belongs to. num_rois, batch_size and rois are not read here.
+template <typename T>
+void DeformablePSROIPoolBackwardAccCPUKernel(
+    const int count, const T* top_diff, const T* top_count, const int num_rois,
+    const T spatial_scale, const int channels, const int height,
+    const int width, const int pooled_height, const int pooled_width,
+    const int output_dim, T* bottom_data_diff, T* bottom_trans_diff,
+    const T* bottom_data, const T* bottom_rois, const T* bottom_trans,
+    const bool no_trans, const float trans_std, const int sample_per_part,
+    const int group_height, const int group_width, const int part_height,
+    const int part_width, const int num_classes, const int channels_each_class,
+    const int batch_size, int* roi_batch_id_data, const LoDTensor* rois) {
+  for (int index = 0; index < count; index++) {
+    // Unflatten index into (n, ctop, ph, pw), with pw varying fastest.
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int ctop = (index / pooled_width / pooled_height) % output_dim;
+    int n = index / pooled_width / pooled_height / output_dim;
+
+    //  location of roi on feature map
+    const T* offset_bottom_rois = bottom_rois + n * 4;
+    int roi_batch_ind = roi_batch_id_data[n];
+    T roi_start_w =
+        static_cast<T>(round(offset_bottom_rois[0])) * spatial_scale - 0.5;
+    T roi_start_h =
+        static_cast<T>(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
+    T roi_end_w =
+        static_cast<T>(round(offset_bottom_rois[2]) + 1.) * spatial_scale - 0.5;
+    T roi_end_h =
+        static_cast<T>(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
+
+    //  width and height of roi (clamped below at 0.1)
+    T roi_width = std::max(roi_end_w - roi_start_w, T(0.1));
+    T roi_height = std::max(roi_end_h - roi_start_h, T(0.1));
+
+    //  width and height of each bin
+    T bin_size_h = roi_height / static_cast<T>(pooled_height);
+    T bin_size_w = roi_width / static_cast<T>(pooled_width);
+
+    //  sampling interval in each bin
+    T sub_bin_size_h = bin_size_h / static_cast<T>(sample_per_part);
+    T sub_bin_size_w = bin_size_w / static_cast<T>(sample_per_part);
+
+    //  obtain offset of roi: map the bin onto the part grid of bottom_trans.
+    //  The column index must be scaled by part_width (as in the forward
+    //  pass); scaling it by part_height selects the wrong cell whenever the
+    //  part grid is not square.
+    int part_h = floor(static_cast<T>(ph) / pooled_height * part_height);
+    int part_w = floor(static_cast<T>(pw) / pooled_width * part_width);
+    int class_id = ctop / channels_each_class;
+
+    // bottom_trans is indexed as [n][class_id][x=0 | y=1][part_h][part_w];
+    // the stored value is scaled by trans_std.
+    T trans_x =
+        no_trans
+            ? static_cast<T>(0)
+            : bottom_trans[(((n * num_classes + class_id) * 2) * part_height +
+                            part_h) *
+                               part_width +
+                           part_w] *
+                  static_cast<T>(trans_std);
+    T trans_y = no_trans
+                    ? static_cast<T>(0)
+                    : bottom_trans[(((n * num_classes + class_id) * 2 + 1) *
+                                        part_height +
+                                    part_h) *
+                                       part_width +
+                                   part_w] *
+                          static_cast<T>(trans_std);
+
+    //  location of start after adding offset
+    T wstart = static_cast<T>(pw) * bin_size_w + roi_start_w;
+    wstart += trans_x * roi_width;
+    T hstart = static_cast<T>(ph) * bin_size_h + roi_start_h;
+    hstart += trans_y * roi_height;
+
+    // Nothing was sampled for this bin in the forward pass: no gradient.
+    if (top_count[index] <= 0) {
+      continue;
+    }
+
+    // The forward output is a mean, so each sample gets an equal share.
+    T diff_val = top_diff[index] / top_count[index];
+    const T* offset_bottom_data =
+        bottom_data + roi_batch_ind * channels * height * width;
+    // Group cell of this bin, clamped into [0, group_* - 1]; it selects the
+    // input channel below.
+    int gw = floor(static_cast<T>(pw) * group_width / pooled_width);
+    int gh = floor(static_cast<T>(ph) * group_height / pooled_height);
+    gw = std::min(std::max(gw, 0), group_width - 1);
+    gh = std::min(std::max(gh, 0), group_height - 1);
+
+    //  sampling in each bin
+    for (int ih = 0; ih < sample_per_part; ih++) {
+      for (int iw = 0; iw < sample_per_part; iw++) {
+        T w = wstart + iw * sub_bin_size_w;
+        T h = hstart + ih * sub_bin_size_h;
+        // Points more than half a pixel outside the feature map are skipped.
+        if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) {
+          continue;
+        }
+        // Remaining points are clamped onto valid pixel coordinates.
+        w = std::min(std::max(w, T(0.)), T(width - 1.));
+        h = std::min(std::max(h, T(0.)), T(height - 1.));
+        int c = (ctop * group_height + gh) * group_width + gw;
+        int x0 = floor(w);
+        int x1 = ceil(w);
+        int y0 = floor(h);
+        int y1 = ceil(h);
+
+        //  compute coefficient of gradient (bilinear weights of the corners)
+        T dist_x = w - x0, dist_y = h - y0;
+        T q00 = (1 - dist_x) * (1 - dist_y);
+        T q01 = (1 - dist_x) * dist_y;
+        T q10 = dist_x * (1 - dist_y);
+        T q11 = dist_x * dist_y;
+        int bottom_index_base = c * height * width;
+
+        //  compute gradient of input
+        if (bottom_data_diff != NULL) {
+          T* offset_bottom_data_diff_addr00 =
+              bottom_data_diff + roi_batch_ind * channels * height * width +
+              bottom_index_base + y0 * width + x0;
+          T* offset_bottom_data_diff_addr01 =
+              bottom_data_diff + roi_batch_ind * channels * height * width +
+              bottom_index_base + y1 * width + x0;
+          T* offset_bottom_data_diff_addr10 =
+              bottom_data_diff + roi_batch_ind * channels * height * width +
+              bottom_index_base + y0 * width + x1;
+          T* offset_bottom_data_diff_addr11 =
+              bottom_data_diff + roi_batch_ind * channels * height * width +
+              bottom_index_base + y1 * width + x1;
+          *offset_bottom_data_diff_addr00 =
+              *offset_bottom_data_diff_addr00 + q00 * diff_val;
+          *offset_bottom_data_diff_addr01 =
+              *offset_bottom_data_diff_addr01 + q01 * diff_val;
+          *offset_bottom_data_diff_addr10 =
+              *offset_bottom_data_diff_addr10 + q10 * diff_val;
+          *offset_bottom_data_diff_addr11 =
+              *offset_bottom_data_diff_addr11 + q11 * diff_val;
+        }
+
+        //  compute gradient of trans
+        if (no_trans || bottom_trans_diff == NULL) {
+          continue;
+        }
+
+        // Input values at the four corners around the sampling point.
+        T u00 = offset_bottom_data[bottom_index_base + y0 * width + x0];
+        T u01 = offset_bottom_data[bottom_index_base + y1 * width + x0];
+        T u10 = offset_bottom_data[bottom_index_base + y0 * width + x1];
+        T u11 = offset_bottom_data[bottom_index_base + y1 * width + x1];
+
+        // Derivative of the interpolated value along x / y, chained through
+        // the trans_std and roi-size scaling applied to the offset.
+        T diff_x = (u11 * dist_y + u10 * (1 - dist_y) - u01 * dist_y -
+                    u00 * (1 - dist_y)) *
+                   trans_std * diff_val;
+        diff_x *= roi_width;
+        T diff_y = (u11 * dist_x + u01 * (1 - dist_x) - u10 * dist_x -
+                    u00 * (1 - dist_x)) *
+                   trans_std * diff_val;
+        diff_y *= roi_height;
+        T* offset_bottom_trans_diff_x =
+            bottom_trans_diff +
+            (((n * num_classes + class_id) * 2) * part_height + part_h) *
+                part_width +
+            part_w;
+        T* offset_bottom_trans_diff_y =
+            bottom_trans_diff +
+            (((n * num_classes + class_id) * 2 + 1) * part_height + part_h) *
+                part_width +
+            part_w;
+
+        *offset_bottom_trans_diff_x = *offset_bottom_trans_diff_x + diff_x;
+        *offset_bottom_trans_diff_y = *offset_bottom_trans_diff_y + diff_y;
+      }
+    }
+  }
+}
+
+// CPU gradient kernel wrapper for the deformable_psroi_pooling op.
+//
+// Zero-fills whichever of Input@GRAD / Trans@GRAD are requested, validates
+// the ROIs' LoD against the input batch size and ROI count, builds the
+// per-ROI image index table and runs DeformablePSROIPoolBackwardAccCPUKernel,
+// which accumulates into the zeroed gradient buffers.
+template <typename DeviceContext, typename T>
+class DeformablePSROIPoolGradCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<Tensor>("Input");
+    auto* rois = ctx.Input<LoDTensor>("ROIs");
+    auto* trans = ctx.Input<Tensor>("Trans");
+    auto* top_count = ctx.Input<Tensor>("TopCount");
+    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
+    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
+    math::SetConstant<DeviceContext, T> set_zero;
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    // The accumulation kernel adds into the gradients, so clear them first.
+    if (input_grad) {
+      input_grad->mutable_data<T>(ctx.GetPlace());
+      set_zero(dev_ctx, input_grad, static_cast<T>(.0));
+    }
+    auto* trans_grad = ctx.Output<Tensor>(framework::GradVarName("Trans"));
+    if (trans_grad) {
+      trans_grad->mutable_data<T>(ctx.GetPlace());
+      set_zero(dev_ctx, trans_grad, static_cast<T>(.0));
+    }
+    auto no_trans = ctx.Attr<bool>("no_trans");
+    auto spatial_scale = ctx.Attr<float>("spatial_scale");
+    auto output_dim = ctx.Attr<int>("output_dim");
+    auto group_size = ctx.Attr<std::vector<int>>("group_size");
+    auto group_height = group_size[0];
+    auto group_width = group_size[1];
+    auto pooled_height = ctx.Attr<int>("pooled_height");
+    auto pooled_width = ctx.Attr<int>("pooled_width");
+    auto part_size = ctx.Attr<std::vector<int>>("part_size");
+    auto part_height = part_size[0];
+    auto part_width = part_size[1];
+    auto sample_per_part = ctx.Attr<int>("sample_per_part");
+    auto trans_std = ctx.Attr<float>("trans_std");
+
+    // Shapes: Input is indexed as [batch, channels, height, width].
+    const int batch = static_cast<int>(input->dims()[0]);
+    const int channels = static_cast<int>(input->dims()[1]);
+    const int height = static_cast<int>(input->dims()[2]);
+    const int width = static_cast<int>(input->dims()[3]);
+    const int channels_trans = no_trans ? 2 : trans->dims()[1];
+    const int num_rois = rois->dims()[0];
+    // One loop iteration per element of the pooled output.
+    const int count = num_rois * output_dim * pooled_height * pooled_width;
+    const int num_classes = no_trans ? 1 : channels_trans / 2;
+    const int channels_each_class =
+        no_trans ? output_dim : output_dim / num_classes;
+    // Table mapping each ROI to the image it belongs to.
+    Tensor roi_batch_id_list;
+    roi_batch_id_list.Resize({num_rois});
+    int* roi_batch_id_data =
+        roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
+
+    const T* top_diff = output_grad->data<T>();
+    const T* bottom_data = input->data<T>();
+    const T* bottom_rois = rois->data<T>();
+    const T* bottom_trans = no_trans ? NULL : trans->data<T>();
+
+    T* bottom_data_diff = NULL;
+    T* bottom_trans_diff = NULL;
+    if (input_grad) {
+      bottom_data_diff = input_grad->mutable_data<T>(ctx.GetPlace());
+    }
+    // The offset gradient is only produced when offsets are actually used.
+    if (trans_grad) {
+      bottom_trans_diff =
+          no_trans ? NULL : trans_grad->mutable_data<T>(ctx.GetPlace());
+    }
+
+    const T* top_count_data = top_count->data<T>();
+    // The last LoD level partitions the ROIs by image.
+    auto rois_lod = rois->lod().back();
+    int rois_batch_size = rois_lod.size() - 1;
+    // Same check as the forward CPU kernel and the CUDA gradient kernel:
+    // without it a LoD describing more images than Input holds would produce
+    // out-of-range image indices in the accumulation kernel.
+    PADDLE_ENFORCE_EQ(
+        rois_batch_size, batch,
+        "The rois_batch_size and imgs batch_size must be the same.");
+    int rois_num_with_lod = rois_lod[rois_batch_size];
+    PADDLE_ENFORCE_EQ(num_rois, rois_num_with_lod,
+                      "The rois_num from input and lod must be the same.");
+    for (int n = 0; n < rois_batch_size; ++n) {
+      for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+        roi_batch_id_data[i] = n;
+      }
+    }
+
+    DeformablePSROIPoolBackwardAccCPUKernel(
+        count, top_diff, top_count_data, num_rois, (T)spatial_scale, channels,
+        height, width, pooled_height, pooled_width, output_dim,
+        bottom_data_diff, bottom_trans_diff, bottom_data, bottom_rois,
+        bottom_trans, no_trans, (T)trans_std, sample_per_part, group_height,
+        group_width, part_height, part_width, num_classes, channels_each_class,
+        batch, roi_batch_id_data, rois);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -203,6 +203,7 @@ __all__ = [
    'where',
    'sign',
    'deformable_conv',
+    'deformable_roi_pooling',
 ]

 kIgnoreIndex = -100
@@ -12088,3 +12089,117 @@ def deformable_conv(input,

    output = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
    return output
+
+
+def deformable_roi_pooling(input,
+                           rois,
+                           trans,
+                           no_trans=False,
+                           spatial_scale=1.0,
+                           group_size=[1, 1],
+                           pooled_height=1,
+                           pooled_width=1,
+                           part_size=None,
+                           sample_per_part=1,
+                           trans_std=0.1,
+                           position_sensitive=False,
+                           name=None):
+    """
+    Deformable PSROI Pooling Layer
+    
+    Args:
+       input (Variable):The input of Deformable PSROIPooling.The shape of input tensor is 
+                        [N,C,H,W]. Where N is batch size,C is number of input channels,H 
+                        is height of the feature, and W is the width of the feature.
+       rois (Variable): ROIs (Regions of Interest) to pool over.It should be
+                        a 2-D LoDTensor of shape (num_rois, 4), the lod level
+                        is 1. Given as [[x1, y1, x2, y2], ...], (x1, y1) is
+                        the top left coordinates, and (x2, y2) is the bottom
+                        right coordinates.
+       trans (Variable): Offset of features on ROIs while pooling.The format is NCHW, where 
+                         N is number of ROIs, C is number of channels, which indicate the offset distance 
+                         in the x and y directions, H is pooled height, and W is pooled width.
+       no_trans (bool): Whether to add offset to get new value or not while roi pooling, which 
+                          value is True or False. Default: False.
+       spatial_scale (float): Ratio of input feature map height (or width) to raw image height (or width).
+                             Equals the reciprocal of total stride in convolutional layers, Default: 1.0.
+       group_size (list|tuple): The number of groups which input channels are divided.(eg.number of input channels 
+                         is k1*k2*(C+1), which k1 and k2 are group width and height and C+1 is number of output
+                         chanels. eg.(4, 6), which 4 is height of group and 6 is width of group. Default: [1, 1].
+       pooled_height (integer): The pooled output height. Default: 1.
+       pooled_width (integer): The pooled output width. Default: 1.
+       part_size (list|tuple): The height and width of offset, eg.(4, 6), which height is 4 and width is 6, Default: 
+                        if None, default value is [pooled_height, pooled_width].
+       sample_per_part (integer): The number of samples in each bin. Default: 1.
+       trans_std (float): Coefficient of offset. Default: 0.1.
+       position_sensitive (bool): Whether to choose deformable psroi pooling mode or not. Default: False.
+       name (str): Name of layer. Default: None.
+    Returns:
+        Variable: The tensor variable storing the deformable psroi pooling \
+                  result.
+
+
+    Examples:
+      .. code-block:: python
+
+        input = fluid.layers.data(name="input",
+                                  shape=[2, 192, 64, 64], 
+                                  dtype='float32', 
+                                  append_batch_size=False)                   
+        rois = fluid.layers.data(name="rois",
+                                 shape=[4],
+                                 dtype='float32', 
+                                 lod_level=1)
+        trans = fluid.layers.data(name="trans",
+                                  shape=[2, 384, 64, 64], 
+                                  dtype='float32', 
+                                  append_batch_size=False) 
+        x = fluid.layers.nn.deformable_roi_pooling(input=input, 
+                                                     rois=rois, 
+                                                     trans=trans, 
+                                                     no_trans=False,
+                                                     spatial_scale=1.0, 
+                                                     group_size=(1, 1),
+                                                     pooled_height=8,
+                                                     pooled_width=8,
+                                                     part_size=(8, 8),
+                                                     sample_per_part=4, 
+                                                     trans_std=0.1,
+                                                     position_sensitive=False)
+    """
+    # NOTE: the docstring above is intentionally left untouched; its hash is
+    # recorded in API.spec, so wording changes must be made together with
+    # that file.
+
+    input_channels = input.shape[1]
+    # Plain (non position-sensitive) pooling keeps every input channel.
+    # Position-sensitive pooling divides the input channels by the number of
+    # pooled bins (pooled_height * pooled_width) to get the output channels.
+    if not position_sensitive:
+        output_channels = input_channels
+    else:
+        # Floor division keeps the value an int on Python 3 too; true
+        # division would pass a float as the "output_dim" attribute.
+        output_channels = input_channels // pooled_height // pooled_width
+
+    # Without an explicit part_size the offset grid matches the pooled grid.
+    if part_size is None:
+        part_height = pooled_height
+        part_width = pooled_width
+        part_size = [part_height, part_width]
+    # Normalise both size arguments to two-element lists (ints or sequences
+    # are accepted by convert_to_list).
+    part_size = utils.convert_to_list(part_size, 2, 'part_size')
+    group_size = utils.convert_to_list(group_size, 2, 'group_size')
+    helper = LayerHelper('deformable_psroi_pooling', **locals())
+    dtype = helper.input_dtype()
+    output = helper.create_variable_for_type_inference(dtype)
+    # TopCount is an auxiliary output of the op; only `output` is returned.
+    top_count = helper.create_variable_for_type_inference(dtype='int32')
+    helper.append_op(
+        type="deformable_psroi_pooling",
+        inputs={"Input": input,
+                "ROIs": rois,
+                "Trans": trans},
+        outputs={"Output": output,
+                 "TopCount": top_count},
+        attrs={
+            "no_trans": no_trans,
+            "spatial_scale": spatial_scale,
+            "output_dim": output_channels,
+            "group_size": group_size,
+            "pooled_height": pooled_height,
+            "pooled_width": pooled_width,
+            "part_size": part_size,
+            "sample_per_part": sample_per_part,
+            "trans_std": trans_std
+        })
+    return output
--- a/python/paddle/fluid/tests/unittests/test_deformable_psroi_pooling.py
+++ b/python/paddle/fluid/tests/unittests/test_deformable_psroi_pooling.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def set_input(input, rois, trans):
+    """Pack the three op inputs into a dict keyed by the op's input slots."""
+    return {'Input': input, "ROIs": rois, "Trans": trans}
+
+
+def set_attrs(no_trans, spatial_scale, output_channels, group_size,
+              pooled_height, pooled_width, part_size, sample_per_part,
+              trans_std):
+    """Build the op attribute dict; output_channels is stored as 'output_dim'."""
+    return dict(
+        no_trans=no_trans,
+        spatial_scale=spatial_scale,
+        output_dim=output_channels,
+        group_size=group_size,
+        pooled_height=pooled_height,
+        pooled_width=pooled_width,
+        part_size=part_size,
+        sample_per_part=sample_per_part,
+        trans_std=trans_std)
+
+
+def set_outputs(output, top_count):
+    """Return the expected op outputs, both cast to float32."""
+    expected_output = output.astype('float32')
+    expected_top_count = top_count.astype('float32')
+    return {'Output': expected_output, 'TopCount': expected_top_count}
+
+
+class TestDeformablePSROIPoolOp(OpTest):
+    """Checks the deformable_psroi_pooling op against a NumPy reference.
+
+    Four configurations are defined (init_test_case1 .. init_test_case4).
+    Each start_testN call rebuilds self.inputs, self.attrs and self.outputs,
+    so after set_data() only the values produced by start_test4 remain for
+    test_check_output / test_check_grad.
+    """
+
+    def set_data(self):
+        # NOTE(review): every call below overwrites the previous case, so
+        # only case 4 is actually compared against the op. Kept as-is to
+        # avoid changing which checks run; confirm whether cases 1-3 were
+        # meant to be separate test classes.
+        self.start_test1()
+        self.start_test2()
+        self.start_test3()
+        self.start_test4()
+
+    def _start_case(self, init_test_case):
+        """Run one configuration and publish it as inputs/attrs/outputs.
+
+        Args:
+            init_test_case: bound init_test_caseN method that sets the
+                configuration attributes and draws the random input.
+        """
+        init_test_case()
+        self.make_rois()
+        self.calc_deformable_psroi_pooling()
+
+        # Column 0 of self.rois holds the batch index used by the reference;
+        # the op is fed only the four coordinates plus the LoD.
+        rois = (self.rois[:, 1:5], self.rois_lod)
+        self.inputs = set_input(self.input, rois, self.trans)
+        self.attrs = set_attrs(self.no_trans, self.spatial_scale,
+                               self.output_channels, self.group_size,
+                               self.pooled_height, self.pooled_width,
+                               self.part_size, self.sample_per_part,
+                               self.trans_std)
+        self.outputs = set_outputs(
+            self.out.astype('float32'), self.top_count.astype('float32'))
+
+    def start_test1(self):
+        self._start_case(self.init_test_case1)
+
+    def start_test2(self):
+        self._start_case(self.init_test_case2)
+
+    def start_test3(self):
+        self._start_case(self.init_test_case3)
+
+    def start_test4(self):
+        self._start_case(self.init_test_case4)
+
+    def _init_case(self, batch_size, no_trans, spatial_scale, pooled_height,
+                   pooled_width, part_size, sample_per_part, trans_std):
+        """Set the attributes of one configuration and draw a random input.
+
+        Channels (12), feature size (12 x 12), output_channels (12) and
+        group_size ([1, 1]) are shared by all four configurations.
+        """
+        self.batch_size = batch_size
+        self.channels = 3 * 2 * 2
+        self.height = 12
+        self.width = 12
+        self.input_dim = [
+            self.batch_size, self.channels, self.height, self.width
+        ]
+        self.no_trans = no_trans
+        self.spatial_scale = spatial_scale
+        self.output_channels = 12
+        self.group_size = [1, 1]
+        self.pooled_height = pooled_height
+        self.pooled_width = pooled_width
+        self.part_size = part_size
+        self.sample_per_part = sample_per_part
+        self.trans_std = trans_std
+        self.input = np.random.random(self.input_dim).astype('float32')
+
+    def init_test_case1(self):
+        self._init_case(
+            batch_size=3,
+            no_trans=False,
+            spatial_scale=1.0 / 4.0,
+            pooled_height=4,
+            pooled_width=4,
+            part_size=[4, 4],
+            sample_per_part=2,
+            trans_std=0.1)
+
+    def init_test_case2(self):
+        self._init_case(
+            batch_size=2,
+            no_trans=True,
+            spatial_scale=1.0 / 2.0,
+            pooled_height=7,
+            pooled_width=7,
+            part_size=[7, 7],
+            sample_per_part=4,
+            trans_std=0.1)
+
+    def init_test_case3(self):
+        self._init_case(
+            batch_size=2,
+            no_trans=False,
+            spatial_scale=1.0 / 4.0,
+            pooled_height=3,
+            pooled_width=3,
+            part_size=[3, 3],
+            sample_per_part=3,
+            trans_std=0.2)
+
+    def init_test_case4(self):
+        self._init_case(
+            batch_size=2,
+            no_trans=True,
+            spatial_scale=1.0 / 2.0,
+            pooled_height=6,
+            pooled_width=2,
+            part_size=[6, 6],
+            sample_per_part=6,
+            trans_std=0.4)
+
+    def make_rois(self):
+        """Draw random ROIs: image `bno` of the batch gets `bno + 1` boxes.
+
+        Sets self.rois (float32 rows of [batch_id, x1, y1, x2, y2]),
+        self.rois_lod (number of ROIs per image) and self.rois_num.
+        """
+        # Largest coordinate in un-scaled image space.
+        max_x = int(self.width // self.spatial_scale)
+        max_y = int(self.height // self.spatial_scale)
+        rois = []
+        self.rois_lod = [[]]
+        for bno in range(self.batch_size):
+            self.rois_lod[0].append(bno + 1)
+            for _ in range(bno + 1):
+                # randint's upper bound is exclusive, hence the "+ 1"; this
+                # is the documented replacement for the deprecated
+                # np.random.random_integers(low, high).
+                x_1 = np.random.randint(0, max_x - self.pooled_width + 1)
+                y_1 = np.random.randint(0, max_y - self.pooled_height + 1)
+                x_2 = np.random.randint(x_1 + self.pooled_width, max_x + 1)
+                y_2 = np.random.randint(y_1 + self.pooled_height, max_y + 1)
+                rois.append([bno, x_1, y_1, x_2, y_2])
+        self.rois_num = len(rois)
+        self.rois = np.array(rois).astype("float32")
+
+    def dmc_bilinear(self, data_im, p_h, p_w):
+        """Bilinearly interpolate the 2-D map data_im at (p_h, p_w).
+
+        Corner samples that fall outside the feature map contribute 0.
+        """
+        h_low = int(np.floor(p_h))
+        w_low = int(np.floor(p_w))
+        h_high = h_low + 1
+        w_high = w_low + 1
+        l_h = p_h - h_low
+        l_w = p_w - w_low
+        h_h = 1 - l_h
+        h_w = 1 - l_w
+        v_1 = 0
+        if h_low >= 0 and w_low >= 0:
+            v_1 = data_im[h_low, w_low]
+        v_2 = 0
+        if h_low >= 0 and w_high <= self.width - 1:
+            v_2 = data_im[h_low, w_high]
+        v_3 = 0
+        if h_high <= self.height - 1 and w_low >= 0:
+            v_3 = data_im[h_high, w_low]
+        v_4 = 0
+        if h_high <= self.height - 1 and w_high <= self.width - 1:
+            v_4 = data_im[h_high, w_high]
+        w_1, w_2, w_3, w_4 = h_h * h_w, h_h * l_w, l_h * h_w, l_h * l_w
+        val = w_1 * v_1 + w_2 * v_2 + w_3 * v_3 + w_4 * v_4
+        return val
+
+    def calc_deformable_psroi_pooling(self):
+        """NumPy reference of the forward pass.
+
+        Draws random offsets into self.trans and fills self.out (mean of the
+        valid samples per output bin) and self.top_count (number of valid
+        samples per output bin).
+        """
+        output_shape = (self.rois_num, self.output_channels, self.pooled_height,
+                        self.pooled_width)
+        self.out = np.zeros(output_shape)
+        self.trans = np.random.rand(self.rois_num, 2, self.part_size[0],
+                                    self.part_size[1]).astype('float32')
+        self.top_count = np.random.random((output_shape)).astype('float32')
+        count = self.rois_num * self.output_channels * self.pooled_height * self.pooled_width
+        for index in range(count):
+            # Decode the flat index into (roi, channel, bin row, bin column).
+            p_w = index % self.pooled_width
+            p_h = index // self.pooled_width % self.pooled_height
+            ctop = (index // self.pooled_width // self.pooled_height %
+                    self.output_channels)
+            n_out = (index // self.pooled_width // self.pooled_height //
+                     self.output_channels)
+            roi = self.rois[n_out]
+            roi_batch_id = int(roi[0])
+            roi_start_w = int(np.round(roi[1])) * self.spatial_scale - 0.5
+            roi_start_h = int(np.round(roi[2])) * self.spatial_scale - 0.5
+            roi_end_w = int(np.round(roi[3] + 1)) * self.spatial_scale - 0.5
+            roi_end_h = int(np.round(roi[4] + 1)) * self.spatial_scale - 0.5
+            # Degenerate ROIs are widened to at least 0.1.
+            roi_width = max(roi_end_w - roi_start_w, 0.1)
+            roi_height = max(roi_end_h - roi_start_h, 0.1)
+            bin_size_h = float(roi_height) / float(self.pooled_height)
+            bin_size_w = float(roi_width) / float(self.pooled_width)
+            sub_bin_size_h = bin_size_h / self.sample_per_part
+            sub_bin_size_w = bin_size_w / self.sample_per_part
+            # Cell of the offset grid (part_size) that this bin maps to.
+            part_h = int(np.floor(p_h) / self.pooled_height * self.part_size[0])
+            part_w = int(np.floor(p_w) / self.pooled_width * self.part_size[1])
+            if self.no_trans:
+                trans_x = 0
+                trans_y = 0
+            else:
+                trans_x = self.trans[n_out][0][part_h][part_w] * self.trans_std
+                trans_y = self.trans[n_out][1][part_h][part_w] * self.trans_std
+            wstart = p_w * bin_size_w + roi_start_w
+            wstart = wstart + trans_x * roi_width
+            hstart = p_h * bin_size_h + roi_start_h
+            hstart = hstart + trans_y * roi_height
+            acc = 0
+            num_sample = 0
+            # group_size is [group_height, group_width] (see c_sample below),
+            # so the column index pairs with group_size[1] / pooled_width and
+            # the row index with group_size[0] / pooled_height.
+            g_w = np.floor(p_w * self.group_size[1] / self.pooled_width)
+            g_h = np.floor(p_h * self.group_size[0] / self.pooled_height)
+            g_w = min(max(g_w, 0), self.group_size[1] - 1)
+            g_h = min(max(g_h, 0), self.group_size[0] - 1)
+            input_i = self.input[roi_batch_id]
+            for i_w in range(self.sample_per_part):
+                for i_h in range(self.sample_per_part):
+                    w_sample = wstart + i_w * sub_bin_size_w
+                    h_sample = hstart + i_h * sub_bin_size_h
+                    # Samples more than half a pixel outside the map are
+                    # skipped and not counted.
+                    if w_sample < -0.5 or w_sample > self.width - 0.5 or \
+                    h_sample < -0.5 or h_sample > self.height - 0.5:
+                        continue
+                    w_sample = min(max(w_sample, 0.), self.width - 1.)
+                    h_sample = min(max(h_sample, 0.), self.height - 1.)
+                    c_sample = int((ctop * self.group_size[0] + g_h) *
+                                   self.group_size[1] + g_w)
+                    val = self.dmc_bilinear(input_i[c_sample], h_sample,
+                                            w_sample)
+                    acc = acc + val
+                    num_sample = num_sample + 1
+            if num_sample == 0:
+                self.out[n_out][ctop][p_h][p_w] = 0
+            else:
+                self.out[n_out][ctop][p_h][p_w] = acc / num_sample
+            self.top_count[n_out][ctop][p_h][p_w] = num_sample
+
+    def setUp(self):
+        self.op_type = "deformable_psroi_pooling"
+        self.set_data()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        # Only the gradient with respect to 'Input' is checked.
+        self.check_grad(['Input'], 'Output')
+
+
+if __name__ == '__main__':  # run the test suite only when executed as a script
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -1989,6 +1989,35 @@ class TestBook(LayerTest):
                    padding=1)
                return (out)

+    def test_deform_roi_pooling(self):
+        """Build a deformable_roi_pooling layer in the default programs and
+        return its output variable."""
+        main_program = fluid.default_main_program()
+        startup_program = fluid.default_startup_program()
+        with program_guard(main_program, startup_program):
+            feature_map = layers.data(
+                name='input',
+                shape=[2, 3, 32, 32],
+                dtype='float32',
+                append_batch_size=False)
+            roi_boxes = layers.data(
+                name="rois", shape=[4], dtype='float32', lod_level=1)
+            offsets = layers.data(
+                name="trans",
+                shape=[2, 3, 32, 32],
+                dtype='float32',
+                append_batch_size=False)
+            pooled = layers.deformable_roi_pooling(
+                input=feature_map,
+                rois=roi_boxes,
+                trans=offsets,
+                no_trans=False,
+                spatial_scale=1.0,
+                group_size=(1, 1),
+                pooled_height=8,
+                pooled_width=8,
+                part_size=(8, 8),
+                sample_per_part=4,
+                trans_std=0.1)
+        return pooled
+

 if __name__ == '__main__':
    unittest.main()