Unverified commit 5b267474 authored by Guanghua Yu, committed by GitHub

add offset parameter in roi_align, generate_proposals, etc. ops (#30864)

* add offset parameter in roi_align op
Parent 75f81233
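The new pixel_offset attribute controls whether box extents keep the legacy "+1 pixel" convention used throughout these ops. A minimal NumPy sketch of the arithmetic the kernels below implement (standalone illustration, not a Paddle API):

import numpy as np

def box_wh(boxes, pixel_offset=True):
    # boxes: (N, 4) array of [xmin, ymin, xmax, ymax]
    offset = 1.0 if pixel_offset else 0.0
    w = boxes[:, 2] - boxes[:, 0] + offset
    h = boxes[:, 3] - boxes[:, 1] + offset
    return w, h

boxes = np.array([[0., 0., 3., 3.]])
print(box_wh(boxes, pixel_offset=True))   # (array([4.]), array([4.])) - legacy pixel convention
print(box_wh(boxes, pixel_offset=False))  # (array([3.]), array([3.])) - continuous coordinates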
......@@ -77,17 +77,20 @@ struct BoxDecodeAndClipFunctor {
const T *var;
const int *index;
const T *im_info;
const bool pixel_offset;
T *proposals;
BoxDecodeAndClipFunctor(const T *anchor, const T *deltas, const T *var,
const int *index, const T *im_info, T *proposals)
const int *index, const T *im_info, T *proposals,
bool pixel_offset = true)
: anchor(anchor),
deltas(deltas),
var(var),
index(index),
im_info(im_info),
proposals(proposals) {}
proposals(proposals),
pixel_offset(pixel_offset) {}
T bbox_clip_default{static_cast<T>(kBBoxClipDefault)};
......@@ -98,8 +101,9 @@ struct BoxDecodeAndClipFunctor {
T axmax = anchor[k + 2];
T aymax = anchor[k + 3];
T w = axmax - axmin + 1.0;
T h = aymax - aymin + 1.0;
T offset = pixel_offset ? static_cast<T>(1.0) : 0;
T w = axmax - axmin + offset;
T h = aymax - aymin + offset;
T cx = axmin + 0.5 * w;
T cy = aymin + 0.5 * h;
......@@ -123,13 +127,13 @@ struct BoxDecodeAndClipFunctor {
T oxmin = d_cx - d_w * 0.5;
T oymin = d_cy - d_h * 0.5;
T oxmax = d_cx + d_w * 0.5 - 1.;
T oymax = d_cy + d_h * 0.5 - 1.;
T oxmax = d_cx + d_w * 0.5 - offset;
T oymax = d_cy + d_h * 0.5 - offset;
proposals[i * 4] = Max(Min(oxmin, im_info[1] - 1.), 0.);
proposals[i * 4 + 1] = Max(Min(oymin, im_info[0] - 1.), 0.);
proposals[i * 4 + 2] = Max(Min(oxmax, im_info[1] - 1.), 0.);
proposals[i * 4 + 3] = Max(Min(oymax, im_info[0] - 1.), 0.);
proposals[i * 4] = Max(Min(oxmin, im_info[1] - offset), 0.);
proposals[i * 4 + 1] = Max(Min(oymin, im_info[0] - offset), 0.);
proposals[i * 4 + 2] = Max(Min(oxmax, im_info[1] - offset), 0.);
proposals[i * 4 + 3] = Max(Min(oymax, im_info[0] - offset), 0.);
}
__device__ __forceinline__ T Min(T a, T b) const { return a > b ? b : a; }
......@@ -141,7 +145,8 @@ template <typename T, int BlockSize>
static __global__ void FilterBBoxes(const T *bboxes, const T *im_info,
const T min_size, const int num,
int *keep_num, int *keep,
bool is_scale = true) {
bool is_scale = true,
bool pixel_offset = true) {
T im_h = im_info[0];
T im_w = im_info[1];
......@@ -157,9 +162,10 @@ static __global__ void FilterBBoxes(const T *bboxes, const T *im_info,
T ymin = bboxes[k + 1];
T xmax = bboxes[k + 2];
T ymax = bboxes[k + 3];
T w = xmax - xmin + 1.0;
T h = ymax - ymin + 1.0;
T offset = pixel_offset ? static_cast<T>(1.0) : 0;
T w = xmax - xmin + offset;
T h = ymax - ymin + offset;
if (pixel_offset) {
T cx = xmin + w / 2.;
T cy = ymin + h / 2.;
......@@ -171,6 +177,11 @@ static __global__ void FilterBBoxes(const T *bboxes, const T *im_info,
if (w >= min_size && h >= min_size && cx <= im_w && cy <= im_h) {
keep_index[threadIdx.x] = i;
}
} else {
if (w >= min_size && h >= min_size) {
keep_index[threadIdx.x] = i;
}
}
__syncthreads();
if (threadIdx.x == 0) {
int size = (num - i) < BlockSize ? num - i : BlockSize;
......@@ -187,19 +198,23 @@ static __global__ void FilterBBoxes(const T *bboxes, const T *im_info,
}
}
static __device__ float IoU(const float *a, const float *b) {
static __device__ float IoU(const float *a, const float *b,
const bool pixel_offset = true) {
float offset = pixel_offset ? static_cast<float>(1.0) : 0;
float left = max(a[0], b[0]), right = min(a[2], b[2]);
float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
float width = max(right - left + offset, 0.f),
height = max(bottom - top + offset, 0.f);
float inter_s = width * height;
float s_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
float s_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
float s_a = (a[2] - a[0] + offset) * (a[3] - a[1] + offset);
float s_b = (b[2] - b[0] + offset) * (b[3] - b[1] + offset);
return inter_s / (s_a + s_b - inter_s);
}
static __global__ void NMSKernel(const int n_boxes,
const float nms_overlap_thresh,
const float *dev_boxes, uint64_t *dev_mask) {
const float *dev_boxes, uint64_t *dev_mask,
bool pixel_offset = true) {
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
......@@ -231,7 +246,8 @@ static __global__ void NMSKernel(const int n_boxes,
start = threadIdx.x + 1;
}
for (i = start; i < col_size; i++) {
if (IoU(cur_box, block_boxes + i * 4) > nms_overlap_thresh) {
if (IoU(cur_box, block_boxes + i * 4, pixel_offset) >
nms_overlap_thresh) {
t |= 1ULL << i;
}
}
......@@ -243,7 +259,7 @@ static __global__ void NMSKernel(const int n_boxes,
template <typename T>
static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
const Tensor &sorted_indices, const T nms_threshold,
Tensor *keep_out) {
Tensor *keep_out, bool pixel_offset = true) {
int boxes_num = proposals.dims()[0];
const int col_blocks = DIVUP(boxes_num, kThreadsPerBlock);
dim3 blocks(DIVUP(boxes_num, kThreadsPerBlock),
......@@ -255,7 +271,8 @@ static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
framework::Vector<uint64_t> mask(boxes_num * col_blocks);
NMSKernel<<<blocks, threads>>>(boxes_num, nms_threshold, boxes,
mask.CUDAMutableData(BOOST_GET_CONST(
platform::CUDAPlace, ctx.GetPlace())));
platform::CUDAPlace, ctx.GetPlace())),
pixel_offset);
std::vector<uint64_t> remv(col_blocks);
memset(&remv[0], 0, sizeof(uint64_t) * col_blocks);
......
......@@ -31,7 +31,7 @@ struct RangeInitFunctor {
};
template <typename T>
inline HOSTDEVICE T RoIArea(const T* box, bool normalized) {
inline HOSTDEVICE T RoIArea(const T* box, bool pixel_offset = true) {
if (box[2] < box[0] || box[3] < box[1]) {
// If coordinate values are invalid
// (e.g. xmax < xmin or ymax < ymin), return 0.
......@@ -39,11 +39,11 @@ inline HOSTDEVICE T RoIArea(const T* box, bool normalized) {
} else {
const T w = box[2] - box[0];
const T h = box[3] - box[1];
if (normalized) {
return w * h;
} else {
if (pixel_offset) {
// If coordinate values are not within range [0, 1].
return (w + 1) * (h + 1);
} else {
return w * h;
}
}
}
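A Python rendering of the same area rule, including the guard for degenerate boxes (a sketch mirroring RoIArea above, not a Paddle API):

def roi_area(box, pixel_offset=True):
    # box: [xmin, ymin, xmax, ymax]
    if box[2] < box[0] or box[3] < box[1]:
        return 0.0  # invalid extents (xmax < xmin or ymax < ymin) contribute no area
    w, h = box[2] - box[0], box[3] - box[1]
    return (w + 1) * (h + 1) if pixel_offset else w * h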
......@@ -157,10 +157,12 @@ template <class T>
void ClipTiledBoxes(const platform::DeviceContext& ctx,
const framework::Tensor& im_info,
const framework::Tensor& input_boxes,
framework::Tensor* out, bool is_scale = true) {
framework::Tensor* out, bool is_scale = true,
bool pixel_offset = true) {
T* out_data = out->mutable_data<T>(ctx.GetPlace());
const T* im_info_data = im_info.data<T>();
const T* input_boxes_data = input_boxes.data<T>();
T offset = pixel_offset ? static_cast<T>(1.0) : 0;
T zero(0);
T im_w =
is_scale ? round(im_info_data[1] / im_info_data[2]) : im_info_data[1];
......@@ -168,13 +170,17 @@ void ClipTiledBoxes(const platform::DeviceContext& ctx,
is_scale ? round(im_info_data[0] / im_info_data[2]) : im_info_data[0];
for (int64_t i = 0; i < input_boxes.numel(); ++i) {
if (i % 4 == 0) {
out_data[i] = std::max(std::min(input_boxes_data[i], im_w - 1), zero);
out_data[i] =
std::max(std::min(input_boxes_data[i], im_w - offset), zero);
} else if (i % 4 == 1) {
out_data[i] = std::max(std::min(input_boxes_data[i], im_h - 1), zero);
out_data[i] =
std::max(std::min(input_boxes_data[i], im_h - offset), zero);
} else if (i % 4 == 2) {
out_data[i] = std::max(std::min(input_boxes_data[i], im_w - 1), zero);
out_data[i] =
std::max(std::min(input_boxes_data[i], im_w - offset), zero);
} else {
out_data[i] = std::max(std::min(input_boxes_data[i], im_h - 1), zero);
out_data[i] =
std::max(std::min(input_boxes_data[i], im_h - offset), zero);
}
}
}
......@@ -184,30 +190,36 @@ template <class T>
void FilterBoxes(const platform::DeviceContext& ctx,
const framework::Tensor* boxes, float min_size,
const framework::Tensor& im_info, bool is_scale,
framework::Tensor* keep) {
framework::Tensor* keep, bool pixel_offset = true) {
const T* im_info_data = im_info.data<T>();
const T* boxes_data = boxes->data<T>();
keep->Resize({boxes->dims()[0]});
min_size = std::max(min_size, 1.0f);
int* keep_data = keep->mutable_data<int>(ctx.GetPlace());
T offset = pixel_offset ? static_cast<T>(1.0) : 0;
int keep_len = 0;
for (int i = 0; i < boxes->dims()[0]; ++i) {
T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1;
T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1;
T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + offset;
T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + offset;
if (pixel_offset) {
T x_ctr = boxes_data[4 * i] + ws / 2;
T y_ctr = boxes_data[4 * i + 1] + hs / 2;
if (is_scale) {
ws = (boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_info_data[2] + 1;
hs =
(boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_info_data[2] + 1;
hs = (boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_info_data[2] +
1;
}
if (ws >= min_size && hs >= min_size && x_ctr <= im_info_data[1] &&
y_ctr <= im_info_data[0]) {
keep_data[keep_len++] = i;
}
} else {
if (ws >= min_size && hs >= min_size) {
keep_data[keep_len++] = i;
}
}
}
keep->Resize({keep_len});
}
......@@ -216,8 +228,8 @@ template <class T>
static void BoxCoder(const platform::DeviceContext& ctx,
framework::Tensor* all_anchors,
framework::Tensor* bbox_deltas,
framework::Tensor* variances,
framework::Tensor* proposals) {
framework::Tensor* variances, framework::Tensor* proposals,
const bool pixel_offset = true) {
T* proposals_data = proposals->mutable_data<T>(ctx.GetPlace());
int64_t row = all_anchors->dims()[0];
......@@ -230,9 +242,11 @@ static void BoxCoder(const platform::DeviceContext& ctx,
variances_data = variances->data<T>();
}
T offset = pixel_offset ? static_cast<T>(1.0) : 0;
for (int64_t i = 0; i < row; ++i) {
T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0;
T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1] + 1.0;
T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + offset;
T anchor_height =
anchor_data[i * len + 3] - anchor_data[i * len + 1] + offset;
T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width;
T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height;
......@@ -270,8 +284,8 @@ static void BoxCoder(const platform::DeviceContext& ctx,
proposals_data[i * len] = bbox_center_x - bbox_width / 2;
proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2;
proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1;
proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1;
proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - offset;
proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - offset;
}
// return proposals;
}
......
......@@ -103,6 +103,9 @@ class DistributeFpnProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<int>("refer_scale",
"The referring scale of FPN layer with"
" specified level");
AddAttr<bool>("pixel_offset", "(bool, default True),",
"If true, im_shape pixel offset is 1.")
.SetDefault(true);
AddComment(R"DOC(
This operator distributes all proposals into different FPN levels,
with respect to scale of the proposals, the referring scale and
......@@ -134,4 +137,8 @@ REGISTER_OP_VERSION(distribute_fpn_proposals)
.NewOutput("MultiLevelRoisNum",
"The RoIs' number of each image on multiple "
"levels. The number on each level has the shape of (B),"
"B is the number of images."));
"B is the number of images."))
.AddCheckpoint(
R"ROC(Register distribute_fpn_proposals for adding the attribute of pixel_offset)ROC",
paddle::framework::compatible::OpVersionDesc().NewAttr(
"pixel_offset", "If true, im_shape pixel offset is 1.", true));
......@@ -43,15 +43,15 @@ __global__ void GPUDistFpnProposalsHelper(
const int nthreads, const T* rois, const int lod_size,
const int refer_level, const int refer_scale, const int max_level,
const int min_level, int* roi_batch_id_data, int* sub_lod_list,
int* target_lvls) {
int* target_lvls, bool pixel_offset = true) {
CUDA_KERNEL_LOOP(i, nthreads) {
const T* offset_roi = rois + i * BBoxSize;
int roi_batch_ind = roi_batch_id_data[i];
// get the target level of current rois
T roi_area = RoIArea(offset_roi, false);
T roi_area = RoIArea(offset_roi, pixel_offset);
T roi_scale = sqrt(roi_area);
int tgt_lvl = floor(
log2(roi_scale / static_cast<T>(refer_scale) + (T)1e-6) + refer_level);
log2(roi_scale / static_cast<T>(refer_scale) + (T)1e-8) + refer_level);
tgt_lvl = min(max_level, max(tgt_lvl, min_level));
target_lvls[i] = tgt_lvl;
// compute number of rois in the same batch and same target level
......@@ -73,6 +73,7 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
const int max_level = ctx.Attr<int>("max_level");
const int refer_level = ctx.Attr<int>("refer_level");
const int refer_scale = ctx.Attr<int>("refer_scale");
const bool pixel_offset = ctx.Attr<bool>("pixel_offset");
int num_level = max_level - min_level + 1;
// check that the fpn_rois is not empty
......@@ -126,7 +127,7 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
GPUDistFpnProposalsHelper<T><<<dist_blocks, threads>>>(
roi_num, fpn_rois->data<T>(), lod_size, refer_level, refer_scale,
max_level, min_level, roi_batch_id_list_gpu.data<int>(),
sub_lod_list_data, target_lvls_data);
sub_lod_list_data, target_lvls_data, pixel_offset);
dev_ctx.Wait();
auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace());
......
......@@ -44,7 +44,7 @@ inline std::vector<size_t> GetLodFromRoisNum(const Tensor* rois_num) {
}
template <typename T>
static inline T BBoxArea(const T* box, bool normalized) {
static inline T BBoxArea(const T* box, bool pixel_offset) {
if (box[2] < box[0] || box[3] < box[1]) {
// If coordinate values are invalid
// (e.g. xmax < xmin or ymax < ymin), return 0.
......@@ -52,11 +52,11 @@ static inline T BBoxArea(const T* box, bool normalized) {
} else {
const T w = box[2] - box[0];
const T h = box[3] - box[1];
if (normalized) {
return w * h;
} else {
if (pixel_offset) {
// If coordinate values are not within range [0, 1].
return (w + 1) * (h + 1);
} else {
return w * h;
}
}
}
......@@ -77,6 +77,7 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
const int max_level = context.Attr<int>("max_level");
const int refer_level = context.Attr<int>("refer_level");
const int refer_scale = context.Attr<int>("refer_scale");
const bool pixel_offset = context.Attr<bool>("pixel_offset");
const int num_level = max_level - min_level + 1;
// check that the fpn_rois is not empty
......@@ -108,7 +109,7 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
const T* rois_data = fpn_rois_slice.data<T>();
for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) {
// get the target level of current rois
T roi_scale = std::sqrt(BBoxArea(rois_data, false));
T roi_scale = std::sqrt(BBoxArea(rois_data, pixel_offset));
int tgt_lvl = std::floor(std::log2(roi_scale / refer_scale + (T)1e-6) +
refer_level);
tgt_lvl = std::min(max_level, std::max(tgt_lvl, min_level));
......
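The target level computed above follows tgt_lvl = floor(log2(sqrt(area) / refer_scale + eps) + refer_level), clipped to [min_level, max_level]. A small NumPy sketch of that mapping (plain arithmetic, not the Paddle op; the kernels use a small eps of 1e-6 or 1e-8):

import numpy as np

def target_level(areas, refer_scale=224, refer_level=4, min_level=2, max_level=5, eps=1e-6):
    # areas: per-RoI areas, computed with or without the +1 pixel offset
    scale = np.sqrt(areas)
    lvl = np.floor(np.log2(scale / refer_scale + eps) + refer_level)
    return np.clip(lvl, min_level, max_level).astype(int)

print(target_level(np.array([224.0 ** 2])))  # [4]: an RoI at the reference scale maps to refer_level
print(target_level(np.array([112.0 ** 2])))  # [3]: halving the scale drops one level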
......@@ -87,6 +87,7 @@ class GenerateProposalsV2Kernel : public framework::OpKernel<T> {
float nms_thresh = context.Attr<float>("nms_thresh");
float min_size = context.Attr<float>("min_size");
float eta = context.Attr<float>("eta");
bool pixel_offset = context.Attr<bool>("pixel_offset");
auto &dev_ctx =
context.template device_context<platform::CPUDeviceContext>();
......@@ -134,10 +135,10 @@ class GenerateProposalsV2Kernel : public framework::OpKernel<T> {
bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4});
scores_slice.Resize({h_score * w_score * c_score, 1});
std::pair<Tensor, Tensor> tensor_pair =
ProposalForOneImage(dev_ctx, im_shape_slice, anchors, variances,
bbox_deltas_slice, scores_slice, pre_nms_top_n,
post_nms_top_n, nms_thresh, min_size, eta);
std::pair<Tensor, Tensor> tensor_pair = ProposalForOneImage(
dev_ctx, im_shape_slice, anchors, variances, bbox_deltas_slice,
scores_slice, pre_nms_top_n, post_nms_top_n, nms_thresh, min_size,
eta, pixel_offset);
Tensor &proposals = tensor_pair.first;
Tensor &scores = tensor_pair.second;
......@@ -168,7 +169,7 @@ class GenerateProposalsV2Kernel : public framework::OpKernel<T> {
const Tensor &bbox_deltas_slice, // [M, 4]
const Tensor &scores_slice, // [N, 1]
int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size,
float eta) const {
float eta, bool pixel_offset = true) const {
auto *scores_data = scores_slice.data<T>();
// Sort index
......@@ -203,12 +204,15 @@ class GenerateProposalsV2Kernel : public framework::OpKernel<T> {
Tensor proposals;
proposals.mutable_data<T>({index_t.numel(), 4}, ctx.GetPlace());
BoxCoder<T>(ctx, &anchor_sel, &bbox_sel, &var_sel, &proposals);
BoxCoder<T>(ctx, &anchor_sel, &bbox_sel, &var_sel, &proposals,
pixel_offset);
ClipTiledBoxes<T>(ctx, im_shape_slice, proposals, &proposals, false);
ClipTiledBoxes<T>(ctx, im_shape_slice, proposals, &proposals, false,
pixel_offset);
Tensor keep;
FilterBoxes<T>(ctx, &proposals, min_size, im_shape_slice, false, &keep);
FilterBoxes<T>(ctx, &proposals, min_size, im_shape_slice, false, &keep,
pixel_offset);
// Handle the case when there is no keep index left
if (keep.numel() == 0) {
math::SetConstant<platform::CPUDeviceContext, T> set_zero;
......@@ -229,7 +233,8 @@ class GenerateProposalsV2Kernel : public framework::OpKernel<T> {
return std::make_pair(bbox_sel, scores_filter);
}
Tensor keep_nms = NMS<T>(ctx, &bbox_sel, &scores_filter, nms_thresh, eta);
Tensor keep_nms =
NMS<T>(ctx, &bbox_sel, &scores_filter, nms_thresh, eta, pixel_offset);
if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) {
keep_nms.Resize({post_nms_top_n});
......@@ -280,6 +285,9 @@ class GenerateProposalsV2OpMaker : public framework::OpProtoAndCheckerMaker {
"Proposal height and width both need to be greater "
"than this min_size.");
AddAttr<float>("eta", "The parameter for adaptive NMS.");
AddAttr<bool>("pixel_offset", "(bool, default True),",
"If true, im_shape pixel offset is 1.")
.SetDefault(true);
AddComment(R"DOC(
This operator is the second version of generate_proposals op to generate
bounding box proposals for Faster RCNN.
......@@ -312,3 +320,8 @@ REGISTER_OPERATOR(
REGISTER_OP_CPU_KERNEL(generate_proposals_v2,
ops::GenerateProposalsV2Kernel<float>,
ops::GenerateProposalsV2Kernel<double>);
REGISTER_OP_VERSION(generate_proposals_v2)
.AddCheckpoint(
R"ROC(Registe generate_proposals_v2 for adding the attribute of pixel_offset)ROC",
paddle::framework::compatible::OpVersionDesc().NewAttr(
"pixel_offset", "If true, im_shape pixel offset is 1.", true));
......@@ -36,7 +36,7 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
const Tensor &bbox_deltas, // [M, 4]
const Tensor &scores, // [N, 1]
int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size,
float eta) {
float eta, bool pixel_offset) {
// 1. pre nms
Tensor scores_sort, index_sort;
SortDescending<T>(ctx, scores, &scores_sort, &index_sort);
......@@ -54,7 +54,8 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
platform::ForRange<platform::CUDADeviceContext> for_range(ctx, pre_nms_num);
for_range(BoxDecodeAndClipFunctor<T>{
anchors.data<T>(), bbox_deltas.data<T>(), variances.data<T>(),
index_sort.data<int>(), im_shape.data<T>(), proposals.data<T>()});
index_sort.data<int>(), im_shape.data<T>(), proposals.data<T>(),
pixel_offset});
}
// 3. filter
......@@ -65,7 +66,7 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
auto stream = ctx.stream();
FilterBBoxes<T, 512><<<1, 512, 0, stream>>>(
proposals.data<T>(), im_shape.data<T>(), min_size, pre_nms_num,
keep_num_t.data<int>(), keep_index.data<int>(), false);
keep_num_t.data<int>(), keep_index.data<int>(), false, pixel_offset);
int keep_num;
const auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace());
memory::Copy(platform::CPUPlace(), &keep_num, gpu_place,
......@@ -94,7 +95,8 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
// 4. nms
Tensor keep_nms;
NMS<T>(ctx, proposals_filter, keep_index, nms_thresh, &keep_nms);
NMS<T>(ctx, proposals_filter, keep_index, nms_thresh, &keep_nms,
pixel_offset);
if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) {
keep_nms.Resize({post_nms_top_n});
}
......@@ -129,6 +131,7 @@ class CUDAGenerateProposalsV2Kernel : public framework::OpKernel<T> {
float nms_thresh = context.Attr<float>("nms_thresh");
float min_size = context.Attr<float>("min_size");
float eta = context.Attr<float>("eta");
bool pixel_offset = context.Attr<bool>("pixel_offset");
PADDLE_ENFORCE_GE(eta, 1.,
platform::errors::InvalidArgument(
"Not support adaptive NMS. The attribute 'eta' "
......@@ -184,10 +187,10 @@ class CUDAGenerateProposalsV2Kernel : public framework::OpKernel<T> {
bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4});
scores_slice.Resize({h_score * w_score * c_score, 1});
std::pair<Tensor, Tensor> box_score_pair =
ProposalForOneImage<T>(dev_ctx, im_shape_slice, anchors, variances,
bbox_deltas_slice, scores_slice, pre_nms_top_n,
post_nms_top_n, nms_thresh, min_size, eta);
std::pair<Tensor, Tensor> box_score_pair = ProposalForOneImage<T>(
dev_ctx, im_shape_slice, anchors, variances, bbox_deltas_slice,
scores_slice, pre_nms_top_n, post_nms_top_n, nms_thresh, min_size,
eta, pixel_offset);
Tensor &proposals = box_score_pair.first;
Tensor &scores = box_score_pair.second;
......
......@@ -130,7 +130,7 @@ static inline framework::Tensor VectorToTensor(
template <class T>
framework::Tensor NMS(const platform::DeviceContext& ctx,
framework::Tensor* bbox, framework::Tensor* scores,
T nms_threshold, float eta) {
T nms_threshold, float eta, bool pixel_offset = true) {
int64_t num_boxes = bbox->dims()[0];
// 4: [xmin ymin xmax ymax]
int64_t box_size = bbox->dims()[1];
......@@ -144,13 +144,15 @@ framework::Tensor NMS(const platform::DeviceContext& ctx,
int selected_num = 0;
T adaptive_threshold = nms_threshold;
const T* bbox_data = bbox->data<T>();
bool normalized = pixel_offset ? false : true;
while (sorted_indices.size() != 0) {
int idx = sorted_indices.back().second;
bool flag = true;
for (int kept_idx : selected_indices) {
if (flag) {
T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
bbox_data + kept_idx * box_size, false);
T overlap =
JaccardOverlap<T>(bbox_data + idx * box_size,
bbox_data + kept_idx * box_size, normalized);
flag = (overlap <= adaptive_threshold);
} else {
break;
......
......@@ -175,6 +175,10 @@ class ROIAlignOpMaker : public framework::OpProtoAndCheckerMaker {
"If <=0, then grid points are adaptive to roi_width "
"and pooled_w, likewise for height")
.SetDefault(-1);
AddAttr<bool>("aligned",
"(bool, default False),"
"If true, pixel shift it by -0.5 for align more perfectly")
.SetDefault(false);
AddComment(R"DOC(
**RoIAlign Operator**
......@@ -245,4 +249,11 @@ REGISTER_OP_VERSION(roi_align)
Upgrade roi_align add a new input [RoisNum])ROC",
paddle::framework::compatible::OpVersionDesc().NewInput(
"RoisNum",
"The number of RoIs in each image. RoisNum is dispensable."));
"The number of RoIs in each image. RoisNum is dispensable."))
.AddCheckpoint(
R"ROC(
Upgrade roi_align add a new input [aligned])ROC",
paddle::framework::compatible::OpVersionDesc().NewAttr(
"aligned",
"If true, pixel shift it by -0.5 for align more perfectly.",
false));
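The aligned attribute registered above changes how an RoI is mapped onto the feature map: with aligned=True the scaled coordinates are shifted by -0.5 and the width/height are no longer clamped to a minimum of 1, which is exactly what the kernels below do. A small Python sketch of that mapping (illustration only):

def roi_bin_size(roi, spatial_scale, pooled_h, pooled_w, aligned):
    # roi: [xmin, ymin, xmax, ymax] in input-image coordinates
    offset = 0.5 if aligned else 0.0
    xmin = roi[0] * spatial_scale - offset
    ymin = roi[1] * spatial_scale - offset
    xmax = roi[2] * spatial_scale - offset
    ymax = roi[3] * spatial_scale - offset
    roi_w, roi_h = xmax - xmin, ymax - ymin
    if not aligned:  # legacy behaviour keeps a minimum extent of 1
        roi_w, roi_h = max(roi_w, 1.0), max(roi_h, 1.0)
    return roi_w / pooled_w, roi_h / pooled_h  # per-bin size used for the sampling grid

print(roi_bin_size([1., 1., 2., 2.], spatial_scale=0.5, pooled_h=2, pooled_w=2, aligned=False))  # (0.5, 0.5)
print(roi_bin_size([1., 1., 2., 2.], spatial_scale=0.5, pooled_h=2, pooled_w=2, aligned=True))   # (0.25, 0.25)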
......@@ -105,7 +105,8 @@ __global__ void GPUROIAlignForward(
const int nthreads, const T* input_data, const T* input_rois,
const float spatial_scale, const int channels, const int height,
const int width, const int pooled_height, const int pooled_width,
const int sampling_ratio, int* roi_batch_id_data, T* output_data) {
const int sampling_ratio, int* roi_batch_id_data, T* output_data,
const bool continuous_coordinate) {
CUDA_KERNEL_LOOP(i, nthreads) {
int pw = i % pooled_width;
int ph = (i / pooled_width) % pooled_height;
......@@ -115,13 +116,19 @@ __global__ void GPUROIAlignForward(
const T* offset_input_rois = input_rois + n * kROISize;
int roi_batch_ind = roi_batch_id_data[n];
T roi_xmin = offset_input_rois[0] * spatial_scale;
T roi_ymin = offset_input_rois[1] * spatial_scale;
T roi_xmax = offset_input_rois[2] * spatial_scale;
T roi_ymax = offset_input_rois[3] * spatial_scale;
T roi_offset = continuous_coordinate ? static_cast<T>(0.5) : 0;
T roi_xmin = offset_input_rois[0] * spatial_scale - roi_offset;
T roi_ymin = offset_input_rois[1] * spatial_scale - roi_offset;
T roi_xmax = offset_input_rois[2] * spatial_scale - roi_offset;
T roi_ymax = offset_input_rois[3] * spatial_scale - roi_offset;
T roi_width = max(roi_xmax - roi_xmin, static_cast<T>(1.));
T roi_height = max(roi_ymax - roi_ymin, static_cast<T>(1.));
T roi_width = roi_xmax - roi_xmin;
T roi_height = roi_ymax - roi_ymin;
if (!continuous_coordinate) {
roi_width = max(roi_width, static_cast<T>(1.));
roi_height = max(roi_height, static_cast<T>(1.));
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
......@@ -153,14 +160,12 @@ __global__ void GPUROIAlignForward(
}
template <typename T>
__global__ void GPUROIAlignBackward(const int nthreads, const T* input_rois,
const T* out_grad, const int num_rois,
const float spatial_scale,
const int channels, const int height,
const int width, const int pooled_height,
const int pooled_width,
const int sampling_ratio,
int* roi_batch_id_data, T* input_grad) {
__global__ void GPUROIAlignBackward(
const int nthreads, const T* input_rois, const T* out_grad,
const int num_rois, const float spatial_scale, const int channels,
const int height, const int width, const int pooled_height,
const int pooled_width, const int sampling_ratio, int* roi_batch_id_data,
T* input_grad, const bool continuous_coordinate) {
CUDA_KERNEL_LOOP(i, nthreads) {
int pw = i % pooled_width;
int ph = (i / pooled_width) % pooled_height;
......@@ -169,13 +174,18 @@ __global__ void GPUROIAlignBackward(const int nthreads, const T* input_rois,
const T* offset_input_rois = input_rois + n * kROISize;
int roi_batch_ind = roi_batch_id_data[n];
T roi_xmin = offset_input_rois[0] * spatial_scale;
T roi_ymin = offset_input_rois[1] * spatial_scale;
T roi_xmax = offset_input_rois[2] * spatial_scale;
T roi_ymax = offset_input_rois[3] * spatial_scale;
T roi_width = max(roi_xmax - roi_xmin, static_cast<T>(1.));
T roi_height = max(roi_ymax - roi_ymin, static_cast<T>(1.));
T roi_offset = continuous_coordinate ? T(0.5) : 0;
T roi_xmin = offset_input_rois[0] * spatial_scale - roi_offset;
T roi_ymin = offset_input_rois[1] * spatial_scale - roi_offset;
T roi_xmax = offset_input_rois[2] * spatial_scale - roi_offset;
T roi_ymax = offset_input_rois[3] * spatial_scale - roi_offset;
T roi_width = roi_xmax - roi_xmin;
T roi_height = roi_ymax - roi_ymin;
if (!continuous_coordinate) {
roi_width = max(roi_width, static_cast<T>(1.));
roi_height = max(roi_height, static_cast<T>(1.));
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
......@@ -236,6 +246,7 @@ class GPUROIAlignOpKernel : public framework::OpKernel<T> {
auto pooled_width = ctx.Attr<int>("pooled_width");
auto spatial_scale = ctx.Attr<float>("spatial_scale");
auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
auto aligned = ctx.Attr<bool>("aligned");
auto in_dims = in->dims();
int batch_size = in_dims[0];
......@@ -316,7 +327,7 @@ class GPUROIAlignOpKernel : public framework::OpKernel<T> {
GPUROIAlignForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
output_size, in->data<T>(), rois->data<T>(), spatial_scale, channels,
height, width, pooled_height, pooled_width, sampling_ratio, roi_id_data,
out->mutable_data<T>(ctx.GetPlace()));
out->mutable_data<T>(ctx.GetPlace()), aligned);
}
};
......@@ -334,6 +345,7 @@ class GPUROIAlignGradOpKernel : public framework::OpKernel<T> {
auto pooled_width = ctx.Attr<int>("pooled_width");
auto spatial_scale = ctx.Attr<float>("spatial_scale");
auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
auto aligned = ctx.Attr<bool>("aligned");
int rois_num = rois->dims()[0];
int channels = in->dims()[1];
......@@ -390,8 +402,8 @@ class GPUROIAlignGradOpKernel : public framework::OpKernel<T> {
GPUROIAlignBackward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
output_grad_size, rois->data<T>(), out_grad->data<T>(), rois_num,
spatial_scale, channels, height, width, pooled_height, pooled_width,
sampling_ratio, roi_id_data,
in_grad->mutable_data<T>(ctx.GetPlace()));
sampling_ratio, roi_id_data, in_grad->mutable_data<T>(ctx.GetPlace()),
aligned);
}
}
};
......
......@@ -145,6 +145,7 @@ class CPUROIAlignOpKernel : public framework::OpKernel<T> {
auto pooled_width = ctx.Attr<int>("pooled_width");
auto spatial_scale = ctx.Attr<float>("spatial_scale");
auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
auto aligned = ctx.Attr<bool>("aligned");
auto& dev_ctx = ctx.template device_context<DeviceContext>();
......@@ -215,15 +216,21 @@ class CPUROIAlignOpKernel : public framework::OpKernel<T> {
}
T* output_data = out->mutable_data<T>(ctx.GetPlace());
const T* rois_data = rois->data<T>();
T roi_offset = aligned ? T(0.5) : 0;
for (int n = 0; n < rois_num; ++n) {
int roi_batch_id = roi_batch_id_data[n];
T roi_xmin = rois_data[0] * spatial_scale;
T roi_ymin = rois_data[1] * spatial_scale;
T roi_xmax = rois_data[2] * spatial_scale;
T roi_ymax = rois_data[3] * spatial_scale;
T roi_xmin = rois_data[0] * spatial_scale - roi_offset;
T roi_ymin = rois_data[1] * spatial_scale - roi_offset;
T roi_xmax = rois_data[2] * spatial_scale - roi_offset;
T roi_ymax = rois_data[3] * spatial_scale - roi_offset;
T roi_width = roi_xmax - roi_xmin;
T roi_height = roi_ymax - roi_ymin;
if (!aligned) {
roi_width = std::max(roi_width, static_cast<T>(1.));
roi_height = std::max(roi_height, static_cast<T>(1.));
}
T roi_width = std::max(roi_xmax - roi_xmin, static_cast<T>(1.));
T roi_height = std::max(roi_ymax - roi_ymin, static_cast<T>(1.));
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
const T* batch_data = input_data + roi_batch_id * in_stride[0];
......@@ -290,6 +297,7 @@ class CPUROIAlignGradOpKernel : public framework::OpKernel<T> {
auto spatial_scale = ctx.Attr<float>("spatial_scale");
auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
auto in_dims = in->dims();
auto aligned = ctx.Attr<bool>("aligned");
int channels = in_dims[1];
int height = in_dims[2];
......@@ -344,14 +352,21 @@ class CPUROIAlignGradOpKernel : public framework::OpKernel<T> {
auto roi_stride = framework::stride(rois->dims());
auto out_stride = framework::stride(out_grad->dims());
T roi_offset = aligned ? T(0.5) : 0;
for (int n = 0; n < rois_num; ++n) {
int roi_batch_idx = roi_batch_id_data[n];
T roi_xmin = rois_data[0] * spatial_scale;
T roi_ymin = rois_data[1] * spatial_scale;
T roi_xmax = rois_data[2] * spatial_scale;
T roi_ymax = rois_data[3] * spatial_scale;
T roi_width = std::max(roi_xmax - roi_xmin, static_cast<T>(1.));
T roi_height = std::max(roi_ymax - roi_ymin, static_cast<T>(1.));
T roi_xmin = rois_data[0] * spatial_scale - roi_offset;
T roi_ymin = rois_data[1] * spatial_scale - roi_offset;
T roi_xmax = rois_data[2] * spatial_scale - roi_offset;
T roi_ymax = rois_data[3] * spatial_scale - roi_offset;
T roi_width = roi_xmax - roi_xmin;
T roi_height = roi_ymax - roi_ymin;
if (!aligned) {
roi_width = std::max(roi_width, static_cast<T>(1.));
roi_height = std::max(roi_height, static_cast<T>(1.));
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
for (int c = 0; c < channels; ++c) {
......
......@@ -31,7 +31,8 @@ class TestDistributeFPNProposalsOp(OpTest):
'max_level': self.roi_max_level,
'min_level': self.roi_min_level,
'refer_scale': self.canonical_scale,
'refer_level': self.canonical_level
'refer_level': self.canonical_level,
'pixel_offset': self.pixel_offset,
}
output = [('out%d' % i, self.rois_fpn[i])
for i in range(len(self.rois_fpn))]
......@@ -47,10 +48,12 @@ class TestDistributeFPNProposalsOp(OpTest):
self.canonical_scale = 224
self.canonical_level = 4
self.images_shape = [512, 512]
self.pixel_offset = True
def boxes_area(self, boxes):
w = (boxes[:, 2] - boxes[:, 0] + 1)
h = (boxes[:, 3] - boxes[:, 1] + 1)
offset = 1 if self.pixel_offset else 0
w = (boxes[:, 2] - boxes[:, 0] + offset)
h = (boxes[:, 3] - boxes[:, 1] + offset)
areas = w * h
assert np.all(areas >= 0), 'Negative areas found'
return areas
......@@ -59,7 +62,7 @@ class TestDistributeFPNProposalsOp(OpTest):
s = np.sqrt(self.boxes_area(rois))
s0 = self.canonical_scale
lvl0 = self.canonical_level
target_lvls = np.floor(lvl0 + np.log2(s / s0 + 1e-6))
target_lvls = np.floor(lvl0 + np.log2(s / s0 + 1e-8))
target_lvls = np.clip(target_lvls, lvl_min, lvl_max)
return target_lvls
......@@ -131,7 +134,8 @@ class TestDistributeFPNProposalsOpWithRoisNum(TestDistributeFPNProposalsOp):
'max_level': self.roi_max_level,
'min_level': self.roi_min_level,
'refer_scale': self.canonical_scale,
'refer_level': self.canonical_level
'refer_level': self.canonical_level,
'pixel_offset': self.pixel_offset,
}
output = [('out%d' % i, self.rois_fpn[i])
for i in range(len(self.rois_fpn))]
......@@ -147,5 +151,16 @@ class TestDistributeFPNProposalsOpWithRoisNum(TestDistributeFPNProposalsOp):
}
class TestDistributeFPNProposalsOpNoOffset(
TestDistributeFPNProposalsOpWithRoisNum):
def init_test_case(self):
self.roi_max_level = 5
self.roi_min_level = 2
self.canonical_scale = 224
self.canonical_level = 4
self.images_shape = [512, 512]
self.pixel_offset = False
if __name__ == '__main__':
unittest.main()
......@@ -21,7 +21,6 @@ import math
import paddle
import paddle.fluid as fluid
from op_test import OpTest
from test_multiclass_nms_op import nms
from test_anchor_generator_op import anchor_generator_in_python
import copy
......@@ -111,18 +110,19 @@ def proposal_for_one_image(im_info, all_anchors, variances, bbox_deltas, scores,
return proposals, scores
def box_coder(all_anchors, bbox_deltas, variances):
def box_coder(all_anchors, bbox_deltas, variances, pixel_offset=True):
"""
Decode proposals by anchors and bbox_deltas from RPN
"""
offset = 1 if pixel_offset else 0
#proposals: xmin, ymin, xmax, ymax
proposals = np.zeros_like(bbox_deltas, dtype=np.float32)
#anchor_loc: width, height, center_x, center_y
anchor_loc = np.zeros_like(bbox_deltas, dtype=np.float32)
anchor_loc[:, 0] = all_anchors[:, 2] - all_anchors[:, 0] + 1
anchor_loc[:, 1] = all_anchors[:, 3] - all_anchors[:, 1] + 1
anchor_loc[:, 0] = all_anchors[:, 2] - all_anchors[:, 0] + offset
anchor_loc[:, 1] = all_anchors[:, 3] - all_anchors[:, 1] + offset
anchor_loc[:, 2] = all_anchors[:, 0] + 0.5 * anchor_loc[:, 0]
anchor_loc[:, 3] = all_anchors[:, 1] + 0.5 * anchor_loc[:, 1]
......@@ -152,51 +152,60 @@ def box_coder(all_anchors, bbox_deltas, variances):
pred_bbox[i, 3] = math.exp(
min(bbox_deltas[i, 3], math.log(1000 / 16.0))) * anchor_loc[i,
1]
proposals[:, 0] = pred_bbox[:, 0] - pred_bbox[:, 2] / 2
proposals[:, 1] = pred_bbox[:, 1] - pred_bbox[:, 3] / 2
proposals[:, 2] = pred_bbox[:, 0] + pred_bbox[:, 2] / 2 - 1
proposals[:, 3] = pred_bbox[:, 1] + pred_bbox[:, 3] / 2 - 1
proposals[:, 2] = pred_bbox[:, 0] + pred_bbox[:, 2] / 2 - offset
proposals[:, 3] = pred_bbox[:, 1] + pred_bbox[:, 3] / 2 - offset
return proposals
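A quick sanity check of the decode step above: with all-zero deltas and unit variances the decoded proposal reproduces the anchor under either convention, since the offset added to the width cancels against the offset subtracted from xmax/ymax. Usage sketch of the box_coder helper defined here:

import numpy as np

anchors = np.array([[0., 0., 15., 15.]], dtype=np.float32)
deltas = np.zeros((1, 4), dtype=np.float32)
variances = np.ones((1, 4), dtype=np.float32)

print(box_coder(anchors, deltas.copy(), variances, pixel_offset=True))   # [[ 0.  0. 15. 15.]]
print(box_coder(anchors, deltas.copy(), variances, pixel_offset=False))  # [[ 0.  0. 15. 15.]]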
def clip_tiled_boxes(boxes, im_shape):
def clip_tiled_boxes(boxes, im_shape, pixel_offset=True):
"""Clip boxes to image boundaries. im_shape is [height, width] and boxes
has shape (N, 4 * num_tiled_boxes)."""
assert boxes.shape[1] % 4 == 0, \
'boxes.shape[1] is {:d}, but must be divisible by 4.'.format(
boxes.shape[1]
)
offset = 1 if pixel_offset else 0
# x1 >= 0
boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0)
boxes[:, 0::4] = np.maximum(
np.minimum(boxes[:, 0::4], im_shape[1] - offset), 0)
# y1 >= 0
boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0)
boxes[:, 1::4] = np.maximum(
np.minimum(boxes[:, 1::4], im_shape[0] - offset), 0)
# x2 < im_shape[1]
boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0)
boxes[:, 2::4] = np.maximum(
np.minimum(boxes[:, 2::4], im_shape[1] - offset), 0)
# y2 < im_shape[0]
boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0)
boxes[:, 3::4] = np.maximum(
np.minimum(boxes[:, 3::4], im_shape[0] - offset), 0)
return boxes
def filter_boxes(boxes, min_size, im_info):
def filter_boxes(boxes, min_size, im_info, pixel_offset=True):
"""Only keep boxes with both sides >= min_size and center within the image.
"""
# Scale min_size to match image scale
im_scale = im_info[2]
min_size = max(min_size, 1.0)
ws = boxes[:, 2] - boxes[:, 0] + 1
hs = boxes[:, 3] - boxes[:, 1] + 1
offset = 1 if pixel_offset else 0
ws = boxes[:, 2] - boxes[:, 0] + offset
hs = boxes[:, 3] - boxes[:, 1] + offset
if pixel_offset:
ws_orig_scale = (boxes[:, 2] - boxes[:, 0]) / im_scale + 1
hs_orig_scale = (boxes[:, 3] - boxes[:, 1]) / im_scale + 1
x_ctr = boxes[:, 0] + ws / 2.
y_ctr = boxes[:, 1] + hs / 2.
keep = np.where((ws_orig_scale >= min_size) & (hs_orig_scale >= min_size) &
(x_ctr < im_info[1]) & (y_ctr < im_info[0]))[0]
keep = np.where((ws_orig_scale >= min_size) & (
hs_orig_scale >= min_size) & (x_ctr < im_info[1]) & (y_ctr <
im_info[0]))[0]
else:
keep = np.where((ws >= min_size) & (hs >= min_size))[0]
return keep
def iou(box_a, box_b):
def iou(box_a, box_b, pixel_offset=True):
"""
Compute intersection-over-union overlap between box_a and box_b
"""
......@@ -209,9 +218,9 @@ def iou(box_a, box_b):
ymin_b = min(box_b[1], box_b[3])
xmax_b = max(box_b[0], box_b[2])
ymax_b = max(box_b[1], box_b[3])
area_a = (ymax_a - ymin_a + 1) * (xmax_a - xmin_a + 1)
area_b = (ymax_b - ymin_b + 1) * (xmax_b - xmin_b + 1)
offset = 1 if pixel_offset else 0
area_a = (ymax_a - ymin_a + offset) * (xmax_a - xmin_a + offset)
area_b = (ymax_b - ymin_b + offset) * (xmax_b - xmin_b + offset)
if area_a <= 0 and area_b <= 0:
return 0.0
......@@ -220,14 +229,14 @@ def iou(box_a, box_b):
xb = min(xmax_a, xmax_b)
yb = min(ymax_a, ymax_b)
inter_area = max(xb - xa + 1, 0.0) * max(yb - ya + 1, 0.0)
inter_area = max(xb - xa + offset, 0.0) * max(yb - ya + offset, 0.0)
iou_ratio = inter_area / (area_a + area_b - inter_area)
return iou_ratio
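A worked example of how the offset changes the overlap computed above: two 3x3-pixel boxes that share a single corner pixel overlap under the legacy convention but are disjoint in continuous coordinates.

box_a = [0., 0., 2., 2.]
box_b = [2., 2., 4., 4.]
print(iou(box_a, box_b, pixel_offset=True))   # 1/17 ≈ 0.0588: the shared pixel at (2, 2) counts as intersection
print(iou(box_a, box_b, pixel_offset=False))  # 0.0: zero-area intersection, each box has area 4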
def nms(boxes, scores, nms_threshold, eta=1.0):
def nms(boxes, scores, nms_threshold, eta=1.0, pixel_offset=True):
"""Apply non-maximum suppression at test time to avoid detecting too many
overlapping bounding boxes for a given object.
Args:
......@@ -252,7 +261,9 @@ def nms(boxes, scores, nms_threshold, eta=1.0):
for k in range(len(selected_indices)):
if keep:
kept_idx = selected_indices[k]
overlap = iou(boxes[idx], boxes[kept_idx])
overlap = iou(boxes[idx],
boxes[kept_idx],
pixel_offset=pixel_offset)
keep = True if overlap <= adaptive_threshold else False
else:
break
......
......@@ -21,7 +21,6 @@ import math
import paddle
import paddle.fluid as fluid
from op_test import OpTest
from test_multiclass_nms_op import nms
from test_anchor_generator_op import anchor_generator_in_python
import copy
from test_generate_proposals_op import clip_tiled_boxes, box_coder, nms
......@@ -29,7 +28,7 @@ from test_generate_proposals_op import clip_tiled_boxes, box_coder, nms
def generate_proposals_v2_in_python(scores, bbox_deltas, im_shape, anchors,
variances, pre_nms_topN, post_nms_topN,
nms_thresh, min_size, eta):
nms_thresh, min_size, eta, pixel_offset):
all_anchors = anchors.reshape(-1, 4)
rois = np.empty((0, 5), dtype=np.float32)
roi_probs = np.empty((0, 1), dtype=np.float32)
......@@ -42,7 +41,8 @@ def generate_proposals_v2_in_python(scores, bbox_deltas, im_shape, anchors,
img_i_boxes, img_i_probs = proposal_for_one_image(
im_shape[img_idx, :], all_anchors, variances,
bbox_deltas[img_idx, :, :, :], scores[img_idx, :, :, :],
pre_nms_topN, post_nms_topN, nms_thresh, min_size, eta)
pre_nms_topN, post_nms_topN, nms_thresh, min_size, eta,
pixel_offset)
rois_num.append(img_i_probs.shape[0])
rpn_rois.append(img_i_boxes)
rpn_roi_probs.append(img_i_probs)
......@@ -52,7 +52,7 @@ def generate_proposals_v2_in_python(scores, bbox_deltas, im_shape, anchors,
def proposal_for_one_image(im_shape, all_anchors, variances, bbox_deltas,
scores, pre_nms_topN, post_nms_topN, nms_thresh,
min_size, eta):
min_size, eta, pixel_offset):
# Transpose and reshape predicted bbox transformations to get them
# into the same order as the anchors:
# - bbox deltas will be (4 * A, H, W) format from conv output
......@@ -83,12 +83,12 @@ def proposal_for_one_image(im_shape, all_anchors, variances, bbox_deltas,
scores = scores[order, :]
bbox_deltas = bbox_deltas[order, :]
all_anchors = all_anchors[order, :]
proposals = box_coder(all_anchors, bbox_deltas, variances)
proposals = box_coder(all_anchors, bbox_deltas, variances, pixel_offset)
# clip proposals to image (may result in proposals with zero area
# that will be removed in the next step)
proposals = clip_tiled_boxes(proposals, im_shape)
proposals = clip_tiled_boxes(proposals, im_shape, pixel_offset)
# remove predicted boxes with height or width < min_size
keep = filter_boxes(proposals, min_size, im_shape)
keep = filter_boxes(proposals, min_size, im_shape, pixel_offset)
if len(keep) == 0:
proposals = np.zeros((1, 4)).astype('float32')
scores = np.zeros((1, 1)).astype('float32')
......@@ -103,7 +103,8 @@ def proposal_for_one_image(im_shape, all_anchors, variances, bbox_deltas,
keep = nms(boxes=proposals,
scores=scores,
nms_threshold=nms_thresh,
eta=eta)
eta=eta,
pixel_offset=pixel_offset)
if post_nms_topN > 0 and post_nms_topN < len(keep):
keep = keep[:post_nms_topN]
proposals = proposals[keep, :]
......@@ -112,17 +113,21 @@ def proposal_for_one_image(im_shape, all_anchors, variances, bbox_deltas,
return proposals, scores
def filter_boxes(boxes, min_size, im_shape):
def filter_boxes(boxes, min_size, im_shape, pixel_offset=True):
"""Only keep boxes with both sides >= min_size and center within the image.
"""
# Scale min_size to match image scale
min_size = max(min_size, 1.0)
ws = boxes[:, 2] - boxes[:, 0] + 1
hs = boxes[:, 3] - boxes[:, 1] + 1
offset = 1 if pixel_offset else 0
ws = boxes[:, 2] - boxes[:, 0] + offset
hs = boxes[:, 3] - boxes[:, 1] + offset
if pixel_offset:
x_ctr = boxes[:, 0] + ws / 2.
y_ctr = boxes[:, 1] + hs / 2.
keep = np.where((ws >= min_size) & (hs >= min_size) & (x_ctr < im_shape[1])
& (y_ctr < im_shape[0]))[0]
keep = np.where((ws >= min_size) & (hs >= min_size) & (x_ctr < im_shape[
1]) & (y_ctr < im_shape[0]))[0]
else:
keep = np.where((ws >= min_size) & (hs >= min_size))[0]
return keep
......@@ -144,7 +149,8 @@ class TestGenerateProposalsV2Op(OpTest):
'post_nms_topN': self.post_nms_topN,
'nms_thresh': self.nms_thresh,
'min_size': self.min_size,
'eta': self.eta
'eta': self.eta,
'pixel_offset': self.pixel_offset,
}
self.outputs = {
......@@ -165,6 +171,7 @@ class TestGenerateProposalsV2Op(OpTest):
self.nms_thresh = 0.7
self.min_size = 3.0
self.eta = 1.
self.pixel_offset = True
def init_test_input(self):
batch_size = 1
......@@ -191,7 +198,7 @@ class TestGenerateProposalsV2Op(OpTest):
self.rpn_rois, self.rpn_roi_probs, self.rois_num = generate_proposals_v2_in_python(
self.scores, self.bbox_deltas, self.im_shape, self.anchors,
self.variances, self.pre_nms_topN, self.post_nms_topN,
self.nms_thresh, self.min_size, self.eta)
self.nms_thresh, self.min_size, self.eta, self.pixel_offset)
class TestGenerateProposalsV2OutLodOp(TestGenerateProposalsV2Op):
......@@ -231,6 +238,17 @@ class TestGenerateProposalsV2OpNoBoxLeft(TestGenerateProposalsV2Op):
self.nms_thresh = 0.7
self.min_size = 1000.0
self.eta = 1.
self.pixel_offset = True
class TestGenerateProposalsV2OpNoOffset(TestGenerateProposalsV2Op):
def init_test_params(self):
self.pre_nms_topN = 12000 # train 12000, test 2000
self.post_nms_topN = 5000 # train 6000, test 1000
self.nms_thresh = 0.7
self.min_size = 3.0
self.eta = 1.
self.pixel_offset = False
if __name__ == '__main__':
......
......@@ -35,7 +35,8 @@ class TestROIAlignOp(OpTest):
'spatial_scale': self.spatial_scale,
'pooled_height': self.pooled_height,
'pooled_width': self.pooled_width,
'sampling_ratio': self.sampling_ratio
'sampling_ratio': self.sampling_ratio,
'aligned': self.aligned,
}
self.outputs = {'Out': self.out_data}
......@@ -53,6 +54,7 @@ class TestROIAlignOp(OpTest):
self.pooled_height = 2
self.pooled_width = 2
self.sampling_ratio = -1
self.aligned = False
self.x = np.random.random(self.x_dim).astype('float64')
......@@ -115,16 +117,21 @@ class TestROIAlignOp(OpTest):
(self.rois_num, self.channels, self.pooled_height,
self.pooled_width)).astype('float64')
offset = 0.5 if self.aligned else 0.
for i in range(self.rois_num):
roi = self.rois[i]
roi_batch_id = int(roi[0])
x_i = self.x[roi_batch_id]
roi_xmin = roi[1] * self.spatial_scale
roi_ymin = roi[2] * self.spatial_scale
roi_xmax = roi[3] * self.spatial_scale
roi_ymax = roi[4] * self.spatial_scale
roi_width = max(roi_xmax - roi_xmin, 1)
roi_height = max(roi_ymax - roi_ymin, 1)
roi_xmin = roi[1] * self.spatial_scale - offset
roi_ymin = roi[2] * self.spatial_scale - offset
roi_xmax = roi[3] * self.spatial_scale - offset
roi_ymax = roi[4] * self.spatial_scale - offset
roi_width = roi_xmax - roi_xmin
roi_height = roi_ymax - roi_ymin
if not self.aligned:
roi_width = max(roi_width, 1)
roi_height = max(roi_height, 1)
bin_size_h = float(roi_height) / float(self.pooled_height)
bin_size_w = float(roi_width) / float(self.pooled_width)
roi_bin_grid_h = self.sampling_ratio if self.sampling_ratio > 0 else \
......@@ -192,11 +199,31 @@ class TestROIAlignInLodOp(TestROIAlignOp):
'spatial_scale': self.spatial_scale,
'pooled_height': self.pooled_height,
'pooled_width': self.pooled_width,
'sampling_ratio': self.sampling_ratio
'sampling_ratio': self.sampling_ratio,
'aligned': self.aligned
}
self.outputs = {'Out': self.out_data}
class TestROIAlignOpWithAligned(TestROIAlignOp):
def init_test_case(self):
self.batch_size = 3
self.channels = 3
self.height = 8
self.width = 6
# n, c, h, w
self.x_dim = (self.batch_size, self.channels, self.height, self.width)
self.spatial_scale = 1.0 / 2.0
self.pooled_height = 2
self.pooled_width = 2
self.sampling_ratio = -1
self.aligned = True
self.x = np.random.random(self.x_dim).astype('float64')
if __name__ == '__main__':
unittest.main()