From 5b267474a90635d6e66bd4be936f340ca61a73aa Mon Sep 17 00:00:00 2001
From: Guanghua Yu <742925032@qq.com>
Date: Fri, 19 Feb 2021 10:32:11 +0800
Subject: [PATCH] add offset parameter in roi_align, generate_proposals, etc. ops
 (#30864)

* add aligned parameter in roi_align op
---
 .../fluid/operators/detection/bbox_util.cu.h  | 79 +++++++++++--------
 paddle/fluid/operators/detection/bbox_util.h  | 72 ++++++++++-------
 .../detection/distribute_fpn_proposals_op.cc  |  9 ++-
 .../detection/distribute_fpn_proposals_op.cu  |  9 ++-
 .../detection/distribute_fpn_proposals_op.h   | 11 +--
 .../detection/generate_proposals_v2_op.cc     | 31 +++++---
 .../detection/generate_proposals_v2_op.cu     | 19 +++--
 paddle/fluid/operators/detection/nms_util.h   |  8 +-
 paddle/fluid/operators/roi_align_op.cc        | 15 +++-
 paddle/fluid/operators/roi_align_op.cu        | 62 +++++++++------
 paddle/fluid/operators/roi_align_op.h         | 39 ++++++---
 .../test_distribute_fpn_proposals_op.py       | 25 ++++--
 .../unittests/test_generate_proposals_op.py   | 67 +++++++++-------
 .../test_generate_proposals_v2_op.py          | 52 ++++++++----
 .../tests/unittests/test_roi_align_op.py      | 43 ++++++++--
 15 files changed, 354 insertions(+), 187 deletions(-)
diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h
index 8840765841d..0247093d03a 100644
--- a/paddle/fluid/operators/detection/bbox_util.cu.h
+++ b/paddle/fluid/operators/detection/bbox_util.cu.h
@@ -77,17 +77,20 @@ struct BoxDecodeAndClipFunctor {
   const T *var;
   const int *index;
   const T *im_info;
+  const bool pixel_offset;
 
   T *proposals;
 
   BoxDecodeAndClipFunctor(const T *anchor, const T *deltas, const T *var,
-                          const int *index, const T *im_info, T *proposals)
+                          const int *index, const T *im_info, T *proposals,
+                          bool pixel_offset = true)
       : anchor(anchor),
         deltas(deltas),
         var(var),
         index(index),
         im_info(im_info),
-        proposals(proposals) {}
+        pixel_offset(pixel_offset),  // listed in member declaration order
+        proposals(proposals) {}
 
   T bbox_clip_default{static_cast<T>(kBBoxClipDefault)};
 
@@ -98,8 +101,9 @@ struct BoxDecodeAndClipFunctor {
     T axmax = anchor[k + 2];
     T aymax = anchor[k + 3];
 
-    T w = axmax - axmin + 1.0;
-    T h = aymax - aymin + 1.0;
+    T offset = pixel_offset ? static_cast<T>(1.0) : 0;
+    T w = axmax - axmin + offset;
+    T h = aymax - aymin + offset;
     T cx = axmin + 0.5 * w;
     T cy = aymin + 0.5 * h;
 
@@ -123,13 +127,13 @@ struct BoxDecodeAndClipFunctor {
 
     T oxmin = d_cx - d_w * 0.5;
     T oymin = d_cy - d_h * 0.5;
-    T oxmax = d_cx + d_w * 0.5 - 1.;
-    T oymax = d_cy + d_h * 0.5 - 1.;
+    T oxmax = d_cx + d_w * 0.5 - offset;
+    T oymax = d_cy + d_h * 0.5 - offset;
 
-    proposals[i * 4] = Max(Min(oxmin, im_info[1] - 1.), 0.);
-    proposals[i * 4 + 1] = Max(Min(oymin, im_info[0] - 1.), 0.);
-    proposals[i * 4 + 2] = Max(Min(oxmax, im_info[1] - 1.), 0.);
-    proposals[i * 4 + 3] = Max(Min(oymax, im_info[0] - 1.), 0.);
+    proposals[i * 4] = Max(Min(oxmin, im_info[1] - offset), 0.);
+    proposals[i * 4 + 1] = Max(Min(oymin, im_info[0] - offset), 0.);
+    proposals[i * 4 + 2] = Max(Min(oxmax, im_info[1] - offset), 0.);
+    proposals[i * 4 + 3] = Max(Min(oymax, im_info[0] - offset), 0.);
   }
 
   __device__ __forceinline__ T Min(T a, T b) const { return a > b ? b : a; }
@@ -141,7 +145,8 @@ template <typename T, int BlockSize>
 static __global__ void FilterBBoxes(const T *bboxes, const T *im_info,
                                     const T min_size, const int num,
                                     int *keep_num, int *keep,
-                                    bool is_scale = true) {
+                                    bool is_scale = true,
+                                    bool pixel_offset = true) {
   T im_h = im_info[0];
   T im_w = im_info[1];
 
@@ -157,19 +162,25 @@ static __global__ void FilterBBoxes(const T *bboxes, const T *im_info,
     T ymin = bboxes[k + 1];
     T xmax = bboxes[k + 2];
     T ymax = bboxes[k + 3];
+    T offset = pixel_offset ? static_cast<T>(1.0) : 0;
+    T w = xmax - xmin + offset;
+    T h = ymax - ymin + offset;
+    if (pixel_offset) {
+      T cx = xmin + w / 2.;
+      T cy = ymin + h / 2.;
+
+      if (is_scale) {
+        w = (xmax - xmin) / im_info[2] + 1.;
+        h = (ymax - ymin) / im_info[2] + 1.;
+      }
 
-    T w = xmax - xmin + 1.0;
-    T h = ymax - ymin + 1.0;
-    T cx = xmin + w / 2.;
-    T cy = ymin + h / 2.;
-
-    if (is_scale) {
-      w = (xmax - xmin) / im_info[2] + 1.;
-      h = (ymax - ymin) / im_info[2] + 1.;
-    }
-
-    if (w >= min_size && h >= min_size && cx <= im_w && cy <= im_h) {
-      keep_index[threadIdx.x] = i;
+      if (w >= min_size && h >= min_size && cx <= im_w && cy <= im_h) {
+        keep_index[threadIdx.x] = i;
+      }
+    } else {
+      if (w >= min_size && h >= min_size) {
+        keep_index[threadIdx.x] = i;
+      }
     }
     __syncthreads();
     if (threadIdx.x == 0) {
@@ -187,19 +198,23 @@ static __global__ void FilterBBoxes(const T *bboxes, const T *im_info,
   }
 }
 
-static __device__ float IoU(const float *a, const float *b) {
+static __device__ float IoU(const float *a, const float *b,
+                            const bool pixel_offset = true) {
+  float offset = pixel_offset ? static_cast<float>(1.0) : 0;
   float left = max(a[0], b[0]), right = min(a[2], b[2]);
   float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
-  float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
+  float width = max(right - left + offset, 0.f),
+        height = max(bottom - top + offset, 0.f);
   float inter_s = width * height;
-  float s_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
-  float s_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
+  float s_a = (a[2] - a[0] + offset) * (a[3] - a[1] + offset);
+  float s_b = (b[2] - b[0] + offset) * (b[3] - b[1] + offset);
   return inter_s / (s_a + s_b - inter_s);
 }
 
 static __global__ void NMSKernel(const int n_boxes,
                                  const float nms_overlap_thresh,
-                                 const float *dev_boxes, uint64_t *dev_mask) {
+                                 const float *dev_boxes, uint64_t *dev_mask,
+                                 bool pixel_offset = true) {
   const int row_start = blockIdx.y;
   const int col_start = blockIdx.x;
 
@@ -231,7 +246,8 @@ static __global__ void NMSKernel(const int n_boxes,
       start = threadIdx.x + 1;
     }
     for (i = start; i < col_size; i++) {
-      if (IoU(cur_box, block_boxes + i * 4) > nms_overlap_thresh) {
+      if (IoU(cur_box, block_boxes + i * 4, pixel_offset) >
+          nms_overlap_thresh) {
         t |= 1ULL << i;
       }
     }
@@ -243,7 +259,7 @@ static __global__ void NMSKernel(const int n_boxes,
 template <typename T>
 static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
                 const Tensor &sorted_indices, const T nms_threshold,
-                Tensor *keep_out) {
+                Tensor *keep_out, bool pixel_offset = true) {
   int boxes_num = proposals.dims()[0];
   const int col_blocks = DIVUP(boxes_num, kThreadsPerBlock);
   dim3 blocks(DIVUP(boxes_num, kThreadsPerBlock),
@@ -255,7 +271,8 @@ static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
   framework::Vector<uint64_t> mask(boxes_num * col_blocks);
   NMSKernel<<<blocks, threads>>>(boxes_num, nms_threshold, boxes,
                                  mask.CUDAMutableData(BOOST_GET_CONST(
-                                     platform::CUDAPlace, ctx.GetPlace())));
+                                     platform::CUDAPlace, ctx.GetPlace())),
+                                 pixel_offset);
 
   std::vector<uint64_t> remv(col_blocks);
   memset(&remv[0], 0, sizeof(uint64_t) * col_blocks);
diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h
index b7a23c48fb8..b262f05d6b1 100644
--- a/paddle/fluid/operators/detection/bbox_util.h
+++ b/paddle/fluid/operators/detection/bbox_util.h
@@ -31,7 +31,7 @@ struct RangeInitFunctor {
 };
 
 template <typename T>
-inline HOSTDEVICE T RoIArea(const T* box, bool normalized) {
+inline HOSTDEVICE T RoIArea(const T* box, bool pixel_offset = true) {
   if (box[2] < box[0] || box[3] < box[1]) {
     // If coordinate values are is invalid
     // (e.g. xmax < xmin or ymax < ymin), return 0.
@@ -39,11 +39,11 @@ inline HOSTDEVICE T RoIArea(const T* box, bool normalized) {
   } else {
     const T w = box[2] - box[0];
     const T h = box[3] - box[1];
-    if (normalized) {
-      return w * h;
-    } else {
+    if (pixel_offset) {
       // If coordinate values are not within range [0, 1].
       return (w + 1) * (h + 1);
+    } else {
+      return w * h;
     }
   }
 }
@@ -157,10 +157,12 @@ template <class T>
 void ClipTiledBoxes(const platform::DeviceContext& ctx,
                     const framework::Tensor& im_info,
                     const framework::Tensor& input_boxes,
-                    framework::Tensor* out, bool is_scale = true) {
+                    framework::Tensor* out, bool is_scale = true,
+                    bool pixel_offset = true) {
   T* out_data = out->mutable_data<T>(ctx.GetPlace());
   const T* im_info_data = im_info.data<T>();
   const T* input_boxes_data = input_boxes.data<T>();
+  T offset = pixel_offset ? static_cast<T>(1.0) : 0;
   T zero(0);
   T im_w =
       is_scale ? round(im_info_data[1] / im_info_data[2]) : im_info_data[1];
@@ -168,13 +170,17 @@ void ClipTiledBoxes(const platform::DeviceContext& ctx,
       is_scale ? round(im_info_data[0] / im_info_data[2]) : im_info_data[0];
   for (int64_t i = 0; i < input_boxes.numel(); ++i) {
     if (i % 4 == 0) {
-      out_data[i] = std::max(std::min(input_boxes_data[i], im_w - 1), zero);
+      out_data[i] =
+          std::max(std::min(input_boxes_data[i], im_w - offset), zero);
     } else if (i % 4 == 1) {
-      out_data[i] = std::max(std::min(input_boxes_data[i], im_h - 1), zero);
+      out_data[i] =
+          std::max(std::min(input_boxes_data[i], im_h - offset), zero);
     } else if (i % 4 == 2) {
-      out_data[i] = std::max(std::min(input_boxes_data[i], im_w - 1), zero);
+      out_data[i] =
+          std::max(std::min(input_boxes_data[i], im_w - offset), zero);
     } else {
-      out_data[i] = std::max(std::min(input_boxes_data[i], im_h - 1), zero);
+      out_data[i] =
+          std::max(std::min(input_boxes_data[i], im_h - offset), zero);
     }
   }
 }
@@ -184,29 +190,35 @@ template <class T>
 void FilterBoxes(const platform::DeviceContext& ctx,
                  const framework::Tensor* boxes, float min_size,
                  const framework::Tensor& im_info, bool is_scale,
-                 framework::Tensor* keep) {
+                 framework::Tensor* keep, bool pixel_offset = true) {
   const T* im_info_data = im_info.data<T>();
   const T* boxes_data = boxes->data<T>();
   keep->Resize({boxes->dims()[0]});
   min_size = std::max(min_size, 1.0f);
   int* keep_data = keep->mutable_data<int>(ctx.GetPlace());
+  T offset = pixel_offset ? static_cast<T>(1.0) : 0;
 
   int keep_len = 0;
   for (int i = 0; i < boxes->dims()[0]; ++i) {
-    T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1;
-    T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1;
-    T x_ctr = boxes_data[4 * i] + ws / 2;
-    T y_ctr = boxes_data[4 * i + 1] + hs / 2;
+    T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + offset;
+    T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + offset;
+    if (pixel_offset) {
+      T x_ctr = boxes_data[4 * i] + ws / 2;
+      T y_ctr = boxes_data[4 * i + 1] + hs / 2;
 
-    if (is_scale) {
-      ws = (boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_info_data[2] + 1;
-      hs =
-          (boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_info_data[2] + 1;
-    }
-
-    if (ws >= min_size && hs >= min_size && x_ctr <= im_info_data[1] &&
-        y_ctr <= im_info_data[0]) {
-      keep_data[keep_len++] = i;
+      if (is_scale) {
+        ws = (boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_info_data[2] + 1;
+        hs = (boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_info_data[2] +
+             1;
+      }
+      if (ws >= min_size && hs >= min_size && x_ctr <= im_info_data[1] &&
+          y_ctr <= im_info_data[0]) {
+        keep_data[keep_len++] = i;
+      }
+    } else {
+      if (ws >= min_size && hs >= min_size) {
+        keep_data[keep_len++] = i;
+      }
     }
   }
   keep->Resize({keep_len});
@@ -216,8 +228,8 @@ template <class T>
 static void BoxCoder(const platform::DeviceContext& ctx,
                      framework::Tensor* all_anchors,
                      framework::Tensor* bbox_deltas,
-                     framework::Tensor* variances,
-                     framework::Tensor* proposals) {
+                     framework::Tensor* variances, framework::Tensor* proposals,
+                     const bool pixel_offset = true) {
   T* proposals_data = proposals->mutable_data<T>(ctx.GetPlace());
 
   int64_t row = all_anchors->dims()[0];
@@ -230,9 +242,11 @@ static void BoxCoder(const platform::DeviceContext& ctx,
     variances_data = variances->data<T>();
   }
 
+  T offset = pixel_offset ? static_cast<T>(1.0) : 0;
   for (int64_t i = 0; i < row; ++i) {
-    T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0;
-    T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1] + 1.0;
+    T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + offset;
+    T anchor_height =
+        anchor_data[i * len + 3] - anchor_data[i * len + 1] + offset;
 
     T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width;
     T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height;
@@ -270,8 +284,8 @@ static void BoxCoder(const platform::DeviceContext& ctx,
 
     proposals_data[i * len] = bbox_center_x - bbox_width / 2;
     proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2;
-    proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1;
-    proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1;
+    proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - offset;
+    proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - offset;
   }
   // return proposals;
 }
diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc
index b0c9d968e47..4e514e62f40 100644
--- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc
+++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc
@@ -103,6 +103,9 @@ class DistributeFpnProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int>("refer_scale",
                  "The referring scale of FPN layer with"
                  " specified level");
+    AddAttr<bool>("pixel_offset", "(bool, default True), "
+                  "If true, im_shape pixel offset is 1.")
+        .SetDefault(true);
     AddComment(R"DOC(
 This operator distribute all proposals into different fpn level,
  with respect to scale of the proposals, the referring scale and
@@ -134,4 +137,8 @@ REGISTER_OP_VERSION(distribute_fpn_proposals)
             .NewOutput("MultiLevelRoisNum",
                        "The RoIs' number of each image on multiple "
                        "levels. The number on each level has the shape of (B),"
-                       "B is the number of images."));
+                       "B is the number of images."))
+    .AddCheckpoint(
+        R"ROC(Register distribute_fpn_proposals for adding the attribute of pixel_offset)ROC",
+        paddle::framework::compatible::OpVersionDesc().NewAttr(
+            "pixel_offset", "If true, im_shape pixel offset is 1.", true));
diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
index 27c06a0f8fb..7550ff91fd5 100644
--- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
+++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
@@ -43,15 +43,15 @@ __global__ void GPUDistFpnProposalsHelper(
     const int nthreads, const T* rois, const int lod_size,
     const int refer_level, const int refer_scale, const int max_level,
     const int min_level, int* roi_batch_id_data, int* sub_lod_list,
-    int* target_lvls) {
+    int* target_lvls, bool pixel_offset = true) {
   CUDA_KERNEL_LOOP(i, nthreads) {
     const T* offset_roi = rois + i * BBoxSize;
     int roi_batch_ind = roi_batch_id_data[i];
     // get the target level of current rois
-    T roi_area = RoIArea(offset_roi, false);
+    T roi_area = RoIArea(offset_roi, pixel_offset);
     T roi_scale = sqrt(roi_area);
     int tgt_lvl = floor(
-        log2(roi_scale / static_cast<T>(refer_scale) + (T)1e-6) + refer_level);
+        log2(roi_scale / static_cast<T>(refer_scale) + (T)1e-8) + refer_level);
     tgt_lvl = min(max_level, max(tgt_lvl, min_level));
     target_lvls[i] = tgt_lvl;
     // compute number of rois in the same batch and same target level
@@ -73,6 +73,7 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
     const int max_level = ctx.Attr<int>("max_level");
     const int refer_level = ctx.Attr<int>("refer_level");
     const int refer_scale = ctx.Attr<int>("refer_scale");
+    const bool pixel_offset = ctx.Attr<bool>("pixel_offset");
     int num_level = max_level - min_level + 1;
 
     // check that the fpn_rois is not empty
@@ -126,7 +127,7 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
     GPUDistFpnProposalsHelper<T><<<dist_blocks, threads>>>(
         roi_num, fpn_rois->data<T>(), lod_size, refer_level, refer_scale,
         max_level, min_level, roi_batch_id_list_gpu.data<int>(),
-        sub_lod_list_data, target_lvls_data);
+        sub_lod_list_data, target_lvls_data, pixel_offset);
     dev_ctx.Wait();
     auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace());
 
diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h
index 465435637cf..e3c125b0a68 100644
--- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h
+++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h
@@ -44,7 +44,7 @@ inline std::vector<size_t> GetLodFromRoisNum(const Tensor* rois_num) {
 }
 
 template <typename T>
-static inline T BBoxArea(const T* box, bool normalized) {
+static inline T BBoxArea(const T* box, bool pixel_offset) {
   if (box[2] < box[0] || box[3] < box[1]) {
     // If coordinate values are is invalid
     // (e.g. xmax < xmin or ymax < ymin), return 0.
@@ -52,11 +52,11 @@ static inline T BBoxArea(const T* box, bool normalized) {
   } else {
     const T w = box[2] - box[0];
     const T h = box[3] - box[1];
-    if (normalized) {
-      return w * h;
-    } else {
+    if (pixel_offset) {
       // If coordinate values are not within range [0, 1].
       return (w + 1) * (h + 1);
+    } else {
+      return w * h;
     }
   }
 }
@@ -77,6 +77,7 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
     const int max_level = context.Attr<int>("max_level");
     const int refer_level = context.Attr<int>("refer_level");
     const int refer_scale = context.Attr<int>("refer_scale");
+    const bool pixel_offset = context.Attr<bool>("pixel_offset");
     const int num_level = max_level - min_level + 1;
 
     // check that the fpn_rois is not empty
@@ -108,7 +109,7 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
       const T* rois_data = fpn_rois_slice.data<T>();
       for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) {
         // get the target level of current rois
-        T roi_scale = std::sqrt(BBoxArea(rois_data, false));
+        T roi_scale = std::sqrt(BBoxArea(rois_data, pixel_offset));
         int tgt_lvl = std::floor(std::log2(roi_scale / refer_scale + (T)1e-6) +
                                  refer_level);
         tgt_lvl = std::min(max_level, std::max(tgt_lvl, min_level));
diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc
index 7c2fd599fa6..44554a941dc 100644
--- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc
@@ -87,6 +87,7 @@ class GenerateProposalsV2Kernel : public framework::OpKernel<T> {
     float nms_thresh = context.Attr<float>("nms_thresh");
     float min_size = context.Attr<float>("min_size");
     float eta = context.Attr<float>("eta");
+    bool pixel_offset = context.Attr<bool>("pixel_offset");
 
     auto &dev_ctx =
         context.template device_context<platform::CPUDeviceContext>();
@@ -134,10 +135,10 @@ class GenerateProposalsV2Kernel : public framework::OpKernel<T> {
       bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4});
       scores_slice.Resize({h_score * w_score * c_score, 1});
 
-      std::pair<Tensor, Tensor> tensor_pair =
-          ProposalForOneImage(dev_ctx, im_shape_slice, anchors, variances,
-                              bbox_deltas_slice, scores_slice, pre_nms_top_n,
-                              post_nms_top_n, nms_thresh, min_size, eta);
+      std::pair<Tensor, Tensor> tensor_pair = ProposalForOneImage(
+          dev_ctx, im_shape_slice, anchors, variances, bbox_deltas_slice,
+          scores_slice, pre_nms_top_n, post_nms_top_n, nms_thresh, min_size,
+          eta, pixel_offset);
       Tensor &proposals = tensor_pair.first;
       Tensor &scores = tensor_pair.second;
 
@@ -168,7 +169,7 @@ class GenerateProposalsV2Kernel : public framework::OpKernel<T> {
       const Tensor &bbox_deltas_slice,  // [M, 4]
       const Tensor &scores_slice,       // [N, 1]
       int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size,
-      float eta) const {
+      float eta, bool pixel_offset = true) const {
     auto *scores_data = scores_slice.data<T>();
 
     // Sort index
@@ -203,12 +204,15 @@ class GenerateProposalsV2Kernel : public framework::OpKernel<T> {
 
     Tensor proposals;
     proposals.mutable_data<T>({index_t.numel(), 4}, ctx.GetPlace());
-    BoxCoder<T>(ctx, &anchor_sel, &bbox_sel, &var_sel, &proposals);
+    BoxCoder<T>(ctx, &anchor_sel, &bbox_sel, &var_sel, &proposals,
+                pixel_offset);
 
-    ClipTiledBoxes<T>(ctx, im_shape_slice, proposals, &proposals, false);
+    ClipTiledBoxes<T>(ctx, im_shape_slice, proposals, &proposals, false,
+                      pixel_offset);
 
     Tensor keep;
-    FilterBoxes<T>(ctx, &proposals, min_size, im_shape_slice, false, &keep);
+    FilterBoxes<T>(ctx, &proposals, min_size, im_shape_slice, false, &keep,
+                   pixel_offset);
     // Handle the case when there is no keep index left
     if (keep.numel() == 0) {
       math::SetConstant<platform::CPUDeviceContext, T> set_zero;
@@ -229,7 +233,8 @@ class GenerateProposalsV2Kernel : public framework::OpKernel<T> {
       return std::make_pair(bbox_sel, scores_filter);
     }
 
-    Tensor keep_nms = NMS<T>(ctx, &bbox_sel, &scores_filter, nms_thresh, eta);
+    Tensor keep_nms =
+        NMS<T>(ctx, &bbox_sel, &scores_filter, nms_thresh, eta, pixel_offset);
 
     if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) {
       keep_nms.Resize({post_nms_top_n});
@@ -280,6 +285,9 @@ class GenerateProposalsV2OpMaker : public framework::OpProtoAndCheckerMaker {
                    "Proposal height and width both need to be greater "
                    "than this min_size.");
     AddAttr<float>("eta", "The parameter for adaptive NMS.");
+    AddAttr<bool>("pixel_offset", "(bool, default True), "
+                  "If true, im_shape pixel offset is 1.")
+        .SetDefault(true);
     AddComment(R"DOC(
 This operator is the second version of generate_proposals op to generate 
 bounding box proposals for Faster RCNN.
@@ -312,3 +320,8 @@ REGISTER_OPERATOR(
 REGISTER_OP_CPU_KERNEL(generate_proposals_v2,
                        ops::GenerateProposalsV2Kernel<float>,
                        ops::GenerateProposalsV2Kernel<double>);
+REGISTER_OP_VERSION(generate_proposals_v2)
+    .AddCheckpoint(
+        R"ROC(Register generate_proposals_v2 for adding the attribute of pixel_offset)ROC",
+        paddle::framework::compatible::OpVersionDesc().NewAttr(
+            "pixel_offset", "If true, im_shape pixel offset is 1.", true));
diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu
index 70020cdc64e..6244827f685 100644
--- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu
+++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu
@@ -36,7 +36,7 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
     const Tensor &bbox_deltas,  // [M, 4]
     const Tensor &scores,       // [N, 1]
     int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size,
-    float eta) {
+    float eta, bool pixel_offset) {
   // 1. pre nms
   Tensor scores_sort, index_sort;
   SortDescending<T>(ctx, scores, &scores_sort, &index_sort);
@@ -54,7 +54,8 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
     platform::ForRange<platform::CUDADeviceContext> for_range(ctx, pre_nms_num);
     for_range(BoxDecodeAndClipFunctor<T>{
         anchors.data<T>(), bbox_deltas.data<T>(), variances.data<T>(),
-        index_sort.data<int>(), im_shape.data<T>(), proposals.data<T>()});
+        index_sort.data<int>(), im_shape.data<T>(), proposals.data<T>(),
+        pixel_offset});
   }
 
   // 3. filter
@@ -65,7 +66,7 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
   auto stream = ctx.stream();
   FilterBBoxes<T, 512><<<1, 512, 0, stream>>>(
       proposals.data<T>(), im_shape.data<T>(), min_size, pre_nms_num,
-      keep_num_t.data<int>(), keep_index.data<int>(), false);
+      keep_num_t.data<int>(), keep_index.data<int>(), false, pixel_offset);
   int keep_num;
   const auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace());
   memory::Copy(platform::CPUPlace(), &keep_num, gpu_place,
@@ -94,7 +95,8 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
 
   // 4. nms
   Tensor keep_nms;
-  NMS<T>(ctx, proposals_filter, keep_index, nms_thresh, &keep_nms);
+  NMS<T>(ctx, proposals_filter, keep_index, nms_thresh, &keep_nms,
+         pixel_offset);
   if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) {
     keep_nms.Resize({post_nms_top_n});
   }
@@ -129,6 +131,7 @@ class CUDAGenerateProposalsV2Kernel : public framework::OpKernel<T> {
     float nms_thresh = context.Attr<float>("nms_thresh");
     float min_size = context.Attr<float>("min_size");
     float eta = context.Attr<float>("eta");
+    bool pixel_offset = context.Attr<bool>("pixel_offset");
     PADDLE_ENFORCE_GE(eta, 1.,
                       platform::errors::InvalidArgument(
                           "Not support adaptive NMS. The attribute 'eta' "
@@ -184,10 +187,10 @@ class CUDAGenerateProposalsV2Kernel : public framework::OpKernel<T> {
       bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4});
       scores_slice.Resize({h_score * w_score * c_score, 1});
 
-      std::pair<Tensor, Tensor> box_score_pair =
-          ProposalForOneImage<T>(dev_ctx, im_shape_slice, anchors, variances,
-                                 bbox_deltas_slice, scores_slice, pre_nms_top_n,
-                                 post_nms_top_n, nms_thresh, min_size, eta);
+      std::pair<Tensor, Tensor> box_score_pair = ProposalForOneImage<T>(
+          dev_ctx, im_shape_slice, anchors, variances, bbox_deltas_slice,
+          scores_slice, pre_nms_top_n, post_nms_top_n, nms_thresh, min_size,
+          eta, pixel_offset);
 
       Tensor &proposals = box_score_pair.first;
       Tensor &scores = box_score_pair.second;
diff --git a/paddle/fluid/operators/detection/nms_util.h b/paddle/fluid/operators/detection/nms_util.h
index febdee82635..0e448d42fc2 100644
--- a/paddle/fluid/operators/detection/nms_util.h
+++ b/paddle/fluid/operators/detection/nms_util.h
@@ -130,7 +130,7 @@ static inline framework::Tensor VectorToTensor(
 template <class T>
 framework::Tensor NMS(const platform::DeviceContext& ctx,
                       framework::Tensor* bbox, framework::Tensor* scores,
-                      T nms_threshold, float eta) {
+                      T nms_threshold, float eta, bool pixel_offset = true) {
   int64_t num_boxes = bbox->dims()[0];
   // 4: [xmin ymin xmax ymax]
   int64_t box_size = bbox->dims()[1];
@@ -144,13 +144,15 @@ framework::Tensor NMS(const platform::DeviceContext& ctx,
   int selected_num = 0;
   T adaptive_threshold = nms_threshold;
   const T* bbox_data = bbox->data<T>();
+  bool normalized = pixel_offset ? false : true;
   while (sorted_indices.size() != 0) {
     int idx = sorted_indices.back().second;
     bool flag = true;
     for (int kept_idx : selected_indices) {
       if (flag) {
-        T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
-                                      bbox_data + kept_idx * box_size, false);
+        T overlap =
+            JaccardOverlap<T>(bbox_data + idx * box_size,
+                              bbox_data + kept_idx * box_size, normalized);
         flag = (overlap <= adaptive_threshold);
       } else {
         break;
diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc
index 6a4a88a0045..5627b4f229e 100644
--- a/paddle/fluid/operators/roi_align_op.cc
+++ b/paddle/fluid/operators/roi_align_op.cc
@@ -175,6 +175,10 @@ class ROIAlignOpMaker : public framework::OpProtoAndCheckerMaker {
                  "If <=0, then grid points are adaptive to roi_width "
                  "and pooled_w, likewise for height")
         .SetDefault(-1);
+    AddAttr<bool>("aligned",
+                  "(bool, default False),"
+                  "If true, pixel shift it by -0.5 for align more perfectly")
+        .SetDefault(false);
     AddComment(R"DOC(
 **RoIAlign Operator**
 
@@ -242,7 +246,14 @@ REGISTER_OP_VERSION(roi_align)
             "it is not used in object detection models yet."))
     .AddCheckpoint(
         R"ROC(
-              Upgrade roi_align add a new input [RoisNum])ROC",
+             Upgrade roi_align add a new input [RoisNum])ROC",
         paddle::framework::compatible::OpVersionDesc().NewInput(
             "RoisNum",
-            "The number of RoIs in each image. RoisNum is dispensable."));
+            "The number of RoIs in each image. RoisNum is dispensable."))
+    .AddCheckpoint(
+        R"ROC(
+             Upgrade roi_align add a new input [aligned])ROC",
+        paddle::framework::compatible::OpVersionDesc().NewAttr(
+            "aligned",
+            "If true, pixel shift it by -0.5 for align more perfectly.",
+            false));
diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu
index 3a4ce55f4fb..074a00fb1c3 100644
--- a/paddle/fluid/operators/roi_align_op.cu
+++ b/paddle/fluid/operators/roi_align_op.cu
@@ -105,7 +105,8 @@ __global__ void GPUROIAlignForward(
     const int nthreads, const T* input_data, const T* input_rois,
     const float spatial_scale, const int channels, const int height,
     const int width, const int pooled_height, const int pooled_width,
-    const int sampling_ratio, int* roi_batch_id_data, T* output_data) {
+    const int sampling_ratio, int* roi_batch_id_data, T* output_data,
+    const bool continuous_coordinate) {
   CUDA_KERNEL_LOOP(i, nthreads) {
     int pw = i % pooled_width;
     int ph = (i / pooled_width) % pooled_height;
@@ -115,13 +116,19 @@ __global__ void GPUROIAlignForward(
     const T* offset_input_rois = input_rois + n * kROISize;
     int roi_batch_ind = roi_batch_id_data[n];
 
-    T roi_xmin = offset_input_rois[0] * spatial_scale;
-    T roi_ymin = offset_input_rois[1] * spatial_scale;
-    T roi_xmax = offset_input_rois[2] * spatial_scale;
-    T roi_ymax = offset_input_rois[3] * spatial_scale;
+    T roi_offset = continuous_coordinate ? static_cast<T>(0.5) : 0;
+    T roi_xmin = offset_input_rois[0] * spatial_scale - roi_offset;
+    T roi_ymin = offset_input_rois[1] * spatial_scale - roi_offset;
+    T roi_xmax = offset_input_rois[2] * spatial_scale - roi_offset;
+    T roi_ymax = offset_input_rois[3] * spatial_scale - roi_offset;
 
-    T roi_width = max(roi_xmax - roi_xmin, static_cast<T>(1.));
-    T roi_height = max(roi_ymax - roi_ymin, static_cast<T>(1.));
+    T roi_width = roi_xmax - roi_xmin;
+    T roi_height = roi_ymax - roi_ymin;
+
+    if (!continuous_coordinate) {
+      roi_width = max(roi_width, static_cast<T>(1.));
+      roi_height = max(roi_height, static_cast<T>(1.));
+    }
     T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
     T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
 
@@ -153,14 +160,12 @@ __global__ void GPUROIAlignForward(
 }
 
 template <typename T>
-__global__ void GPUROIAlignBackward(const int nthreads, const T* input_rois,
-                                    const T* out_grad, const int num_rois,
-                                    const float spatial_scale,
-                                    const int channels, const int height,
-                                    const int width, const int pooled_height,
-                                    const int pooled_width,
-                                    const int sampling_ratio,
-                                    int* roi_batch_id_data, T* input_grad) {
+__global__ void GPUROIAlignBackward(
+    const int nthreads, const T* input_rois, const T* out_grad,
+    const int num_rois, const float spatial_scale, const int channels,
+    const int height, const int width, const int pooled_height,
+    const int pooled_width, const int sampling_ratio, int* roi_batch_id_data,
+    T* input_grad, const bool continuous_coordinate) {
   CUDA_KERNEL_LOOP(i, nthreads) {
     int pw = i % pooled_width;
     int ph = (i / pooled_width) % pooled_height;
@@ -169,13 +174,18 @@ __global__ void GPUROIAlignBackward(const int nthreads, const T* input_rois,
     const T* offset_input_rois = input_rois + n * kROISize;
     int roi_batch_ind = roi_batch_id_data[n];
 
-    T roi_xmin = offset_input_rois[0] * spatial_scale;
-    T roi_ymin = offset_input_rois[1] * spatial_scale;
-    T roi_xmax = offset_input_rois[2] * spatial_scale;
-    T roi_ymax = offset_input_rois[3] * spatial_scale;
-
-    T roi_width = max(roi_xmax - roi_xmin, static_cast<T>(1.));
-    T roi_height = max(roi_ymax - roi_ymin, static_cast<T>(1.));
+    T roi_offset = continuous_coordinate ? T(0.5) : 0;
+    T roi_xmin = offset_input_rois[0] * spatial_scale - roi_offset;
+    T roi_ymin = offset_input_rois[1] * spatial_scale - roi_offset;
+    T roi_xmax = offset_input_rois[2] * spatial_scale - roi_offset;
+    T roi_ymax = offset_input_rois[3] * spatial_scale - roi_offset;
+
+    T roi_width = roi_xmax - roi_xmin;
+    T roi_height = roi_ymax - roi_ymin;
+    if (!continuous_coordinate) {
+      roi_width = max(roi_width, static_cast<T>(1.));
+      roi_height = max(roi_height, static_cast<T>(1.));
+    }
     T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
     T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
 
@@ -236,6 +246,7 @@ class GPUROIAlignOpKernel : public framework::OpKernel<T> {
     auto pooled_width = ctx.Attr<int>("pooled_width");
     auto spatial_scale = ctx.Attr<float>("spatial_scale");
     auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
+    auto aligned = ctx.Attr<bool>("aligned");
 
     auto in_dims = in->dims();
     int batch_size = in_dims[0];
@@ -316,7 +327,7 @@ class GPUROIAlignOpKernel : public framework::OpKernel<T> {
     GPUROIAlignForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
         output_size, in->data<T>(), rois->data<T>(), spatial_scale, channels,
         height, width, pooled_height, pooled_width, sampling_ratio, roi_id_data,
-        out->mutable_data<T>(ctx.GetPlace()));
+        out->mutable_data<T>(ctx.GetPlace()), aligned);
   }
 };
 
@@ -334,6 +345,7 @@ class GPUROIAlignGradOpKernel : public framework::OpKernel<T> {
     auto pooled_width = ctx.Attr<int>("pooled_width");
     auto spatial_scale = ctx.Attr<float>("spatial_scale");
     auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
+    auto aligned = ctx.Attr<bool>("aligned");
 
     int rois_num = rois->dims()[0];
     int channels = in->dims()[1];
@@ -390,8 +402,8 @@ class GPUROIAlignGradOpKernel : public framework::OpKernel<T> {
       GPUROIAlignBackward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
           output_grad_size, rois->data<T>(), out_grad->data<T>(), rois_num,
           spatial_scale, channels, height, width, pooled_height, pooled_width,
-          sampling_ratio, roi_id_data,
-          in_grad->mutable_data<T>(ctx.GetPlace()));
+          sampling_ratio, roi_id_data, in_grad->mutable_data<T>(ctx.GetPlace()),
+          aligned);
     }
   }
 };
diff --git a/paddle/fluid/operators/roi_align_op.h b/paddle/fluid/operators/roi_align_op.h
index 066125a92fb..d03cd617e6d 100644
--- a/paddle/fluid/operators/roi_align_op.h
+++ b/paddle/fluid/operators/roi_align_op.h
@@ -145,6 +145,7 @@ class CPUROIAlignOpKernel : public framework::OpKernel<T> {
     auto pooled_width = ctx.Attr<int>("pooled_width");
     auto spatial_scale = ctx.Attr<float>("spatial_scale");
     auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
+    auto aligned = ctx.Attr<bool>("aligned");
 
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
 
@@ -215,15 +216,21 @@ class CPUROIAlignOpKernel : public framework::OpKernel<T> {
     }
     T* output_data = out->mutable_data<T>(ctx.GetPlace());
     const T* rois_data = rois->data<T>();
+    T roi_offset = aligned ? T(0.5) : 0;
     for (int n = 0; n < rois_num; ++n) {
       int roi_batch_id = roi_batch_id_data[n];
-      T roi_xmin = rois_data[0] * spatial_scale;
-      T roi_ymin = rois_data[1] * spatial_scale;
-      T roi_xmax = rois_data[2] * spatial_scale;
-      T roi_ymax = rois_data[3] * spatial_scale;
+      T roi_xmin = rois_data[0] * spatial_scale - roi_offset;
+      T roi_ymin = rois_data[1] * spatial_scale - roi_offset;
+      T roi_xmax = rois_data[2] * spatial_scale - roi_offset;
+      T roi_ymax = rois_data[3] * spatial_scale - roi_offset;
+
+      T roi_width = roi_xmax - roi_xmin;
+      T roi_height = roi_ymax - roi_ymin;
+      if (!aligned) {
+        roi_width = std::max(roi_width, static_cast<T>(1.));
+        roi_height = std::max(roi_height, static_cast<T>(1.));
+      }
 
-      T roi_width = std::max(roi_xmax - roi_xmin, static_cast<T>(1.));
-      T roi_height = std::max(roi_ymax - roi_ymin, static_cast<T>(1.));
       T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
       T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
       const T* batch_data = input_data + roi_batch_id * in_stride[0];
@@ -290,6 +297,7 @@ class CPUROIAlignGradOpKernel : public framework::OpKernel<T> {
     auto spatial_scale = ctx.Attr<float>("spatial_scale");
     auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
     auto in_dims = in->dims();
+    auto aligned = ctx.Attr<bool>("aligned");
 
     int channels = in_dims[1];
     int height = in_dims[2];
@@ -344,14 +352,21 @@ class CPUROIAlignGradOpKernel : public framework::OpKernel<T> {
     auto roi_stride = framework::stride(rois->dims());
     auto out_stride = framework::stride(out_grad->dims());
 
+    T roi_offset = aligned ? T(0.5) : 0;
     for (int n = 0; n < rois_num; ++n) {
       int roi_batch_idx = roi_batch_id_data[n];
-      T roi_xmin = rois_data[0] * spatial_scale;
-      T roi_ymin = rois_data[1] * spatial_scale;
-      T roi_xmax = rois_data[2] * spatial_scale;
-      T roi_ymax = rois_data[3] * spatial_scale;
-      T roi_width = std::max(roi_xmax - roi_xmin, static_cast<T>(1.));
-      T roi_height = std::max(roi_ymax - roi_ymin, static_cast<T>(1.));
+      T roi_xmin = rois_data[0] * spatial_scale - roi_offset;
+      T roi_ymin = rois_data[1] * spatial_scale - roi_offset;
+      T roi_xmax = rois_data[2] * spatial_scale - roi_offset;
+      T roi_ymax = rois_data[3] * spatial_scale - roi_offset;
+
+      T roi_width = roi_xmax - roi_xmin;
+      T roi_height = roi_ymax - roi_ymin;
+
+      if (!aligned) {
+        roi_width = std::max(roi_width, static_cast<T>(1.));
+        roi_height = std::max(roi_height, static_cast<T>(1.));
+      }
       T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
       T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
       for (int c = 0; c < channels; ++c) {
diff --git a/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py b/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py
index ec0125b28ed..2cd7889d6e3 100644
--- a/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py
+++ b/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py
@@ -31,7 +31,8 @@ class TestDistributeFPNProposalsOp(OpTest):
             'max_level': self.roi_max_level,
             'min_level': self.roi_min_level,
             'refer_scale': self.canonical_scale,
-            'refer_level': self.canonical_level
+            'refer_level': self.canonical_level,
+            'pixel_offset': self.pixel_offset,
         }
         output = [('out%d' % i, self.rois_fpn[i])
                   for i in range(len(self.rois_fpn))]
@@ -47,10 +48,12 @@ class TestDistributeFPNProposalsOp(OpTest):
         self.canonical_scale = 224
         self.canonical_level = 4
         self.images_shape = [512, 512]
+        self.pixel_offset = True
 
     def boxes_area(self, boxes):
-        w = (boxes[:, 2] - boxes[:, 0] + 1)
-        h = (boxes[:, 3] - boxes[:, 1] + 1)
+        offset = 1 if self.pixel_offset else 0
+        w = (boxes[:, 2] - boxes[:, 0] + offset)
+        h = (boxes[:, 3] - boxes[:, 1] + offset)
         areas = w * h
         assert np.all(areas >= 0), 'Negative areas founds'
         return areas
@@ -59,7 +62,7 @@ class TestDistributeFPNProposalsOp(OpTest):
         s = np.sqrt(self.boxes_area(rois))
         s0 = self.canonical_scale
         lvl0 = self.canonical_level
-        target_lvls = np.floor(lvl0 + np.log2(s / s0 + 1e-6))
+        target_lvls = np.floor(lvl0 + np.log2(s / s0 + 1e-8))
         target_lvls = np.clip(target_lvls, lvl_min, lvl_max)
         return target_lvls
 
@@ -131,7 +134,8 @@ class TestDistributeFPNProposalsOpWithRoisNum(TestDistributeFPNProposalsOp):
             'max_level': self.roi_max_level,
             'min_level': self.roi_min_level,
             'refer_scale': self.canonical_scale,
-            'refer_level': self.canonical_level
+            'refer_level': self.canonical_level,
+            'pixel_offset': self.pixel_offset,
         }
         output = [('out%d' % i, self.rois_fpn[i])
                   for i in range(len(self.rois_fpn))]
@@ -147,5 +151,16 @@ class TestDistributeFPNProposalsOpWithRoisNum(TestDistributeFPNProposalsOp):
         }
 
 
+class TestDistributeFPNProposalsOpNoOffset(
+        TestDistributeFPNProposalsOpWithRoisNum):
+    def init_test_case(self):
+        self.roi_max_level = 5
+        self.roi_min_level = 2
+        self.canonical_scale = 224
+        self.canonical_level = 4
+        self.images_shape = [512, 512]
+        self.pixel_offset = False
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
index 8304016d7d0..6b9eeaa0867 100644
--- a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
+++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
@@ -21,7 +21,6 @@ import math
 import paddle
 import paddle.fluid as fluid
 from op_test import OpTest
-from test_multiclass_nms_op import nms
 from test_anchor_generator_op import anchor_generator_in_python
 import copy
 
@@ -111,18 +110,19 @@ def proposal_for_one_image(im_info, all_anchors, variances, bbox_deltas, scores,
     return proposals, scores
 
 
-def box_coder(all_anchors, bbox_deltas, variances):
+def box_coder(all_anchors, bbox_deltas, variances, pixel_offset=True):
     """
     Decode proposals by anchors and bbox_deltas from RPN 
     """
+    offset = 1 if pixel_offset else 0
     #proposals: xmin, ymin, xmax, ymax
     proposals = np.zeros_like(bbox_deltas, dtype=np.float32)
 
     #anchor_loc: width, height, center_x, center_y
     anchor_loc = np.zeros_like(bbox_deltas, dtype=np.float32)
 
-    anchor_loc[:, 0] = all_anchors[:, 2] - all_anchors[:, 0] + 1
-    anchor_loc[:, 1] = all_anchors[:, 3] - all_anchors[:, 1] + 1
+    anchor_loc[:, 0] = all_anchors[:, 2] - all_anchors[:, 0] + offset
+    anchor_loc[:, 1] = all_anchors[:, 3] - all_anchors[:, 1] + offset
     anchor_loc[:, 2] = all_anchors[:, 0] + 0.5 * anchor_loc[:, 0]
     anchor_loc[:, 3] = all_anchors[:, 1] + 0.5 * anchor_loc[:, 1]
 
@@ -152,51 +152,60 @@ def box_coder(all_anchors, bbox_deltas, variances):
             pred_bbox[i, 3] = math.exp(
                 min(bbox_deltas[i, 3], math.log(1000 / 16.0))) * anchor_loc[i,
                                                                             1]
-
     proposals[:, 0] = pred_bbox[:, 0] - pred_bbox[:, 2] / 2
     proposals[:, 1] = pred_bbox[:, 1] - pred_bbox[:, 3] / 2
-    proposals[:, 2] = pred_bbox[:, 0] + pred_bbox[:, 2] / 2 - 1
-    proposals[:, 3] = pred_bbox[:, 1] + pred_bbox[:, 3] / 2 - 1
+    proposals[:, 2] = pred_bbox[:, 0] + pred_bbox[:, 2] / 2 - offset
+    proposals[:, 3] = pred_bbox[:, 1] + pred_bbox[:, 3] / 2 - offset
 
     return proposals
 
 
-def clip_tiled_boxes(boxes, im_shape):
+def clip_tiled_boxes(boxes, im_shape, pixel_offset=True):
     """Clip boxes to image boundaries. im_shape is [height, width] and boxes
     has shape (N, 4 * num_tiled_boxes)."""
     assert boxes.shape[1] % 4 == 0, \
         'boxes.shape[1] is {:d}, but must be divisible by 4.'.format(
         boxes.shape[1]
     )
+    offset = 1 if pixel_offset else 0
     # x1 >= 0
-    boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0)
+    boxes[:, 0::4] = np.maximum(
+        np.minimum(boxes[:, 0::4], im_shape[1] - offset), 0)
     # y1 >= 0
-    boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0)
+    boxes[:, 1::4] = np.maximum(
+        np.minimum(boxes[:, 1::4], im_shape[0] - offset), 0)
     # x2 < im_shape[1]
-    boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0)
+    boxes[:, 2::4] = np.maximum(
+        np.minimum(boxes[:, 2::4], im_shape[1] - offset), 0)
     # y2 < im_shape[0]
-    boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0)
+    boxes[:, 3::4] = np.maximum(
+        np.minimum(boxes[:, 3::4], im_shape[0] - offset), 0)
     return boxes
 
 
-def filter_boxes(boxes, min_size, im_info):
+def filter_boxes(boxes, min_size, im_info, pixel_offset=True):
     """Only keep boxes with both sides >= min_size and center within the image.
     """
     # Scale min_size to match image scale
     im_scale = im_info[2]
     min_size = max(min_size, 1.0)
-    ws = boxes[:, 2] - boxes[:, 0] + 1
-    hs = boxes[:, 3] - boxes[:, 1] + 1
-    ws_orig_scale = (boxes[:, 2] - boxes[:, 0]) / im_scale + 1
-    hs_orig_scale = (boxes[:, 3] - boxes[:, 1]) / im_scale + 1
-    x_ctr = boxes[:, 0] + ws / 2.
-    y_ctr = boxes[:, 1] + hs / 2.
-    keep = np.where((ws_orig_scale >= min_size) & (hs_orig_scale >= min_size) &
-                    (x_ctr < im_info[1]) & (y_ctr < im_info[0]))[0]
+    offset = 1 if pixel_offset else 0
+    ws = boxes[:, 2] - boxes[:, 0] + offset
+    hs = boxes[:, 3] - boxes[:, 1] + offset
+    if pixel_offset:
+        ws_orig_scale = (boxes[:, 2] - boxes[:, 0]) / im_scale + 1
+        hs_orig_scale = (boxes[:, 3] - boxes[:, 1]) / im_scale + 1
+        x_ctr = boxes[:, 0] + ws / 2.
+        y_ctr = boxes[:, 1] + hs / 2.
+        keep = np.where((ws_orig_scale >= min_size) & (
+            hs_orig_scale >= min_size) & (x_ctr < im_info[1]) & (y_ctr <
+                                                                 im_info[0]))[0]
+    else:
+        keep = np.where((ws >= min_size) & (hs >= min_size))[0]
     return keep
 
 
-def iou(box_a, box_b):
+def iou(box_a, box_b, pixel_offset=True):
     """
 	Apply intersection-over-union overlap between box_a and box_b
     """
@@ -209,9 +218,9 @@ def iou(box_a, box_b):
     ymin_b = min(box_b[1], box_b[3])
     xmax_b = max(box_b[0], box_b[2])
     ymax_b = max(box_b[1], box_b[3])
-
-    area_a = (ymax_a - ymin_a + 1) * (xmax_a - xmin_a + 1)
-    area_b = (ymax_b - ymin_b + 1) * (xmax_b - xmin_b + 1)
+    offset = 1 if pixel_offset else 0
+    area_a = (ymax_a - ymin_a + offset) * (xmax_a - xmin_a + offset)
+    area_b = (ymax_b - ymin_b + offset) * (xmax_b - xmin_b + offset)
     if area_a <= 0 and area_b <= 0:
         return 0.0
 
@@ -220,14 +229,14 @@ def iou(box_a, box_b):
     xb = min(xmax_a, xmax_b)
     yb = min(ymax_a, ymax_b)
 
-    inter_area = max(xb - xa + 1, 0.0) * max(yb - ya + 1, 0.0)
+    inter_area = max(xb - xa + offset, 0.0) * max(yb - ya + offset, 0.0)
 
     iou_ratio = inter_area / (area_a + area_b - inter_area)
 
     return iou_ratio
 
 
-def nms(boxes, scores, nms_threshold, eta=1.0):
+def nms(boxes, scores, nms_threshold, eta=1.0, pixel_offset=True):
     """Apply non-maximum suppression at test time to avoid detecting too many
     overlapping bounding boxes for a given object.
     Args:
@@ -252,7 +261,9 @@ def nms(boxes, scores, nms_threshold, eta=1.0):
         for k in range(len(selected_indices)):
             if keep:
                 kept_idx = selected_indices[k]
-                overlap = iou(boxes[idx], boxes[kept_idx])
+                overlap = iou(boxes[idx],
+                              boxes[kept_idx],
+                              pixel_offset=pixel_offset)
                 keep = True if overlap <= adaptive_threshold else False
             else:
                 break
diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals_v2_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposals_v2_op.py
index 26c443008db..0a670045187 100644
--- a/python/paddle/fluid/tests/unittests/test_generate_proposals_v2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_v2_op.py
@@ -21,7 +21,6 @@ import math
 import paddle
 import paddle.fluid as fluid
 from op_test import OpTest
-from test_multiclass_nms_op import nms
 from test_anchor_generator_op import anchor_generator_in_python
 import copy
 from test_generate_proposals_op import clip_tiled_boxes, box_coder, nms
@@ -29,7 +28,7 @@ from test_generate_proposals_op import clip_tiled_boxes, box_coder, nms
 
 def generate_proposals_v2_in_python(scores, bbox_deltas, im_shape, anchors,
                                     variances, pre_nms_topN, post_nms_topN,
-                                    nms_thresh, min_size, eta):
+                                    nms_thresh, min_size, eta, pixel_offset):
     all_anchors = anchors.reshape(-1, 4)
     rois = np.empty((0, 5), dtype=np.float32)
     roi_probs = np.empty((0, 1), dtype=np.float32)
@@ -42,7 +41,8 @@ def generate_proposals_v2_in_python(scores, bbox_deltas, im_shape, anchors,
         img_i_boxes, img_i_probs = proposal_for_one_image(
             im_shape[img_idx, :], all_anchors, variances,
             bbox_deltas[img_idx, :, :, :], scores[img_idx, :, :, :],
-            pre_nms_topN, post_nms_topN, nms_thresh, min_size, eta)
+            pre_nms_topN, post_nms_topN, nms_thresh, min_size, eta,
+            pixel_offset)
         rois_num.append(img_i_probs.shape[0])
         rpn_rois.append(img_i_boxes)
         rpn_roi_probs.append(img_i_probs)
@@ -52,7 +52,7 @@ def generate_proposals_v2_in_python(scores, bbox_deltas, im_shape, anchors,
 
 def proposal_for_one_image(im_shape, all_anchors, variances, bbox_deltas,
                            scores, pre_nms_topN, post_nms_topN, nms_thresh,
-                           min_size, eta):
+                           min_size, eta, pixel_offset):
     # Transpose and reshape predicted bbox transformations to get them
     # into the same order as the anchors:
     #   - bbox deltas will be (4 * A, H, W) format from conv output
@@ -83,12 +83,12 @@ def proposal_for_one_image(im_shape, all_anchors, variances, bbox_deltas,
     scores = scores[order, :]
     bbox_deltas = bbox_deltas[order, :]
     all_anchors = all_anchors[order, :]
-    proposals = box_coder(all_anchors, bbox_deltas, variances)
+    proposals = box_coder(all_anchors, bbox_deltas, variances, pixel_offset)
     # clip proposals to image (may result in proposals with zero area
     # that will be removed in the next step)
-    proposals = clip_tiled_boxes(proposals, im_shape)
+    proposals = clip_tiled_boxes(proposals, im_shape, pixel_offset)
     # remove predicted boxes with height or width < min_size
-    keep = filter_boxes(proposals, min_size, im_shape)
+    keep = filter_boxes(proposals, min_size, im_shape, pixel_offset)
     if len(keep) == 0:
         proposals = np.zeros((1, 4)).astype('float32')
         scores = np.zeros((1, 1)).astype('float32')
@@ -103,7 +103,8 @@ def proposal_for_one_image(im_shape, all_anchors, variances, bbox_deltas,
         keep = nms(boxes=proposals,
                    scores=scores,
                    nms_threshold=nms_thresh,
-                   eta=eta)
+                   eta=eta,
+                   pixel_offset=pixel_offset)
         if post_nms_topN > 0 and post_nms_topN < len(keep):
             keep = keep[:post_nms_topN]
         proposals = proposals[keep, :]
@@ -112,17 +113,21 @@ def proposal_for_one_image(im_shape, all_anchors, variances, bbox_deltas,
     return proposals, scores
 
 
-def filter_boxes(boxes, min_size, im_shape):
+def filter_boxes(boxes, min_size, im_shape, pixel_offset=True):
     """Only keep boxes with both sides >= min_size and center within the image.
     """
     # Scale min_size to match image scale
     min_size = max(min_size, 1.0)
-    ws = boxes[:, 2] - boxes[:, 0] + 1
-    hs = boxes[:, 3] - boxes[:, 1] + 1
-    x_ctr = boxes[:, 0] + ws / 2.
-    y_ctr = boxes[:, 1] + hs / 2.
-    keep = np.where((ws >= min_size) & (hs >= min_size) & (x_ctr < im_shape[1])
-                    & (y_ctr < im_shape[0]))[0]
+    offset = 1 if pixel_offset else 0
+    ws = boxes[:, 2] - boxes[:, 0] + offset
+    hs = boxes[:, 3] - boxes[:, 1] + offset
+    if pixel_offset:
+        x_ctr = boxes[:, 0] + ws / 2.
+        y_ctr = boxes[:, 1] + hs / 2.
+        keep = np.where((ws >= min_size) & (hs >= min_size) & (x_ctr < im_shape[
+            1]) & (y_ctr < im_shape[0]))[0]
+    else:
+        keep = np.where((ws >= min_size) & (hs >= min_size))[0]
     return keep
 
 
@@ -144,7 +149,8 @@ class TestGenerateProposalsV2Op(OpTest):
             'post_nms_topN': self.post_nms_topN,
             'nms_thresh': self.nms_thresh,
             'min_size': self.min_size,
-            'eta': self.eta
+            'eta': self.eta,
+            'pixel_offset': self.pixel_offset,
         }
 
         self.outputs = {
@@ -165,6 +171,7 @@ class TestGenerateProposalsV2Op(OpTest):
         self.nms_thresh = 0.7
         self.min_size = 3.0
         self.eta = 1.
+        self.pixel_offset = True
 
     def init_test_input(self):
         batch_size = 1
@@ -191,7 +198,7 @@ class TestGenerateProposalsV2Op(OpTest):
         self.rpn_rois, self.rpn_roi_probs, self.rois_num = generate_proposals_v2_in_python(
             self.scores, self.bbox_deltas, self.im_shape, self.anchors,
             self.variances, self.pre_nms_topN, self.post_nms_topN,
-            self.nms_thresh, self.min_size, self.eta)
+            self.nms_thresh, self.min_size, self.eta, self.pixel_offset)
 
 
 class TestGenerateProposalsV2OutLodOp(TestGenerateProposalsV2Op):
@@ -231,6 +238,17 @@ class TestGenerateProposalsV2OpNoBoxLeft(TestGenerateProposalsV2Op):
         self.nms_thresh = 0.7
         self.min_size = 1000.0
         self.eta = 1.
+        self.pixel_offset = True
+
+
+class TestGenerateProposalsV2OpNoOffset(TestGenerateProposalsV2Op):
+    def init_test_params(self):
+        self.pre_nms_topN = 12000  # train 12000, test 2000
+        self.post_nms_topN = 5000  # train 6000, test 1000
+        self.nms_thresh = 0.7
+        self.min_size = 3.0
+        self.eta = 1.
+        self.pixel_offset = False
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_roi_align_op.py b/python/paddle/fluid/tests/unittests/test_roi_align_op.py
index fb8a090b807..940a3e9f960 100644
--- a/python/paddle/fluid/tests/unittests/test_roi_align_op.py
+++ b/python/paddle/fluid/tests/unittests/test_roi_align_op.py
@@ -35,7 +35,8 @@ class TestROIAlignOp(OpTest):
             'spatial_scale': self.spatial_scale,
             'pooled_height': self.pooled_height,
             'pooled_width': self.pooled_width,
-            'sampling_ratio': self.sampling_ratio
+            'sampling_ratio': self.sampling_ratio,
+            'aligned': self.aligned,
         }
 
         self.outputs = {'Out': self.out_data}
@@ -53,6 +54,7 @@ class TestROIAlignOp(OpTest):
         self.pooled_height = 2
         self.pooled_width = 2
         self.sampling_ratio = -1
+        self.aligned = False
 
         self.x = np.random.random(self.x_dim).astype('float64')
 
@@ -115,16 +117,21 @@ class TestROIAlignOp(OpTest):
             (self.rois_num, self.channels, self.pooled_height,
              self.pooled_width)).astype('float64')
 
+        offset = 0.5 if self.aligned else 0.
         for i in range(self.rois_num):
             roi = self.rois[i]
             roi_batch_id = int(roi[0])
             x_i = self.x[roi_batch_id]
-            roi_xmin = roi[1] * self.spatial_scale
-            roi_ymin = roi[2] * self.spatial_scale
-            roi_xmax = roi[3] * self.spatial_scale
-            roi_ymax = roi[4] * self.spatial_scale
-            roi_width = max(roi_xmax - roi_xmin, 1)
-            roi_height = max(roi_ymax - roi_ymin, 1)
+            roi_xmin = roi[1] * self.spatial_scale - offset
+            roi_ymin = roi[2] * self.spatial_scale - offset
+            roi_xmax = roi[3] * self.spatial_scale - offset
+            roi_ymax = roi[4] * self.spatial_scale - offset
+
+            roi_width = roi_xmax - roi_xmin
+            roi_height = roi_ymax - roi_ymin
+            if not self.aligned:
+                roi_width = max(roi_width, 1)
+                roi_height = max(roi_height, 1)
             bin_size_h = float(roi_height) / float(self.pooled_height)
             bin_size_w = float(roi_width) / float(self.pooled_width)
             roi_bin_grid_h = self.sampling_ratio if self.sampling_ratio > 0 else \
@@ -192,11 +199,31 @@ class TestROIAlignInLodOp(TestROIAlignOp):
             'spatial_scale': self.spatial_scale,
             'pooled_height': self.pooled_height,
             'pooled_width': self.pooled_width,
-            'sampling_ratio': self.sampling_ratio
+            'sampling_ratio': self.sampling_ratio,
+            'aligned': self.aligned
         }
 
         self.outputs = {'Out': self.out_data}
 
 
+class TestROIAlignOpWithAligned(TestROIAlignOp):
+    def init_test_case(self):
+        self.batch_size = 3
+        self.channels = 3
+        self.height = 8
+        self.width = 6
+
+        # n, c, h, w
+        self.x_dim = (self.batch_size, self.channels, self.height, self.width)
+
+        self.spatial_scale = 1.0 / 2.0
+        self.pooled_height = 2
+        self.pooled_width = 2
+        self.sampling_ratio = -1
+        self.aligned = True
+
+        self.x = np.random.random(self.x_dim).astype('float64')
+
+
 if __name__ == '__main__':
     unittest.main()
-- 
GitLab