diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt
index f1c504d6e4bd065e4221b1207a117ff0f6732459..ffa6953a3d56aabb6db9cd98efe547b706fa1910 100644
--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -21,6 +21,7 @@ detection_library(iou_similarity_op SRCS iou_similarity_op.cc
 iou_similarity_op.cu)
 detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc)
 detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc poly_util.cc gpc.cc)
+detection_library(locality_aware_nms_op SRCS locality_aware_nms_op.cc poly_util.cc gpc.cc)
 detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu)
 detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu)
 detection_library(anchor_generator_op SRCS anchor_generator_op.cc
diff --git a/paddle/fluid/operators/detection/locality_aware_nms_op.cc b/paddle/fluid/operators/detection/locality_aware_nms_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ee0708312ddbbe1b4390e3a96e533c2aa053e60a
--- /dev/null
+++ b/paddle/fluid/operators/detection/locality_aware_nms_op.cc
@@ -0,0 +1,459 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <glog/logging.h>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detection/nms_util.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+// Operator definition of locality_aware_nms: verifies that the inputs and
+// the output exist, checks their shapes at runtime, and selects a CPU kernel.
+class LocalityAwareNMSOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE_EQ(ctx->HasInput("BBoxes"), true,
+                      "Input(BBoxes) of LocalityAwareNMS should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Scores"), true,
+                      "Input(Scores) of LocalityAwareNMS should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
+                      "Output(Out) of LocalityAwareNMS should not be null.");
+
+    auto box_dims = ctx->GetInputDim("BBoxes");
+    auto score_dims = ctx->GetInputDim("Scores");
+    auto score_size = score_dims.size();
+    if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE_EQ(score_size, 3, "The rank of Input(Scores) must be 3");
+      PADDLE_ENFORCE_EQ(box_dims.size(), 3,
+                        "The rank of Input(BBoxes) must be 3");
+      PADDLE_ENFORCE_EQ(box_dims[2] == 4 || box_dims[2] == 8 ||
+                            box_dims[2] == 16 || box_dims[2] == 24 ||
+                            box_dims[2] == 32,
+                        true,
+                        "The last dimension of Input(BBoxes) must be 4, 8, "
+                        "16, 24 or 32, which represents the layout of "
+                        "coordinate [xmin, ymin, xmax, ymax] or "
+                        "4 points: [x1, y1, x2, y2, x3, y3, x4, y4] or "
+                        "8 points: [xi, yi] i= 1,2,...,8 or "
+                        "12 points: [xi, yi] i= 1,2,...,12 or "
+                        "16 points: [xi, yi] i= 1,2,...,16");
+      PADDLE_ENFORCE_EQ(box_dims[1], score_dims[2],
+                        "The 2nd dimension of Input(BBoxes) must be equal to "
+                        "last dimension of Input(Scores), which represents the "
+                        "predicted bboxes.");
+    }
+    // box_dims[1] (the number of candidate boxes) is only a placeholder for
+    // the row count of Out; the kernel resizes Out to the kept detections.
+    ctx->SetOutputDim("Out", {box_dims[1], box_dims[2] + 2});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "Scores"),
+        platform::CPUPlace());
+  }
+};
+
+template <class T>  // box2[k] <- score-weighted mean of box1[k] and box2[k]
+void PolyWeightedMerge(const T* box1, T* box2, const T score1, const T score2,
+                       const size_t box_size) {
+  for (const T* end = box1 + box_size; box1 != end; ++box1, ++box2) {
+    *box2 = (*box1 * score1 + *box2 * score2) / (score1 + score2);
+  }
+}
+
+// Locality-aware merge followed by score filtering. Boxes are visited in
+// input order: while box i overlaps the running box `index` by more than
+// nms_threshold it is folded into it (score-weighted coordinates, summed
+// scores; bbox_data and scores are modified in place), otherwise `index` is
+// finalized and i becomes the new running box. Finalized boxes that score
+// above `threshold` are appended to sorted_indices in descending score order.
+template <class T>
+void GetMaxScoreIndexWithLocalityAware(
+    T* scores, T* bbox_data, int64_t box_size, const T threshold, int top_k,
+    int64_t num_boxes, std::vector<std::pair<T, int>>* sorted_indices,
+    const T nms_threshold, const bool normalized) {
+  std::vector<bool> skip(num_boxes, true);
+  // 8: [x1 y1 x2 y2 x3 y3 x4 y4] or 16, 24, 32
+  const bool is_poly =
+      box_size == 8 || box_size == 16 || box_size == 24 || box_size == 32;
+  int index = -1;
+  for (int64_t i = 0; i < num_boxes; ++i) {
+    if (index < 0) {
+      index = i;
+      continue;
+    }
+    T* cand = bbox_data + i * box_size;
+    T* merged = bbox_data + index * box_size;
+    T overlap = T(0.);
+    if (box_size == 4) {
+      overlap = JaccardOverlap<T>(cand, merged, normalized);
+    } else if (is_poly) {
+      overlap = PolyIoU<T>(cand, merged, box_size, normalized);
+    }
+    if (overlap > nms_threshold) {
+      PolyWeightedMerge(cand, merged, scores[i], scores[index], box_size);
+      scores[index] += scores[i];
+    } else {
+      skip[index] = false;
+      index = i;
+    }
+  }
+  if (index > -1) skip[index] = false;  // the last running box is final too
+  for (int64_t i = 0; i < num_boxes; ++i) {
+    if (!skip[i] && scores[i] > threshold) {
+      sorted_indices->push_back(std::make_pair(scores[i], i));
+    }
+  }
+  // Sort the score pair according to the scores in descending order
+  std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
+                   SortScorePairDescend<int>);
+  // Keep top_k scores if needed.
+  if (top_k > -1 && top_k < static_cast<int>(sorted_indices->size())) {
+    sorted_indices->resize(top_k);
+  }
+}
+
+template <typename T>
+class LocalityAwareNMSKernel : public framework::OpKernel<T> {
+ public:
+  // Greedy NMS for a single class. GetMaxScoreIndexWithLocalityAware first
+  // merges overlapping neighbours (rewriting the bbox and scores buffers in
+  // place) and returns the candidates in descending score order. A candidate
+  // is kept only if its IoU with every previously kept box is at most the
+  // adaptive threshold, which is scaled by eta after each kept box while
+  // eta < 1 and the threshold exceeds 0.5. Kept box indices are written to
+  // selected_indices (cleared first).
+  void LocalityAwareNMSFast(Tensor* bbox, Tensor* scores,
+                            const T score_threshold, const T nms_threshold,
+                            const T eta, const int64_t top_k,
+                            std::vector<int>* selected_indices,
+                            const bool normalized) const {
+    // The total boxes for each instance.
+    int64_t num_boxes = bbox->dims()[0];
+    // 4: [xmin ymin xmax ymax]
+    // 8: [x1 y1 x2 y2 x3 y3 x4 y4]
+    // 16, 24, or 32: [x1 y1 x2 y2 ...  xn yn], n = 8, 12 or 16
+    int64_t box_size = bbox->dims()[1];
+    const bool is_poly =
+        box_size == 8 || box_size == 16 || box_size == 24 || box_size == 32;
+
+    std::vector<std::pair<T, int>> sorted_indices;
+    T adaptive_threshold = nms_threshold;
+    T* bbox_data = bbox->data<T>();
+    T* scores_data = scores->data<T>();
+
+    GetMaxScoreIndexWithLocalityAware(
+        scores_data, bbox_data, box_size, score_threshold, top_k, num_boxes,
+        &sorted_indices, nms_threshold, normalized);
+
+    selected_indices->clear();
+
+    // Walk the sorted candidates directly; erasing the front of the vector
+    // on every round would cost linear time per candidate.
+    for (const auto& candidate : sorted_indices) {
+      const int idx = candidate.second;
+      const T* cur_box = bbox_data + idx * box_size;
+      bool keep = true;
+      // Stop at the first kept box that overlaps too much.
+      for (size_t k = 0; keep && k < selected_indices->size(); ++k) {
+        const T* kept_box = bbox_data + (*selected_indices)[k] * box_size;
+        T overlap = T(0.);
+        if (box_size == 4) {
+          overlap = JaccardOverlap<T>(cur_box, kept_box, normalized);
+        } else if (is_poly) {
+          overlap = PolyIoU<T>(cur_box, kept_box, box_size, normalized);
+        }
+        keep = overlap <= adaptive_threshold;
+      }
+      if (keep) {
+        selected_indices->push_back(idx);
+        // Adaptive NMS: tighten the threshold after every kept box.
+        if (eta < 1 && adaptive_threshold > 0.5) {
+          adaptive_threshold *= eta;
+        }
+      }
+    }
+  }
+
+  // Per-image driver. Runs LocalityAwareNMSFast for every class except
+  // background_label (the same box tensor is handed to each class) and
+  // stores the kept box indices per class label in `indices`. If keep_top_k
+  // is > -1 and more boxes than that survived, only the keep_top_k
+  // best-scoring boxes over all classes are retained. `num_nmsed_out`
+  // receives the final number of kept boxes; scores_size is not used here.
+  void LocalityAwareNMS(const framework::ExecutionContext& ctx, Tensor* scores,
+                        Tensor* bboxes, const int scores_size,
+                        std::map<int, std::vector<int>>* indices,
+                        int* num_nmsed_out) const {
+    int64_t background_label = ctx.Attr<int>("background_label");
+    int64_t nms_top_k = ctx.Attr<int>("nms_top_k");
+    int64_t keep_top_k = ctx.Attr<int>("keep_top_k");
+    bool normalized = ctx.Attr<bool>("normalized");
+    T nms_threshold = static_cast<T>(ctx.Attr<float>("nms_threshold"));
+    T nms_eta = static_cast<T>(ctx.Attr<float>("nms_eta"));
+    T score_threshold = static_cast<T>(ctx.Attr<float>("score_threshold"));
+
+    int num_det = 0;
+    const int64_t class_num = scores->dims()[0];
+    for (int64_t c = 0; c < class_num; ++c) {
+      if (c == background_label) continue;
+      // Row c of the score tensor paired with the full box tensor.
+      Tensor score_slice = scores->Slice(c, c + 1);
+      Tensor bbox_slice = *bboxes;
+      std::vector<int>& class_indices = (*indices)[c];
+      LocalityAwareNMSFast(&bbox_slice, &score_slice, score_threshold,
+                           nms_threshold, nms_eta, nms_top_k, &class_indices,
+                           normalized);
+      num_det += class_indices.size();
+    }
+
+    *num_nmsed_out = num_det;
+    const T* scores_data = scores->data<T>();
+    if (keep_top_k < 0 || num_det <= keep_top_k) return;
+
+    // Gather (score, (label, box index)) for every kept box of this image.
+    std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
+    const int64_t predict_dim = scores->dims()[1];
+    for (const auto& label_and_boxes : *indices) {
+      const int label = label_and_boxes.first;
+      const T* class_scores = scores_data + label * predict_dim;
+      for (const int idx : label_and_boxes.second) {
+        score_index_pairs.push_back(
+            std::make_pair(class_scores[idx], std::make_pair(label, idx)));
+      }
+    }
+
+    // Keep top k results per image.
+    std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(),
+                     SortScorePairDescend<std::pair<int, int>>);
+    score_index_pairs.resize(keep_top_k);
+
+    // Store the new indices.
+    std::map<int, std::vector<int>> new_indices;
+    for (const auto& entry : score_index_pairs) {
+      new_indices[entry.second.first].push_back(entry.second.second);
+    }
+
+    new_indices.swap(*indices);
+    *num_nmsed_out = keep_top_k;
+  }
+
+  // Writes the kept detections of one image into `outs`, one row per box:
+  // [label, score, box coordinates...]. `selected_indices` maps a class
+  // label to its kept box indices. If `oindices` is non-null it receives
+  // offset + box index for every written row. `ctx` is not used.
+  void LocalityAwareNMSOutput(
+      const platform::DeviceContext& ctx, const Tensor& scores,
+      const Tensor& bboxes,
+      const std::map<int, std::vector<int>>& selected_indices,
+      const int scores_size, Tensor* outs, int* oindices = nullptr,
+      const int offset = 0) const {
+    int64_t predict_dim = scores.dims()[1];
+    int64_t box_size = bboxes.dims()[1];
+    // NOTE(review): Compute() always passes the rank of Scores, which
+    // InferShape requires to be 3, so this branch looks unreachable - confirm.
+    if (scores_size == 2) {
+      box_size = bboxes.dims()[2];
+    }
+    int64_t out_dim = box_size + 2;
+    auto* scores_data = scores.data<T>();
+    auto* bboxes_data = bboxes.data<T>();
+    auto* odata = outs->data<T>();
+
+    int count = 0;
+    for (const auto& it : selected_indices) {
+      int label = it.first;
+      const T* sdata = scores_data + label * predict_dim;
+      for (int idx : it.second) {
+        T* row = odata + count * out_dim;
+        row[0] = label;       // label
+        row[1] = sdata[idx];  // score
+        if (oindices != nullptr) {
+          oindices[count] = offset + idx;
+        }
+        // xmin, ymin, xmax, ymax or multi-points coordinates
+        std::memcpy(row + 2, bboxes_data + idx * box_size,
+                    box_size * sizeof(T));
+        count++;
+      }
+    }
+  }
+
+  // Kernel entry point. Copies BBoxes and Scores into CPU tensors, runs
+  // LocalityAwareNMS on every image of the batch, then packs the kept
+  // detections into Out and stores the per-image row offsets as its LoD.
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* boxes_input = ctx.Input<LoDTensor>("BBoxes");
+    auto* scores_input = ctx.Input<LoDTensor>("Scores");
+    auto* outs = ctx.Output<LoDTensor>("Out");
+    auto score_dims = scores_input->dims();
+    auto score_size = score_dims.size();
+    auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
+
+    LoDTensor scores;
+    LoDTensor boxes;
+    TensorCopySync(*scores_input, platform::CPUPlace(), &scores);
+    TensorCopySync(*boxes_input, platform::CPUPlace(), &boxes);
+    const int64_t box_dim = boxes.dims()[2];
+    const int64_t out_dim = box_dim + 2;
+    const int n = score_dims[0];  // batch size
+
+    std::vector<std::map<int, std::vector<int>>> all_indices;
+    std::vector<size_t> batch_starts = {0};
+    int num_nmsed_out = 0;
+    for (int i = 0; i < n; ++i) {
+      Tensor scores_slice = scores.Slice(i, i + 1);
+      scores_slice.Resize({score_dims[1], score_dims[2]});
+      Tensor boxes_slice = boxes.Slice(i, i + 1);
+      boxes_slice.Resize({score_dims[2], box_dim});
+      std::map<int, std::vector<int>> indices;
+      LocalityAwareNMS(ctx, &scores_slice, &boxes_slice, score_size, &indices,
+                       &num_nmsed_out);
+      all_indices.push_back(indices);
+      batch_starts.push_back(batch_starts.back() + num_nmsed_out);
+    }
+
+    const int num_kept = batch_starts.back();
+    if (num_kept == 0) {
+      // No box was kept in the whole batch: emit a single -1 placeholder.
+      T* od = outs->mutable_data<T>({1, 1}, ctx.GetPlace());
+      od[0] = -1;
+      batch_starts = {0, 1};
+    } else {
+      outs->mutable_data<T>({num_kept, out_dim}, ctx.GetPlace());
+      int offset = 0;
+      int* oindices = nullptr;
+      for (int i = 0; i < n; ++i) {
+        const int64_t s = batch_starts[i];
+        const int64_t e = batch_starts[i + 1];
+        if (e <= s) continue;  // image i has no detections
+        Tensor scores_slice = scores.Slice(i, i + 1);
+        Tensor boxes_slice = boxes.Slice(i, i + 1);
+        scores_slice.Resize({score_dims[1], score_dims[2]});
+        boxes_slice.Resize({score_dims[2], box_dim});
+        Tensor out = outs->Slice(s, e);
+        LocalityAwareNMSOutput(dev_ctx, scores_slice, boxes_slice,
+                               all_indices[i], score_dims.size(), &out,
+                               oindices, offset);
+      }
+    }
+
+    framework::LoD lod;
+    lod.emplace_back(batch_starts);
+    outs->set_lod(lod);
+  }
+};
+
+// Declares the inputs, attributes and output of locality_aware_nms together
+// with the user-facing documentation strings of the operator.
+class LocalityAwareNMSOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("BBoxes",
+             "(Tensor) A 3-D Tensor with shape "
+             "[N, M, 4 or 8 16 24 32] represents the "
+             "predicted locations of M bounding bboxes, N is the batch size. "
+             "Each bounding box has four coordinate values and the layout is "
+             "[xmin, ymin, xmax, ymax], when box size equals to 4.");
+    AddInput("Scores",
+             "(Tensor) A 3-D Tensor with shape [N, C, M] represents the "
+             "predicted confidence predictions. N is the batch size, C is the "
+             "class number, M is number of bounding boxes. For each category "
+             "there are total M scores which corresponding M bounding boxes. "
+             " Please note, M is equal to the 2nd dimension of BBoxes. ");
+    AddAttr<int>(
+        "background_label",
+        "(int, default: -1) "
+        "The index of background label, the background label will be ignored. "
+        "If set to -1, then all categories will be considered.")
+        .SetDefault(-1);
+    AddAttr<float>("score_threshold",
+                   "(float) "
+                   "Threshold to filter out bounding boxes with low "
+                   "confidence score. If not provided, consider all boxes.");
+    AddAttr<int>("nms_top_k",
+                 "(int) "
+                 "Maximum number of detections to be kept according to the "
+                 "confidences after filtering detections based on "
+                 "score_threshold");
+    AddAttr<float>("nms_threshold",
+                   "(float, default: 0.3) "
+                   "The threshold to be used in NMS.")
+        .SetDefault(0.3);
+    AddAttr<float>("nms_eta",
+                   "(float, default: 1.0) "
+                   "The parameter for adaptive NMS.")
+        .SetDefault(1.0);
+    AddAttr<int>("keep_top_k",
+                 "(int) "
+                 "Number of total bboxes to be kept per image after NMS "
+                 "step. -1 means keeping all bboxes after NMS step.");
+    AddAttr<bool>("normalized",
+                  "(bool, default true) "
+                  "Whether detections are normalized.")
+        .SetDefault(true);
+    AddOutput("Out",
+              "(LoDTensor) A 2-D LoDTensor with shape [No, 6] represents the "
+              "detections. Each row has 6 values: "
+              "[label, confidence, xmin, ymin, xmax, ymax] or "
+              "(LoDTensor) A 2-D LoDTensor with shape [No, 10] represents the "
+              "detections. Each row has 10 values: "
+              "[label, confidence, x1, y1, x2, y2, x3, y3, x4, y4]. No is the "
+              "total number of detections in this mini-batch. "
+              "For each instance, "
+              "the offsets in first dimension are called LoD, the number of "
+              "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is "
+              "no detected bbox.");
+    AddComment(R"DOC(
+This operator is to do locality-aware non maximum suppression (NMS) on a batch
+of boxes and scores.
+Firstly, this operator merges boxes and scores according to their IOU (intersection over union).
+In the NMS step, this operator greedily selects a subset of detection bounding
+boxes that have high scores larger than score_threshold, if providing this
+threshold, then selects the largest nms_top_k confidences scores if nms_top_k
+is larger than -1. Then this operator prunes away boxes that have high IOU
+(intersection over union) overlap with already selected boxes by adaptive
+threshold NMS based on parameters of nms_threshold and nms_eta.
+After NMS step, at most keep_top_k number of total bboxes are to be kept
+per image if keep_top_k is larger than -1.
+This operator supports multi-class and batched inputs. It applies NMS
+independently for each class. The output is a 2-D LoDTensor, for each
+image, the offsets in first dimension of LoDTensor are called LoD, the number
+of offset is N + 1, where N is the batch size. If LoD[i + 1] - LoD[i] == 0,
+means there is no detected bbox for this image.
+
+Please get more information from the following papers:
+https://arxiv.org/abs/1704.03155.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(  // op + proto maker; both grad makers are EmptyGradOpMaker
+    locality_aware_nms, ops::LocalityAwareNMSOp, ops::LocalityAwareNMSOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OP_CPU_KERNEL(locality_aware_nms, ops::LocalityAwareNMSKernel<float>,
+                       ops::LocalityAwareNMSKernel<double>);  // CPU kernels
diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc
index eb9a3c704833abdf5327815b5f15d13bbb8a2a8f..62d6bb3ac15809919157f228ae058c68dd5355f2 100644
--- a/paddle/fluid/operators/detection/multiclass_nms_op.cc
+++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc
@@ -13,7 +13,7 @@ limitations under the License. */
 
 #include <glog/logging.h>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detection/poly_util.h"
+#include "paddle/fluid/operators/detection/nms_util.h"
 
 namespace paddle {
 namespace operators {
@@ -85,84 +85,6 @@ class MultiClassNMSOp : public framework::OperatorWithKernel {
   }
 };
 
-template <class T>
-bool SortScorePairDescend(const std::pair<float, T>& pair1,
-                          const std::pair<float, T>& pair2) {
-  return pair1.first > pair2.first;
-}
-
-template <class T>
-static inline void GetMaxScoreIndex(
-    const std::vector<T>& scores, const T threshold, int top_k,
-    std::vector<std::pair<T, int>>* sorted_indices) {
-  for (size_t i = 0; i < scores.size(); ++i) {
-    if (scores[i] > threshold) {
-      sorted_indices->push_back(std::make_pair(scores[i], i));
-    }
-  }
-  // Sort the score pair according to the scores in descending order
-  std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
-                   SortScorePairDescend<int>);
-  // Keep top_k scores if needed.
-  if (top_k > -1 && top_k < static_cast<int>(sorted_indices->size())) {
-    sorted_indices->resize(top_k);
-  }
-}
-
-template <class T>
-static inline T BBoxArea(const T* box, const bool normalized) {
-  if (box[2] < box[0] || box[3] < box[1]) {
-    // If coordinate values are is invalid
-    // (e.g. xmax < xmin or ymax < ymin), return 0.
-    return static_cast<T>(0.);
-  } else {
-    const T w = box[2] - box[0];
-    const T h = box[3] - box[1];
-    if (normalized) {
-      return w * h;
-    } else {
-      // If coordinate values are not within range [0, 1].
-      return (w + 1) * (h + 1);
-    }
-  }
-}
-
-template <class T>
-static inline T JaccardOverlap(const T* box1, const T* box2,
-                               const bool normalized) {
-  if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
-      box2[3] < box1[1]) {
-    return static_cast<T>(0.);
-  } else {
-    const T inter_xmin = std::max(box1[0], box2[0]);
-    const T inter_ymin = std::max(box1[1], box2[1]);
-    const T inter_xmax = std::min(box1[2], box2[2]);
-    const T inter_ymax = std::min(box1[3], box2[3]);
-    T norm = normalized ? static_cast<T>(0.) : static_cast<T>(1.);
-    T inter_w = inter_xmax - inter_xmin + norm;
-    T inter_h = inter_ymax - inter_ymin + norm;
-    const T inter_area = inter_w * inter_h;
-    const T bbox1_area = BBoxArea<T>(box1, normalized);
-    const T bbox2_area = BBoxArea<T>(box2, normalized);
-    return inter_area / (bbox1_area + bbox2_area - inter_area);
-  }
-}
-
-template <class T>
-T PolyIoU(const T* box1, const T* box2, const size_t box_size,
-          const bool normalized) {
-  T bbox1_area = PolyArea<T>(box1, box_size, normalized);
-  T bbox2_area = PolyArea<T>(box2, box_size, normalized);
-  T inter_area = PolyOverlapArea<T>(box1, box2, box_size, normalized);
-  if (bbox1_area == 0 || bbox2_area == 0 || inter_area == 0) {
-    // If coordinate values are invalid
-    // if area size <= 0,  return 0.
-    return T(0.);
-  } else {
-    return inter_area / (bbox1_area + bbox2_area - inter_area);
-  }
-}
-
 template <class T>
 void SliceOneClass(const platform::DeviceContext& ctx,
                    const framework::Tensor& items, const int class_id,
diff --git a/paddle/fluid/operators/detection/nms_util.h b/paddle/fluid/operators/detection/nms_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..067bfce51949c7526ebe87bb51722327691db555
--- /dev/null
+++ b/paddle/fluid/operators/detection/nms_util.h
@@ -0,0 +1,103 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/operators/detection/poly_util.h"
+
+namespace paddle {
+namespace operators {
+
+template <class T>  // comparator: true when pair1 has the larger score
+bool SortScorePairDescend(const std::pair<float, T>& pair1,
+                          const std::pair<float, T>& pair2) {
+  return pair2.first < pair1.first;
+}
+
+// Collects (score, index) for every entry of `scores` that is greater than
+// `threshold`, orders the pairs by descending score with a stable sort, and
+// keeps only the first top_k pairs when top_k > -1.
+template <class T>
+static inline void GetMaxScoreIndex(
+    const std::vector<T>& scores, const T threshold, int top_k,
+    std::vector<std::pair<T, int>>* sorted_indices) {
+  for (size_t i = 0; i != scores.size(); ++i) {
+    if (!(scores[i] > threshold)) continue;
+    sorted_indices->emplace_back(scores[i], static_cast<int>(i));
+  }
+  std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
+                   SortScorePairDescend<int>);
+  const bool truncate =
+      top_k > -1 && top_k < static_cast<int>(sorted_indices->size());
+  if (truncate) sorted_indices->resize(top_k);
+}
+
+// Area of the axis-aligned box [xmin, ymin, xmax, ymax] stored in `box`.
+// Returns 0 for invalid coordinates (xmax < xmin or ymax < ymin). When
+// `normalized` is false, i.e. the coordinate values are not within range
+// [0, 1], one is added to both the width and the height.
+template <class T>
+static inline T BBoxArea(const T* box, const bool normalized) {
+  const bool invalid = box[2] < box[0] || box[3] < box[1];
+  if (invalid) {
+    return static_cast<T>(0.);
+  }
+
+  const T w = box[2] - box[0];
+  const T h = box[3] - box[1];
+  const T area_normalized = w * h;
+  const T area_unnormalized = (w + 1) * (h + 1);
+  return normalized ? area_normalized : area_unnormalized;
+}
+
+// Intersection over union of two axis-aligned boxes laid out as
+// [xmin, ymin, xmax, ymax]; returns 0 when the boxes do not intersect.
+template <class T>
+static inline T JaccardOverlap(const T* box1, const T* box2,
+                               const bool normalized) {
+  const bool disjoint = box2[0] > box1[2] || box2[2] < box1[0] ||
+                        box2[1] > box1[3] || box2[3] < box1[1];
+  if (disjoint) return static_cast<T>(0.);
+
+  // norm is 1 for non-normalized coordinates and 0 otherwise.
+  const T norm = normalized ? static_cast<T>(0.) : static_cast<T>(1.);
+  const T inter_w =
+      std::min(box1[2], box2[2]) - std::max(box1[0], box2[0]) + norm;
+  const T inter_h =
+      std::min(box1[3], box2[3]) - std::max(box1[1], box2[1]) + norm;
+  const T inter_area = inter_w * inter_h;
+  const T union_area = BBoxArea<T>(box1, normalized) +
+                       BBoxArea<T>(box2, normalized) - inter_area;
+  return inter_area / union_area;
+}
+
+// Intersection over union of two polygons, each given by box_size values
+// in the layout expected by PolyArea / PolyOverlapArea.
+// If coordinate values are invalid, i.e. either polygon area or the
+// intersection area is 0, the result is 0.
+template <class T>
+T PolyIoU(const T* box1, const T* box2, const size_t box_size,
+          const bool normalized) {
+  const T area1 = PolyArea<T>(box1, box_size, normalized);
+  const T area2 = PolyArea<T>(box2, box_size, normalized);
+  const T inter = PolyOverlapArea<T>(box1, box2, box_size, normalized);
+  const bool degenerate = area1 == 0 || area2 == 0 || inter == 0;
+  if (degenerate) return T(0.);
+  return inter / (area1 + area2 - inter);
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index e84510292f26a2732a70695b08948fcdf2c22cc7..fd691348c65228af290e4b44dd444400ab246175 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -53,6 +53,7 @@ __all__ = [
     'yolo_box',
     'box_clip',
     'multiclass_nms',
+    'locality_aware_nms',
     'retinanet_detection_output',
     'distribute_fpn_proposals',
     'box_decoder_and_assign',
@@ -3147,6 +3148,124 @@ def multiclass_nms(bboxes,
     return output
 
 
+def locality_aware_nms(bboxes,
+                       scores,
+                       score_threshold,
+                       nms_top_k,
+                       keep_top_k,
+                       nms_threshold=0.3,
+                       normalized=True,
+                       nms_eta=1.,
+                       background_label=-1,
+                       name=None):
+    """
+    **Locality Aware NMS**
+
+    `Locality Aware NMS <https://arxiv.org/abs/1704.03155>`_ is to do locality-aware non maximum
+    suppression (LANMS) on boxes and scores.
+
+    Firstly, this operator merges boxes and scores according to their IOU
+    (intersection over union). In the NMS step, this operator greedily selects a
+    subset of detection bounding boxes that have high scores larger than score_threshold,
+    if providing this threshold, then selects the largest nms_top_k confidences scores
+    if nms_top_k is larger than -1. Then this operator prunes away boxes that have high
+    IOU overlap with already selected boxes by adaptive threshold NMS based on parameters
+    of nms_threshold and nms_eta.
+
+    After NMS step, at most keep_top_k number of total bboxes are to be kept
+    per image if keep_top_k is larger than -1.
+
+    Args:
+        bboxes (Variable): A 3-D Tensor with shape [N, M, 4 or 8 16 24 32]
+                           represents the predicted locations of M bounding
+                           bboxes, N is the batch size. Each bounding box
+                           has four coordinate values and the layout is
+                           [xmin, ymin, xmax, ymax], when box size equals to 4.
+                           The data type is float32 or float64.
+        scores (Variable): A 3-D Tensor with shape [N, C, M] represents the
+                           predicted confidence predictions. N is the batch
+                           size, C is the class number, M is number of bounding
+                           boxes. Now only support 1 class. For each category
+                           there are total M scores which corresponding M bounding
+                           boxes. Please note, M is equal to the 2nd dimension of
+                           BBoxes. The data type is float32 or float64.
+        background_label (int): The index of background label, the background
+                                label will be ignored. If set to -1, then all
+                                categories will be considered. Default: -1
+        score_threshold (float): Threshold to filter out bounding boxes with
+                                 low confidence score. If not provided,
+                                 consider all boxes.
+        nms_top_k (int): Maximum number of detections to be kept according to
+                         the confidences after filtering detections based
+                         on score_threshold.
+        nms_threshold (float): The threshold to be used in NMS. Default: 0.3
+        nms_eta (float): The parameter for adaptive NMS. Default: 1.0
+        keep_top_k (int): Number of total bboxes to be kept per image after NMS
+                          step. -1 means keeping all bboxes after NMS step.
+        normalized (bool): Whether detections are normalized. Default: True
+        name(str): Name of the locality aware nms op, please refer to :ref:`api_guide_Name` .
+                          Default: None.
+
+    Returns:
+        Variable: A 2-D LoDTensor with shape [No, 6] represents the detections.
+             Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax]
+             or A 2-D LoDTensor with shape [No, 10] represents the detections.
+             Each row has 10 values:
+             [label, confidence, x1, y1, x2, y2, x3, y3, x4, y4]. No is the
+             total number of detections. If there is no detected boxes for all
+             images, lod will be set to {1} and Out only contains one value
+             which is -1.
+             (After version 1.3, when no boxes detected, the lod is changed
+             from {0} to {1}). The data type is float32 or float64.
+
+
+    Examples:
+        .. code-block:: python
+
+
+            import paddle.fluid as fluid
+            boxes = fluid.data(name='bboxes', shape=[None, 81, 8],
+                                      dtype='float32')
+            scores = fluid.data(name='scores', shape=[None, 1, 81],
+                                      dtype='float32')
+            out = fluid.layers.locality_aware_nms(bboxes=boxes,
+                                              scores=scores,
+                                              score_threshold=0.5,
+                                              nms_top_k=400,
+                                              nms_threshold=0.3,
+                                              keep_top_k=200,
+                                              normalized=False)
+    """
+    # Only a single class is supported: scores must be shaped [N, 1, M].
+    shape = scores.shape
+    assert len(shape) == 3, "dim size of scores must be 3"
+    assert shape[
+        1] == 1, "locality_aware_nms only support one class, Tensor score shape must be [N, 1, M]"
+
+    helper = LayerHelper('locality_aware_nms', **locals())
+
+    output = helper.create_variable_for_type_inference(dtype=bboxes.dtype)
+
+    helper.append_op(
+        type="locality_aware_nms",
+        inputs={'BBoxes': bboxes,
+                'Scores': scores},
+        attrs={
+            'background_label': background_label,
+            'score_threshold': score_threshold,
+            'nms_top_k': nms_top_k,
+            'nms_threshold': nms_threshold,
+            'nms_eta': nms_eta,
+            'keep_top_k': keep_top_k,
+            'normalized': normalized
+        },
+        outputs={'Out': output})
+    # The operator registers no gradient, so stop gradients at its output.
+    output.stop_gradient = True
+
+    return output
+
+
 def distribute_fpn_proposals(fpn_rois,
                              min_level,
                              max_level,
diff --git a/python/paddle/fluid/tests/unittests/test_locality_aware_nms_op.py b/python/paddle/fluid/tests/unittests/test_locality_aware_nms_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c8526f4df05be9779ce17bdec184b4b8c6dd1bd
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_locality_aware_nms_op.py
@@ -0,0 +1,323 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+import copy
+from op_test import OpTest
+from test_multiclass_nms_op import iou
+import paddle.fluid as fluid
+
+
+def weight_merge(box1, box2, score1, score2):
+    for k, coord in enumerate(box1):  # box2 becomes the score-weighted mean
+        box2[k] = (coord * score1 + box2[k] * score2) / (score1 + score2)
+
+
+def nms(boxes,
+        scores,
+        score_threshold,
+        nms_threshold,
+        top_k=200,
+        normalized=True,
+        eta=1.0):
+    """Reference locality-aware NMS: merge adjacent boxes, then greedy NMS.
+
+    Pass 1 scans boxes in index order. A box whose IoU with the current
+    anchor box exceeds nms_threshold is folded into the anchor (weighted
+    coordinates, summed score) and its own score is pushed below
+    score_threshold; otherwise it becomes the new anchor.
+
+    Pass 2 is greedy NMS over the boxes still scoring above score_threshold.
+    Both `boxes` and `scores` are modified in place by pass 1.
+    Args:
+        boxes: box coordinates, one box per index along axis 0.
+        scores: one score per box.
+        score_threshold: (float) only strictly greater scores enter pass 2.
+        nms_threshold: (float) IoU limit used by both passes.
+        top_k: (int) cap on pass-2 candidates; a negative value lifts it.
+        normalized: forwarded unchanged to iou().
+        eta: (float) if < 1, scales the pass-2 threshold after each kept
+            box for as long as that threshold is above 0.5.
+    Return:
+        Indices of the kept boxes, in descending score order.
+    """
+    # Pass 1: locality-aware merge of neighbouring boxes.
+    anchor = -1
+    for cur in range(boxes.shape[0]):
+        overlaps = anchor > -1 and iou(boxes[cur], boxes[anchor],
+                                       normalized) > nms_threshold
+        if not overlaps:
+            anchor = cur
+            continue
+        # Fold cur into the anchor and retire cur from later selection.
+        weight_merge(boxes[cur], boxes[anchor], scores[cur], scores[anchor])
+        scores[anchor] += scores[cur]
+        scores[cur] = score_threshold - 1.
+
+    # Pass 2 candidates: boxes whose (possibly merged) score passes.
+    flat_scores = copy.deepcopy(scores).flatten()
+    candidates = np.argwhere(flat_scores > score_threshold).flatten()
+    # mergesort is stable, so tied scores keep their index order.
+    order = np.argsort(-flat_scores[candidates], axis=0, kind='mergesort')
+    ranked = candidates[order]
+    if -1 < top_k < ranked.shape[0]:
+        ranked = ranked[:top_k]
+
+    # Greedy selection, best score first.
+    kept = []
+    threshold = nms_threshold
+    for idx in ranked:
+        # Keep idx only if no already-kept box overlaps it beyond threshold.
+        keep = all(
+            iou(boxes[idx], boxes[j], normalized) <= threshold for j in kept)
+        if keep:
+            kept.append(idx)
+            # Adaptive NMS: shrink the threshold after a kept box.
+            if eta < 1 and threshold > 0.5:
+                threshold *= eta
+    return kept
+
+
+def multiclass_nms(boxes, scores, background, score_threshold, nms_threshold,
+                   nms_top_k, keep_top_k, normalized, shared):
+    """Per-class nms() followed by a global keep_top_k cut.
+
+    shared=True: scores is indexed [class][box] and all classes use `boxes`.
+    shared=False: scores is indexed [box][class], boxes as [box, class, :].
+    The `background` class is skipped.
+    Returns (selected_indices, num_det): a dict {class: kept box indices}
+    and the number of kept boxes over all classes.
+    """
+    dim0, dim1 = scores.shape[0], scores.shape[1]
+    class_num = dim0 if shared else dim1
+
+    # Run NMS independently for every non-background class.
+    per_class = {}
+    for c in range(class_num):
+        if c == background:
+            continue
+        if shared:
+            cls_boxes, cls_scores = boxes, scores[c]
+        else:
+            cls_boxes, cls_scores = boxes[:, c, :], scores[:, c]
+        per_class[c] = nms(cls_boxes, cls_scores, score_threshold,
+                           nms_threshold, nms_top_k, normalized)
+    num_det = sum(len(kept) for kept in per_class.values())
+
+    if not (keep_top_k > -1 and num_det > keep_top_k):
+        return per_class, num_det
+
+    # Over budget: rank all (score, class, index) triples and keep the best.
+    triples = []
+    for c, kept in per_class.items():
+        for idx in kept:
+            score = scores[c][idx] if shared else scores[idx][c]
+            triples.append((score, c, idx))
+    triples.sort(key=lambda tup: tup[0], reverse=True)
+
+    # Regroup the survivors by class, in rank order.
+    trimmed = {}
+    for _, c, idx in triples[:keep_top_k]:
+        trimmed.setdefault(c, []).append(idx)
+    if not shared:
+        # Per class, put the indices back into ascending order.
+        for kept in trimmed.values():
+            kept.sort()
+    return trimmed, keep_top_k
+
+
+def batched_multiclass_nms(boxes,
+                           scores,
+                           background,
+                           score_threshold,
+                           nms_threshold,
+                           nms_top_k,
+                           keep_top_k,
+                           normalized=True):
+    """Apply multiclass_nms (shared=True) to every batch item.
+
+    Returns (det_outs, lod). Each det_outs row is
+    [class, score, xmin, ymin, xmax, ymax, idx + n * num_boxes]; rows are
+    grouped per batch item and ordered by class inside an item. lod holds
+    the number of detections of each batch item.
+    """
+    batch_size, num_boxes = scores.shape[0], scores.shape[2]
+    det_outs, lod = [], []
+
+    for n in range(batch_size):
+        kept_by_class, kept_count = multiclass_nms(
+            boxes[n], scores[n], background, score_threshold, nms_threshold,
+            nms_top_k, keep_top_k, normalized, shared=True)
+        lod.append(kept_count)
+        # Nothing kept for this item: it contributes no rows.
+        if kept_count == 0:
+            continue
+
+        rows = []
+        for c, indices in kept_by_class.items():
+            for idx in indices:
+                xmin, ymin, xmax, ymax = boxes[n][idx][:]
+                score = scores[n][c][idx]
+                # Last column is the box index flattened across the batch.
+                rows.append(
+                    [c, score, xmin, ymin, xmax, ymax, idx + n * num_boxes])
+        # Stable sort: rows of one class keep their existing order.
+        rows.sort(key=lambda row: row[0])
+        det_outs.extend(rows)
+
+    return det_outs, lod
+
+
+class TestLocalAwareNMSOp(OpTest):
+    """locality_aware_nms on random 4-value boxes vs. the Python reference."""
+
+    def set_argument(self):
+        # Hook for subclasses that need a different score threshold.
+        self.score_threshold = 0.01
+
+    def setUp(self):
+        self.set_argument()
+        batch, num_boxes, num_classes, box_dim = 10, 1200, 1, 4
+        background, nms_threshold = -1, 0.3
+        nms_top_k, keep_top_k = 400, 10
+        score_threshold = self.score_threshold
+
+        def softmax(x):
+            exps = np.exp(x - np.max(x).clip(-64.))
+            return exps / np.sum(exps)
+
+        # Scores: softmax over the class axis, laid out [batch, class, box].
+        scores = np.random.random(
+            (batch * num_boxes, num_classes)).astype('float32')
+        scores = np.apply_along_axis(softmax, 1, scores)
+        scores = scores.reshape((batch, num_boxes, num_classes))
+        scores = scores.transpose((0, 2, 1))
+
+        # Boxes: first two values halved, last two halved then shifted by 0.5.
+        boxes = np.random.random(
+            (batch, num_boxes, box_dim)).astype('float32')
+        boxes[:, :, 0:2] *= 0.5
+        boxes[:, :, 2:4] = boxes[:, :, 2:4] * 0.5 + 0.5
+
+        # Deep copies go to the reference so the op inputs stay untouched.
+        det_outs, lod = batched_multiclass_nms(
+            copy.deepcopy(boxes), copy.deepcopy(scores), background,
+            score_threshold, nms_threshold, nms_top_k, keep_top_k)
+        if not det_outs:
+            # No detections: expect a single [-1] row with lod [1].
+            det_outs, lod = [[-1, 0]], [1]
+        # Drop the trailing column of each reference row.
+        nmsed_outs = np.array(det_outs)[:, :-1].astype('float32')
+
+        # Operator under test plus its inputs, expected output and attributes.
+        self.op_type = 'locality_aware_nms'
+        self.inputs = {'BBoxes': boxes, 'Scores': scores}
+        self.outputs = {'Out': (nmsed_outs, [lod])}
+        self.attrs = {
+            'background_label': background,
+            'nms_threshold': nms_threshold,
+            'nms_top_k': nms_top_k,
+            'keep_top_k': keep_top_k,
+            'score_threshold': score_threshold,
+            'nms_eta': 1.0,
+            'normalized': True,
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestLocalAwareNMSOpNoBoxes(TestLocalAwareNMSOp):
+    def set_argument(self):  # presumably meant to reject every box -- confirm
+        self.score_threshold = 2.0
+
+
+class TestLocalAwareNMSOp4Points(OpTest):
+    """locality_aware_nms on 8-value boxes, checked against fixed outputs."""
+
+    def set_argument(self):
+        # Score cut-off handed to the operator as 'score_threshold'.
+        self.score_threshold = 0.01
+
+    def setUp(self):
+        self.set_argument()
+        nms_threshold, nms_top_k, keep_top_k = 0.3, 400, 200
+        score_threshold = self.score_threshold
+
+        # Fixture shapes: scores is (2, 1, 2) and boxes is (2, 2, 8).
+        scores = np.array([[[0.76319082, 0.73770091]],
+                           [[0.68513154, 0.45952697]]])
+        boxes = np.array([[[
+            0.42078365, 0.58117018, 2.92776169, 3.28557757, 4.24344318,
+            0.92196165, 2.72370856, -1.66141214
+        ], [
+            0.13856006, 1.86871034, 2.81287224, 3.61381734, 4.5505249,
+            0.51766346, 2.75630304, -1.91459389
+        ]], [[
+            1.57533883, 1.3217477, 3.07904942, 3.89512545, 4.78680923,
+            1.96914586, 3.539482, -1.59739244
+        ], [
+            0.55084125, 1.71596215, 2.52476074, 3.18940435, 5.09035159,
+            0.91959482, 3.71442385, -0.57299128
+        ]]])
+
+        # Hard-coded expected output: two rows of 10 values each.
+        # lod presumably assigns one row to each batch item -- TODO confirm.
+        expected = np.array([[
+            0., 1.5008917, 0.28206837, 1.2140071, 2.8712926, 3.4469104,
+            4.3943763, 0.7232457, 2.7397292, -1.7858533
+        ], [
+            0., 1.1446586, 1.1640508, 1.4800063, 2.856528, 3.6118112, 4.908667,
+            1.5478, 3.609713, -1.1861432
+        ]]).astype('float32')
+        lod = [1, 1]
+
+        self.op_type = 'locality_aware_nms'
+        self.inputs = {
+            'BBoxes': boxes.astype('float32'),
+            'Scores': scores.astype('float32')
+        }
+        self.outputs = {'Out': (expected, [lod])}
+        # NOTE(review): no 'nms_eta' entry; presumably the op default is used.
+        self.attrs = {
+            'score_threshold': score_threshold,
+            'nms_threshold': nms_threshold,
+            'nms_top_k': nms_top_k,
+            'keep_top_k': keep_top_k,
+            'background_label': -1,
+            'normalized': False
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestLocalityAwareNMSAPI(OpTest):
+    def test_api(self):
+        # Only builds the layer; the result is not inspected.
+        bbox_var = fluid.data(
+            name='bboxes', shape=[None, 81, 8], dtype='float32')
+        score_var = fluid.data(
+            name='scores', shape=[None, 1, 81], dtype='float32')
+        nms_args = dict(
+            score_threshold=0.5, nms_top_k=400, nms_threshold=0.3,
+            keep_top_k=200, normalized=False)
+        fluid.layers.locality_aware_nms(
+            bboxes=bbox_var, scores=score_var, **nms_args)
+
+
+if __name__ == '__main__':
+    unittest.main()  # entry point when this file is run as a script