diff --git a/paddle/gserver/layers/DetectionUtil.cpp b/paddle/gserver/layers/DetectionUtil.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3e61adc66e60c54250e4f323452aa13045310879
--- /dev/null
+++ b/paddle/gserver/layers/DetectionUtil.cpp
@@ -0,0 +1,576 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "DetectionUtil.h"
+
+namespace paddle {
+
+/**
+ * @brief Permute every sample of inMatrix from NCHW to NHWC and write the
+ * result into outMatrix at a per-sample offset.
+ *
+ * Sample i of the input is viewed as a (channels x height*width) matrix and
+ * its transpose is stored starting at
+ * outMatrix.getData() + i * (outTotalSize / batchSize) + outOffset.
+ * The channel count is derived from inMatrix.getElementCnt().
+ *
+ * @param inMatrix Source data, read as batchSize consecutive samples
+ * @param height Height of one sample
+ * @param width Width of one sample
+ * @param outTotalSize Total size whose per-sample share
+ * (outTotalSize / batchSize) is used as the output stride
+ * @param outOffset Offset inside each output sample to start writing at
+ * @param batchSize Number of samples, must be non-zero
+ * @param outMatrix Destination, must report the same useGpu() as inMatrix
+ * @param permMode Only kNCHWToNHWC is supported, anything else is fatal
+ * @return Number of elements written per sample (channels * height * width)
+ */
+size_t appendWithPermute(const Matrix& inMatrix,
+                         size_t height,
+                         size_t width,
+                         size_t outTotalSize,
+                         size_t outOffset,
+                         size_t batchSize,
+                         Matrix& outMatrix,
+                         PermMode permMode) {
+  CHECK_EQ(inMatrix.useGpu(), outMatrix.useGpu());
+  if (permMode != kNCHWToNHWC) {
+    LOG(FATAL) << "Unkown permute mode";
+    return 0;  // keeps every path of this non-void function returning
+  }
+
+  const size_t imgSize = height * width;
+  // The channel count and the output stride are both obtained by division,
+  // so an empty image or an empty batch must be rejected up front.
+  CHECK_GT(imgSize * batchSize, static_cast<size_t>(0));
+
+  const bool useGpu = inMatrix.useGpu();
+  const size_t channels = inMatrix.getElementCnt() / (imgSize * batchSize);
+  const size_t sampleSize = channels * imgSize;
+  const size_t outStride = outTotalSize / batchSize;
+  // Matrix::create takes a mutable pointer even for data that is only read.
+  real* inData = const_cast<real*>(inMatrix.getData());
+  real* outData = outMatrix.getData();
+
+  for (size_t i = 0; i < batchSize; ++i) {
+    const MatrixPtr inTmp = Matrix::create(
+        inData + i * sampleSize, channels, imgSize, false, useGpu);
+    MatrixPtr outTmp = Matrix::create(
+        outData + i * outStride + outOffset, imgSize, channels, false, useGpu);
+    inTmp->transpose(outTmp, false);
+  }
+  return sampleSize;
+}
+
+/**
+ * @brief Read every sample from inMatrix at a per-sample offset, permute it
+ * from NHWC to NCHW and store it contiguously in outMatrix.
+ *
+ * Sample i is read starting at
+ * inMatrix.getData() + i * (inTotalSize / batchSize) + inOffset as a
+ * (height*width x channels) matrix and its transpose is written to
+ * outMatrix.getData() + i * channels * height * width.
+ * The channel count is derived from outMatrix.getElementCnt().
+ *
+ * @param inMatrix Source data
+ * @param height Height of one sample
+ * @param width Width of one sample
+ * @param inTotalSize Total size whose per-sample share
+ * (inTotalSize / batchSize) is used as the input stride
+ * @param inOffset Offset inside each input sample to start reading at
+ * @param batchSize Number of samples, must be non-zero
+ * @param outMatrix Destination, must report the same useGpu() as inMatrix
+ * @param permMode Only kNHWCToNCHW is supported, anything else is fatal
+ * @return Number of elements read per sample (channels * height * width)
+ */
+size_t decomposeWithPermute(const Matrix& inMatrix,
+                            size_t height,
+                            size_t width,
+                            size_t inTotalSize,
+                            size_t inOffset,
+                            size_t batchSize,
+                            Matrix& outMatrix,
+                            PermMode permMode) {
+  CHECK_EQ(inMatrix.useGpu(), outMatrix.useGpu());
+  if (permMode != kNHWCToNCHW) {
+    LOG(FATAL) << "Unkown permute mode";
+    return 0;  // keeps every path of this non-void function returning
+  }
+
+  const size_t imgSize = height * width;
+  // The channel count and the input stride are both obtained by division,
+  // so an empty image or an empty batch must be rejected up front.
+  CHECK_GT(imgSize * batchSize, static_cast<size_t>(0));
+
+  const bool useGpu = inMatrix.useGpu();
+  const size_t channels = outMatrix.getElementCnt() / (imgSize * batchSize);
+  const size_t sampleSize = channels * imgSize;
+  const size_t inStride = inTotalSize / batchSize;
+  // Matrix::create takes a mutable pointer even for data that is only read.
+  real* inData = const_cast<real*>(inMatrix.getData());
+  real* outData = outMatrix.getData();
+
+  for (size_t i = 0; i < batchSize; ++i) {
+    const MatrixPtr inTmp = Matrix::create(
+        inData + i * inStride + inOffset, imgSize, channels, false, useGpu);
+    MatrixPtr outTmp = Matrix::create(
+        outData + i * sampleSize, channels, imgSize, false, useGpu);
+    inTmp->transpose(outTmp, false);
+  }
+  return sampleSize;
+}
+
+/**
+ * @brief Intersection-over-union of two boxes.
+ *
+ * Returns 0 when the boxes do not intersect, and also when the union area is
+ * not positive (e.g. two coincident zero-area boxes), which would otherwise
+ * evaluate 0 / 0 and hand a NaN to the callers' threshold comparisons.
+ *
+ * @param bbox1 The first bbox
+ * @param bbox2 The second bbox
+ * @return intersection area / union area, or 0 as described above
+ */
+real jaccardOverlap(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2) {
+  // Disjoint along either axis: nothing to intersect.
+  if (bbox2.xMin > bbox1.xMax || bbox2.xMax < bbox1.xMin ||
+      bbox2.yMin > bbox1.yMax || bbox2.yMax < bbox1.yMin) {
+    return 0.0;
+  }
+
+  const real interWidth =
+      std::min(bbox1.xMax, bbox2.xMax) - std::max(bbox1.xMin, bbox2.xMin);
+  const real interHeight =
+      std::min(bbox1.yMax, bbox2.yMax) - std::max(bbox1.yMin, bbox2.yMin);
+  const real interArea = interWidth * interHeight;
+
+  const real unionArea = bbox1.getArea() + bbox2.getArea() - interArea;
+  if (unionArea <= 0.0) {
+    // Degenerate boxes: define the overlap as 0 instead of dividing by zero.
+    return 0.0;
+  }
+  return interArea / unionArea;
+}
+
+/**
+ * @brief Encode a ground truth box relative to a prior box.
+ *
+ * outVec is overwritten with four values: the center offsets in x and y
+ * divided by the prior width / height, and the log of the absolute
+ * width / height ratios, each further divided by the matching entry of
+ * priorBBoxVar.
+ *
+ * @param priorBBox Prior box used as the reference frame
+ * @param priorBBoxVar Four variances, indexed 0..3
+ * @param gtBBox Ground truth box to encode
+ * @param outVec Receives exactly four encoded values
+ */
+void encodeBBoxWithVar(const NormalizedBBox& priorBBox,
+                       const vector<real>& priorBBoxVar,
+                       const NormalizedBBox& gtBBox,
+                       vector<real>& outVec) {
+  const real priorW = priorBBox.getWidth();
+  const real priorH = priorBBox.getHeight();
+  const real priorCx = priorBBox.getCenterX();
+  const real priorCy = priorBBox.getCenterY();
+
+  const real gtW = gtBBox.getWidth();
+  const real gtH = gtBBox.getHeight();
+  const real gtCx = gtBBox.getCenterX();
+  const real gtCy = gtBBox.getCenterY();
+
+  // Whatever outVec held before is discarded; all four slots are rewritten.
+  outVec.resize(4);
+  outVec[0] = (gtCx - priorCx) / priorW / priorBBoxVar[0];
+  outVec[1] = (gtCy - priorCy) / priorH / priorBBoxVar[1];
+  outVec[2] = std::log(std::fabs(gtW / priorW)) / priorBBoxVar[2];
+  outVec[3] = std::log(std::fabs(gtH / priorH)) / priorBBoxVar[3];
+}
+
+/**
+ * @brief Rebuild a box from a prior box and predicted offsets.
+ *
+ * The center is the prior center shifted by variance * prediction * prior
+ * size, and the size is the prior size scaled by exp(variance * prediction).
+ * Only the four coordinates of the returned box are assigned here.
+ *
+ * @param priorBBox Prior box used as the reference frame
+ * @param priorBBoxVar Four variances, indexed 0..3
+ * @param locPredData Four predicted offsets, indexed 0..3
+ * @return Decoded box in corner form
+ */
+NormalizedBBox decodeBBoxWithVar(const NormalizedBBox& priorBBox,
+                                 const vector<real>& priorBBoxVar,
+                                 const vector<real>& locPredData) {
+  const real priorW = priorBBox.getWidth();
+  const real priorH = priorBBox.getHeight();
+  const real priorCx = priorBBox.getCenterX();
+  const real priorCy = priorBBox.getCenterY();
+
+  const real centerX = priorBBoxVar[0] * locPredData[0] * priorW + priorCx;
+  const real centerY = priorBBoxVar[1] * locPredData[1] * priorH + priorCy;
+  const real halfW = std::exp(priorBBoxVar[2] * locPredData[2]) * priorW / 2;
+  const real halfH = std::exp(priorBBoxVar[3] * locPredData[3]) * priorH / 2;
+
+  // Convert from center/size form back to corner form.
+  NormalizedBBox decoded;
+  decoded.xMin = centerX - halfW;
+  decoded.yMin = centerY - halfH;
+  decoded.xMax = centerX + halfW;
+  decoded.yMax = centerY + halfH;
+  return decoded;
+}
+
+/**
+ * @brief Append numBBoxes boxes read from priorData to bboxVec.
+ *
+ * Each record occupies 8 consecutive values; the first four are read as
+ * xMin, yMin, xMax, yMax and the remaining four are skipped here.
+ *
+ * @param priorData Pointer to the first record
+ * @param numBBoxes Number of records to read
+ * @param bboxVec Grows by numBBoxes; existing entries are kept
+ */
+void getBBoxFromPriorData(const real* priorData,
+                          const size_t numBBoxes,
+                          vector<NormalizedBBox>& bboxVec) {
+  const size_t base = bboxVec.size();
+  bboxVec.resize(base + numBBoxes);
+  const real* record = priorData;
+  for (size_t i = 0; i < numBBoxes; ++i, record += 8) {
+    NormalizedBBox& bbox = bboxVec[base + i];
+    bbox.xMin = record[0];
+    bbox.yMin = record[1];
+    bbox.xMax = record[2];
+    bbox.yMax = record[3];
+  }
+}
+
+/**
+ * @brief Append num variance quadruples read from priorData to varVec.
+ *
+ * Each record occupies 8 consecutive values; the values at positions 4..7
+ * of a record are copied, the first four are skipped here.
+ *
+ * @param priorData Pointer to the first record
+ * @param num Number of records to read
+ * @param varVec Grows by num; existing entries are kept
+ */
+void getBBoxVarFromPriorData(const real* priorData,
+                             const size_t num,
+                             vector<vector<real>>& varVec) {
+  const size_t base = varVec.size();
+  varVec.resize(base + num);
+  for (size_t i = 0; i < num; ++i) {
+    const real* record = priorData + i * 8;
+    // Copy the half-open range [4, 8) of the record.
+    varVec[base + i].assign(record + 4, record + 8);
+  }
+}
+
+/**
+ * @brief Append numBBoxes ground truth boxes read from labelData to bboxVec.
+ *
+ * Each record occupies 6 consecutive values. Position 0 is not read here,
+ * positions 1..4 are xMin, yMin, xMax, yMax and position 5 is the difficult
+ * flag, treated as false only when it is within 1e-6 of zero.
+ *
+ * @param labelData Pointer to the first record
+ * @param numBBoxes Number of records to read
+ * @param bboxVec Grows by numBBoxes; existing entries are kept
+ */
+void getBBoxFromLabelData(const real* labelData,
+                          const size_t numBBoxes,
+                          vector<NormalizedBBox>& bboxVec) {
+  const size_t base = bboxVec.size();
+  bboxVec.resize(base + numBBoxes);
+  const real* record = labelData;
+  for (size_t i = 0; i < numBBoxes; ++i, record += 6) {
+    NormalizedBBox& bbox = bboxVec[base + i];
+    bbox.xMin = record[1];
+    bbox.yMin = record[2];
+    bbox.xMax = record[3];
+    bbox.yMax = record[4];
+    const real difficultFlag = record[5];
+    bbox.isDifficult = !(std::abs(difficultFlag - 0.0) < 1e-6);
+  }
+}
+
+/**
+ * @brief Append numBBoxes detections read from detectData.
+ *
+ * Each record occupies 7 consecutive values. Position 0 is not read here,
+ * position 1 is the label, position 2 the score and positions 3..6 are
+ * xMin, yMin, xMax, yMax. All three output vectors are resized to
+ * bboxVec.size() + numBBoxes and written at the same indices, so labelVec
+ * and scoreVec are sized from bboxVec rather than from their own length.
+ *
+ * @param detectData Pointer to the first record
+ * @param numBBoxes Number of records to read
+ * @param labelVec Receives the labels
+ * @param scoreVec Receives the scores
+ * @param bboxVec Receives the boxes; existing entries are kept
+ */
+void getBBoxFromDetectData(const real* detectData,
+                           const size_t numBBoxes,
+                           vector<real>& labelVec,
+                           vector<real>& scoreVec,
+                           vector<NormalizedBBox>& bboxVec) {
+  const size_t base = bboxVec.size();
+  const size_t total = base + numBBoxes;
+  labelVec.resize(total);
+  scoreVec.resize(total);
+  bboxVec.resize(total);
+  const real* record = detectData;
+  for (size_t i = 0; i < numBBoxes; ++i, record += 7) {
+    const size_t dst = base + i;
+    labelVec[dst] = record[1];
+    scoreVec[dst] = record[2];
+    NormalizedBBox& bbox = bboxVec[dst];
+    bbox.xMin = record[3];
+    bbox.yMin = record[4];
+    bbox.xMax = record[5];
+    bbox.yMax = record[6];
+  }
+}
+
+/**
+ * @brief Assign a ground truth box to prior boxes in two passes.
+ *
+ * Pass 1 repeatedly picks the (prior, ground truth) pair with the largest
+ * overlap among unmatched priors and unassigned ground truths, so every
+ * ground truth that overlaps some prior gets one prior regardless of
+ * overlapThreshold. Pass 2 gives each still unmatched prior its most
+ * overlapping ground truth, provided that overlap reaches overlapThreshold.
+ *
+ * @param priorBBoxes Prior boxes
+ * @param gtBBoxes Ground truth boxes
+ * @param overlapThreshold Minimum overlap accepted in pass 2
+ * @param matchIndices Reset to one entry per prior: matched ground truth
+ * index, or -1
+ * @param matchOverlaps Reset to one entry per prior: overlap with the
+ * matched ground truth, or the largest overlap seen when unmatched
+ */
+void matchBBox(const vector<NormalizedBBox>& priorBBoxes,
+               const vector<NormalizedBBox>& gtBBoxes,
+               real overlapThreshold,
+               vector<int>* matchIndices,
+               vector<real>* matchOverlaps) {
+  typedef map<size_t, real> OverlapRow;
+  typedef map<size_t, OverlapRow> OverlapTable;
+
+  const size_t priorCount = priorBBoxes.size();
+  const size_t gtCount = gtBBoxes.size();
+  vector<int>& indices = *matchIndices;
+  vector<real>& bestOverlaps = *matchOverlaps;
+  indices.assign(priorCount, -1);
+  bestOverlaps.assign(priorCount, 0.0);
+
+  // Sparse table: table[prior][gt] exists only for overlaps above 1e-6.
+  OverlapTable table;
+  for (size_t prior = 0; prior < priorCount; ++prior) {
+    for (size_t gt = 0; gt < gtCount; ++gt) {
+      const real overlap = jaccardOverlap(priorBBoxes[prior], gtBBoxes[gt]);
+      if (overlap > 1e-6) {
+        bestOverlaps[prior] = std::max(bestOverlaps[prior], overlap);
+        table[prior][gt] = overlap;
+      }
+    }
+  }
+
+  // Pass 1: greedy one-to-one matching, best remaining pair first.
+  vector<int> unassignedGTs;
+  for (size_t gt = 0; gt < gtCount; ++gt) {
+    unassignedGTs.push_back(gt);
+  }
+  while (!unassignedGTs.empty()) {
+    int bestPrior = -1;
+    int bestGT = -1;
+    real bestOverlap = -1.0;
+    for (OverlapTable::const_iterator row = table.begin(); row != table.end();
+         ++row) {
+      if (indices[row->first] != -1) {
+        continue;  // this prior already owns a ground truth
+      }
+      const OverlapRow& cells = row->second;
+      for (size_t p = 0; p < unassignedGTs.size(); ++p) {
+        const OverlapRow::const_iterator cell = cells.find(unassignedGTs[p]);
+        if (cell == cells.end()) {
+          continue;  // no recorded overlap for this pair
+        }
+        if (cell->second > bestOverlap) {
+          bestPrior = static_cast<int>(row->first);
+          bestGT = unassignedGTs[p];
+          bestOverlap = cell->second;
+        }
+      }
+    }
+    if (bestPrior == -1) {
+      break;  // the remaining ground truths overlap no unmatched prior
+    }
+    indices[bestPrior] = bestGT;
+    bestOverlaps[bestPrior] = bestOverlap;
+    unassignedGTs.erase(
+        std::find(unassignedGTs.begin(), unassignedGTs.end(), bestGT));
+  }
+
+  // Pass 2: leftover priors take their best ground truth above the threshold.
+  for (OverlapTable::const_iterator row = table.begin(); row != table.end();
+       ++row) {
+    const size_t prior = row->first;
+    if (indices[prior] != -1) {
+      continue;
+    }
+    int bestGT = -1;
+    real bestOverlap = -1;
+    // The row is keyed by ground truth index, so this visits the recorded
+    // overlaps in ascending ground truth order.
+    for (OverlapRow::const_iterator cell = row->second.begin();
+         cell != row->second.end();
+         ++cell) {
+      const real overlap = cell->second;
+      if (overlap > bestOverlap && overlap >= overlapThreshold) {
+        bestGT = static_cast<int>(cell->first);
+        bestOverlap = overlap;
+      }
+    }
+    if (bestGT != -1) {
+      indices[prior] = bestGT;
+      bestOverlaps[prior] = bestOverlap;
+    }
+  }
+}
+
+/**
+ * @brief Match priors to ground truth for every image and pick negatives.
+ *
+ * For each of the batchSize images one vector is appended to
+ * *matchIndicesVecPtr (ground truth index per prior, -1 when unmatched) and
+ * one to *negIndicesVecPtr (indices of the selected negative priors).
+ * Images with index >= seqNum, or with no ground truth box, get all -1 and
+ * no negatives. Negative candidates are unmatched priors whose overlap is
+ * below negOverlapThreshold; they are ordered by descending maxConfScore and
+ * at most numPos * negPosRatio of them are kept.
+ *
+ * @param priorValue Prior boxes, shared by all images
+ * @param numPriorBBoxes Number of prior boxes
+ * @param gtValue Ground truth records, 6 values per box
+ * @param gtStartPosPtr Start position of each image's boxes in gtValue;
+ * entry n + 1 is read for every n < seqNum
+ * @param seqNum Number of images that have an entry in gtStartPosPtr
+ * @param maxConfScore Per image, per prior score used to rank negatives
+ * @param batchSize Number of images
+ * @param overlapThreshold Passed to matchBBox
+ * @param negOverlapThreshold Upper overlap bound for negative candidates
+ * @param negPosRatio Maximum negatives per positive
+ * @param matchIndicesVecPtr Output, appended to
+ * @param negIndicesVecPtr Output, appended to
+ * @return (total positives, total negatives) over the whole batch
+ */
+pair<size_t, size_t> generateMatchIndices(
+    const Matrix& priorValue,
+    const size_t numPriorBBoxes,
+    const Matrix& gtValue,
+    const int* gtStartPosPtr,
+    const size_t seqNum,
+    const vector<vector<real>>& maxConfScore,
+    const size_t batchSize,
+    const real overlapThreshold,
+    const real negOverlapThreshold,
+    const size_t negPosRatio,
+    vector<vector<int>>* matchIndicesVecPtr,
+    vector<vector<int>>* negIndicesVecPtr) {
+  vector<NormalizedBBox> sharedPriors;
+  getBBoxFromPriorData(priorValue.getData(), numPriorBBoxes, sharedPriors);
+
+  size_t posTotal = 0;
+  size_t negTotal = 0;
+  for (size_t n = 0; n < batchSize; ++n) {
+    vector<int> matched(numPriorBBoxes, -1);
+    vector<int> negatives;
+    const size_t gtCount =
+        (n < seqNum) ? gtStartPosPtr[n + 1] - gtStartPosPtr[n] : 0;
+
+    if (gtCount != 0) {
+      vector<real> overlaps(numPriorBBoxes, 0.0);
+      vector<NormalizedBBox> gtBBoxes;
+      getBBoxFromLabelData(
+          gtValue.getData() + gtStartPosPtr[n] * 6, gtCount, gtBBoxes);
+      matchBBox(sharedPriors, gtBBoxes, overlapThreshold, &matched, &overlaps);
+
+      // One sweep: count positives and collect negative candidates.
+      size_t posCount = 0;
+      vector<pair<real, size_t>> candidates;
+      for (size_t i = 0; i < matched.size(); ++i) {
+        if (matched[i] != -1) {
+          ++posCount;
+        } else if (overlaps[i] < negOverlapThreshold) {
+          candidates.push_back(std::make_pair(maxConfScore[n][i], i));
+        }
+      }
+
+      const size_t negCount = std::min(
+          static_cast<size_t>(posCount * negPosRatio), candidates.size());
+      std::sort(
+          candidates.begin(), candidates.end(), sortScorePairDescend<size_t>);
+      for (size_t i = 0; i < negCount; ++i) {
+        negatives.push_back(candidates[i].second);
+      }
+      posTotal += posCount;
+      negTotal += negCount;
+    }
+
+    matchIndicesVecPtr->push_back(matched);
+    negIndicesVecPtr->push_back(negatives);
+  }
+  return std::make_pair(posTotal, negTotal);
+}
+
+/**
+ * @brief For every prior of every image, compute the softmax probability of
+ * its highest scoring non-background class.
+ *
+ * confData is read as batchSize blocks of numPriorBBoxes * numClasses
+ * values, numClasses consecutive values per prior. The maximum over all
+ * classes is subtracted before exponentiating.
+ *
+ * @param confData Raw class scores
+ * @param batchSize Number of images
+ * @param numPriorBBoxes Number of priors per image
+ * @param numClasses Number of classes per prior
+ * @param backgroundId Class index excluded from the numerator
+ * @param maxConfScoreVecPtr Cleared, then filled with one vector per image
+ * holding one value per prior
+ */
+void getMaxConfidenceScores(const real* confData,
+                            const size_t batchSize,
+                            const size_t numPriorBBoxes,
+                            const size_t numClasses,
+                            const size_t backgroundId,
+                            vector<vector<real>>* maxConfScoreVecPtr) {
+  maxConfScoreVecPtr->clear();
+  maxConfScoreVecPtr->reserve(batchSize);
+  for (size_t i = 0; i < batchSize; ++i) {
+    vector<real> maxConfScore;
+    maxConfScore.reserve(numPriorBBoxes);
+    for (size_t j = 0; j < numPriorBBoxes; ++j) {
+      // Index with size_t: the product j * numClasses must not be narrowed.
+      const real* scores = confData + j * numClasses;
+      real maxVal = -FLT_MAX;
+      real maxPosVal = -FLT_MAX;
+      for (size_t c = 0; c < numClasses; ++c) {
+        maxVal = std::max<real>(scores[c], maxVal);
+        if (c != backgroundId) {
+          maxPosVal = std::max<real>(scores[c], maxPosVal);
+        }
+      }
+      // Shift by the maximum before exponentiating.
+      real sum = 0.0;
+      for (size_t c = 0; c < numClasses; ++c) {
+        sum += std::exp(scores[c] - maxVal);
+      }
+      maxConfScore.push_back(std::exp(maxPosVal - maxVal) / sum);
+    }
+    confData += numPriorBBoxes * numClasses;
+    maxConfScoreVecPtr->push_back(maxConfScore);
+  }
+}
+
+/**
+ * @brief Ordering predicate that places pairs with a larger score first.
+ * Only the first member is compared; the payload is ignored.
+ */
+template <typename T>
+bool sortScorePairDescend(const pair<real, T>& pair1,
+                          const pair<real, T>& pair2) {
+  return pair2.first < pair1.first;
+}
+
+/**
+ * @brief Explicit specialization for pairs carrying a NormalizedBBox, with
+ * the same ordering as the primary template.
+ */
+template <>
+bool sortScorePairDescend(const pair<real, NormalizedBBox>& pair1,
+                          const pair<real, NormalizedBBox>& pair2) {
+  return pair2.first < pair1.first;
+}
+
+/**
+ * @brief Greedy non-maximum suppression for one class.
+ *
+ * Priors whose score for classIdx exceeds confThreshold are visited in
+ * descending score order (ties keep their prior order), optionally limited
+ * to the topK best. A prior is appended to *indices unless its overlap with
+ * an entry already in *indices fails the test overlap <= nmsThreshold.
+ * Entries present in *indices on entry take part in that comparison.
+ *
+ * @param bboxes Boxes, indexed by prior
+ * @param confScoreData Scores, numClasses consecutive values per prior
+ * @param classIdx Class whose scores are used
+ * @param topK Maximum number of candidates examined, 0 means no limit
+ * @param confThreshold Candidates need a score strictly above this
+ * @param nmsThreshold Largest overlap tolerated between kept boxes
+ * @param numPriorBBoxes Number of priors
+ * @param numClasses Number of classes per prior
+ * @param indices Output, appended to
+ */
+void applyNMSFast(const vector<NormalizedBBox>& bboxes,
+                  const real* confScoreData,
+                  size_t classIdx,
+                  size_t topK,
+                  real confThreshold,
+                  real nmsThreshold,
+                  size_t numPriorBBoxes,
+                  size_t numClasses,
+                  vector<size_t>* indices) {
+  vector<pair<real, size_t>> scores;
+  for (size_t i = 0; i < numPriorBBoxes; ++i) {
+    const real score = confScoreData[i * numClasses + classIdx];
+    if (score > confThreshold) {
+      scores.push_back(std::make_pair(score, i));
+    }
+  }
+  std::stable_sort(scores.begin(), scores.end(), sortScorePairDescend<size_t>);
+  if (topK > 0 && topK < scores.size()) {
+    scores.resize(topK);
+  }
+
+  // Walk the sorted candidates by index; popping the front of the vector on
+  // every step would shift all remaining elements each time.
+  for (size_t s = 0; s < scores.size(); ++s) {
+    const size_t idx = scores[s].second;
+    bool keep = true;
+    for (size_t k = 0; k < indices->size(); ++k) {
+      const real overlap = jaccardOverlap(bboxes[idx], bboxes[(*indices)[k]]);
+      // Written as a negated <= so that a NaN overlap also rejects the box.
+      if (!(overlap <= nmsThreshold)) {
+        keep = false;
+        break;
+      }
+    }
+    if (keep) {
+      indices->push_back(idx);
+    }
+  }
+}
+
+/**
+ * @brief Run per-class NMS for every image and optionally cap the number of
+ * detections per image.
+ *
+ * For each image, applyNMSFast is called for every class except
+ * backgroundId. When keepTopK is non-zero and more than keepTopK boxes
+ * survive, only the keepTopK highest scoring (class, prior) pairs are kept.
+ * One map from class to prior indices is appended per image.
+ *
+ * NOTE(review): confThreshold is declared as size_t but is forwarded to the
+ * real-typed confThreshold of applyNMSFast, so a fractional threshold
+ * passed by a caller would be truncated. Changing the type needs the
+ * declaration in the header to change at the same time.
+ *
+ * @param confData Scores, numPriorBBoxes * numClasses values per image
+ * @param numPriorBBoxes Number of priors per image
+ * @param numClasses Number of classes per prior
+ * @param backgroundId Class that is skipped
+ * @param batchSize Number of images
+ * @param confThreshold Forwarded to applyNMSFast
+ * @param nmsTopK Forwarded to applyNMSFast as topK
+ * @param nmsThreshold Forwarded to applyNMSFast
+ * @param keepTopK Per-image cap, 0 disables it
+ * @param allDecodedBBoxes Decoded boxes, one vector per image
+ * @param allDetectionIndices Output, appended to
+ * @return Total number of detections kept over all images
+ */
+size_t getDetectionIndices(
+    const real* confData,
+    const size_t numPriorBBoxes,
+    const size_t numClasses,
+    const size_t backgroundId,
+    const size_t batchSize,
+    const size_t confThreshold,
+    const size_t nmsTopK,
+    const real nmsThreshold,
+    const size_t keepTopK,
+    const vector<vector<NormalizedBBox>>& allDecodedBBoxes,
+    vector<map<size_t, vector<size_t>>>* allDetectionIndices) {
+  typedef map<size_t, vector<size_t>> ClassIndices;
+
+  size_t keptTotal = 0;
+  for (size_t n = 0; n < batchSize; ++n) {
+    const vector<NormalizedBBox>& imageBBoxes = allDecodedBBoxes[n];
+    const real* imageConf = confData + n * numPriorBBoxes * numClasses;
+
+    ClassIndices perClass;
+    size_t detected = 0;
+    for (size_t c = 0; c < numClasses; ++c) {
+      if (c == backgroundId) continue;
+      vector<size_t>& kept = perClass[c];
+      applyNMSFast(imageBBoxes,
+                   imageConf,
+                   c,
+                   nmsTopK,
+                   confThreshold,
+                   nmsThreshold,
+                   numPriorBBoxes,
+                   numClasses,
+                   &kept);
+      detected += kept.size();
+    }
+
+    if (keepTopK == 0 || detected <= keepTopK) {
+      // No cap requested, or already within it.
+      allDetectionIndices->push_back(perClass);
+      keptTotal += detected;
+      continue;
+    }
+
+    // Rank every surviving (class, prior) pair by score, keep the best ones.
+    vector<pair<real, pair<size_t, size_t>>> ranked;
+    for (ClassIndices::const_iterator it = perClass.begin();
+         it != perClass.end();
+         ++it) {
+      const size_t label = it->first;
+      const vector<size_t>& priors = it->second;
+      for (size_t i = 0; i < priors.size(); ++i) {
+        const size_t idx = priors[i];
+        ranked.push_back(std::make_pair(imageConf[idx * numClasses + label],
+                                        std::make_pair(label, idx)));
+      }
+    }
+    std::sort(ranked.begin(),
+              ranked.end(),
+              sortScorePairDescend<pair<size_t, size_t>>);
+    ranked.resize(keepTopK);
+
+    ClassIndices capped;
+    for (size_t i = 0; i < ranked.size(); ++i) {
+      capped[ranked[i].second.first].push_back(ranked[i].second.second);
+    }
+    allDetectionIndices->push_back(capped);
+    keptTotal += keepTopK;
+  }
+  return keptTotal;
+}
+
+/**
+ * @brief Write the kept detections into out, 7 values per detection:
+ * image index | label | score | xMin | yMin | xMax | yMax.
+ *
+ * Rows are assembled in a temporary numKept x 7 buffer and then copied into
+ * out. Boxes are clipped with clipBBox before being written.
+ *
+ * @param confData Scores, numPriorBBoxes * numClasses values per image
+ * @param numKept Number of rows of the buffer; the detections listed in
+ * allIndices must not exceed it
+ * @param numPriorBBoxes Number of priors per image
+ * @param numClasses Number of classes per prior
+ * @param batchSize Number of images
+ * @param allIndices Per image, map from label to kept prior indices
+ * @param allDecodedBBoxes Per image, decoded boxes indexed by prior
+ * @param out Receives numKept * 7 values
+ */
+void getDetectionOutput(const real* confData,
+                        const size_t numKept,
+                        const size_t numPriorBBoxes,
+                        const size_t numClasses,
+                        const size_t batchSize,
+                        const vector<map<size_t, vector<size_t>>>& allIndices,
+                        const vector<vector<NormalizedBBox>>& allDecodedBBoxes,
+                        Matrix& out) {
+  MatrixPtr outBuffer;
+  // NOTE(review): the two trailing flags are presumably trans and useGpu,
+  // i.e. a plain CPU staging buffer - confirm against Matrix::resizeOrCreate.
+  Matrix::resizeOrCreate(outBuffer, numKept, 7, false, false);
+  real* bufferData = outBuffer->getData();
+  size_t count = 0;
+  for (size_t n = 0; n < batchSize; ++n) {
+    const real* imageConf = confData + n * numPriorBBoxes * numClasses;
+    for (map<size_t, vector<size_t>>::const_iterator it = allIndices[n].begin();
+         it != allIndices[n].end();
+         ++it) {
+      const size_t label = it->first;
+      const vector<size_t>& indices = it->second;
+      const vector<NormalizedBBox>& decodedBBoxes = allDecodedBBoxes[n];
+      for (size_t i = 0; i < indices.size(); ++i) {
+        // The buffer holds exactly numKept rows; fail loudly rather than
+        // write past its end when allIndices lists more detections.
+        CHECK_LT(count, numKept);
+        const size_t idx = indices[i];
+        real* row = bufferData + count * 7;
+        row[0] = n;
+        row[1] = label;
+        row[2] = imageConf[idx * numClasses + label];
+        const NormalizedBBox clippedBBox = clipBBox(decodedBBoxes[idx]);
+        row[3] = clippedBBox.xMin;
+        row[4] = clippedBBox.yMin;
+        row[5] = clippedBBox.xMax;
+        row[6] = clippedBBox.yMax;
+        ++count;
+      }
+    }
+  }
+  out.copyFrom(bufferData, numKept * 7);
+}
+
+/**
+ * @brief Clamp the four coordinates of a box into [0, 1].
+ *
+ * The returned box starts as a copy of the input, so isDifficult is carried
+ * over instead of being left unset.
+ *
+ * @param bbox Box to clip
+ * @return Copy of bbox with every coordinate limited to [0, 1]
+ */
+NormalizedBBox clipBBox(const NormalizedBBox& bbox) {
+  const real realOne = static_cast<real>(1.0);
+  const real realZero = static_cast<real>(0.0);
+  NormalizedBBox clippedBBox = bbox;
+  clippedBBox.xMin = std::max(std::min(bbox.xMin, realOne), realZero);
+  clippedBBox.yMin = std::max(std::min(bbox.yMin, realOne), realZero);
+  clippedBBox.xMax = std::max(std::min(bbox.xMax, realOne), realZero);
+  clippedBBox.yMax = std::max(std::min(bbox.yMax, realOne), realZero);
+  return clippedBBox;
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/DetectionUtil.h b/paddle/gserver/layers/DetectionUtil.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe4f9f075e4cf011c97f68f49598a828d62327b3
--- /dev/null
+++ b/paddle/gserver/layers/DetectionUtil.h
@@ -0,0 +1,307 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <float.h>
+#include <algorithm>
+#include <vector>
+#include "paddle/math/Matrix.h"
+
+using std::vector;
+using std::pair;
+using std::map;
+
+namespace paddle {
+
+/**
+ * @brief Axis-aligned box stored as its two corners, plus a "difficult" flag.
+ * @tparam T Coordinate type
+ */
+template <typename T>
+struct BBoxBase {
+  BBoxBase(T xMin, T yMin, T xMax, T yMax)
+      : xMin(xMin), yMin(yMin), xMax(xMax), yMax(yMax), isDifficult(false) {}
+
+  // Value-initialize every member so a default constructed box is a
+  // well-defined empty box that can be copied and read safely.
+  BBoxBase() : xMin(), yMin(), xMax(), yMax(), isDifficult(false) {}
+
+  T getWidth() const { return xMax - xMin; }
+
+  T getHeight() const { return yMax - yMin; }
+
+  T getCenterX() const { return (xMin + xMax) / 2; }
+
+  T getCenterY() const { return (yMin + yMax) / 2; }
+
+  T getArea() const { return getWidth() * getHeight(); }
+
+  // coordinate of bounding box
+  T xMin;
+  T yMin;
+  T xMax;
+  T yMax;
+  // whether difficult object (e.g. object with heavy occlusion is difficult)
+  bool isDifficult;
+};
+
+/**
+ * @brief BBoxBase instantiated for the real coordinate type. Only default
+ * construction is offered; coordinates are assigned afterwards.
+ */
+struct NormalizedBBox : public BBoxBase<real> {
+  NormalizedBBox() : BBoxBase<real>() {}
+};
+
+/**
+ * @brief Layout permutation selector for appendWithPermute and
+ * decomposeWithPermute.
+ */
+enum PermMode {
+  kNCHWToNHWC,  // the mode accepted by appendWithPermute
+  kNHWCToNCHW   // the mode accepted by decomposeWithPermute
+};
+
+/**
+ * @brief First permute input matrix then append to output matrix
+ */
+size_t appendWithPermute(const Matrix& inMatrix,
+                         size_t height,
+                         size_t width,
+                         size_t outTotalSize,
+                         size_t outOffset,
+                         size_t batchSize,
+                         Matrix& outMatrix,
+                         PermMode permMode);
+
+/**
+ * @brief First permute input matrix then decompose to output
+ */
+size_t decomposeWithPermute(const Matrix& inMatrix,
+                            size_t height,
+                            size_t width,
+                            size_t totalSize,
+                            size_t offset,
+                            size_t batchSize,
+                            Matrix& outMatrix,
+                            PermMode permMode);
+
+/**
+ * @brief Compute jaccard overlap between two bboxes.
+ * @param bbox1 The first bbox
+ * @param bbox2 The second bbox
+ */
+real jaccardOverlap(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2);
+
+/**
+ * @brief Compute offset parameters between prior bbox and ground truth bbox
+ * and variances of prior bbox are considered
+ * @param priorBBox Input prior bbox
+ * @param priorBBoxVar Variance parameters of prior bbox
+ * @param gtBBox Groundtruth bbox
+ * @param outVec Output vector
+ */
+void encodeBBoxWithVar(const NormalizedBBox& priorBBox,
+                       const vector<real>& priorBBoxVar,
+                       const NormalizedBBox& gtBBox,
+                       vector<real>& outVec);
+
+/**
+ * @brief Decode prior bbox with offset parameters
+ * and variances of prior bbox are considered
+ * @param priorBBox Prior bbox to be decoded
+ * @param priorBBoxVar Variance parameters of prior bbox
+ * @param locPredData Offset parameters
+ */
+NormalizedBBox decodeBBoxWithVar(const NormalizedBBox& priorBBox,
+                                 const vector<real>& priorBBoxVar,
+                                 const vector<real>& locPredData);
+
+/**
+ * @brief Extract bboxes from prior matrix, the layout is
+ * xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var ...
+ * @param priorData Matrix of prior value
+ * @param numBBoxes Number of bbox to be extracted
+ * @param bboxVec Append to the vector
+ */
+void getBBoxFromPriorData(const real* priorData,
+                          const size_t numBBoxes,
+                          vector<NormalizedBBox>& bboxVec);
+
+/**
+ * @brief Extract labels, scores and bboxes from detection matrix, the layout is
+ * imageId | label | score | xmin | ymin | xmax | ymax
+ * @param detectData Matrix of detection value
+ * @param numBBoxes Number of bbox to be extracted
+ * @param labelVec Label of bbox
+ * @param scoreVec Score of bbox
+ * @param bboxVec Append to the vector
+ */
+void getBBoxFromDetectData(const real* detectData,
+                           const size_t numBBoxes,
+                           vector<real>& labelVec,
+                           vector<real>& scoreVec,
+                           vector<NormalizedBBox>& bboxVec);
+
+/**
+ * @brief Extract variances from prior matrix, the layout is
+ * xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var ...
+ * @param priorData Matrix of prior value
+ * @param num Number to be extracted
+ * @param varVec Append to the vector
+ */
+void getBBoxVarFromPriorData(const real* priorData,
+                             const size_t num,
+                             vector<vector<real>>& varVec);
+
+/**
+ * @brief Extract bboxes from label matrix, the layout is
+ * class1_1 | xmin1_1 | ymin1_1 | xmax1_1 | ymax1_1 | difficult1_1 | ...
+ * @param labelData Matrix of label value
+ * @param numBBoxes Number to be extracted
+ * @param bboxVec Append to the vector
+ */
+void getBBoxFromLabelData(const real* labelData,
+                          const size_t numBBoxes,
+                          vector<NormalizedBBox>& bboxVec);
+
+/**
+ * @brief Match prior bbox to groundtruth bbox, the strategy is:
+ * 1. Find the most overlapped bbox pair (prior and groundtruth)
+ * 2. For rest of prior bboxes find the most overlapped groundtruth bbox
+ * @param priorBBoxes prior bbox
+ * @param gtBBoxes groundtruth bbox
+ * @param overlapThreshold Low boundary of overlap (judge whether matched)
+ * @param matchIndices For each prior bbox, groundtruth bbox index if matched
+ * otherwise -1
+ * @param matchOverlaps For each prior bbox, overlap with its matched
+ * groundtruth bbox, or its largest overlap with any groundtruth bbox if
+ * unmatched
+ */
+void matchBBox(const vector<NormalizedBBox>& priorBBoxes,
+               const vector<NormalizedBBox>& gtBBoxes,
+               real overlapThreshold,
+               vector<int>* matchIndices,
+               vector<real>* matchOverlaps);
+
+/**
+ * @brief Generate positive bboxes and negative bboxes, the number of
+ * negative bboxes is at most negPosRatio times the number of positive bboxes
+ * @param priorValue Prior value
+ * @param numPriorBBoxes Number of prior bbox
+ * @param gtValue Groundtruth value
+ * @param gtStartPosPtr Since groundtruth value stored as sequence type,
+ * this parameter indicates start position of each record
+ * @param seqNum Number of sequence
+ * @param maxConfScore Classification score for prior bbox, used to mine
+ * negative examples
+ * @param batchSize Image number
+ * @param overlapThreshold Low boundary of overlap
+ * @param negOverlapThreshold Upper boundary of overlap (judge negative
+ * example)
+ * @param negPosRatio Control number of negative bboxes
+ * @param matchIndicesVecPtr Save indices of matched prior bbox
+ * @param negIndicesVecPtr Save indices of negative prior bbox
+ * @return Pair of (total positive bbox number, total negative bbox number)
+ */
+pair<size_t, size_t> generateMatchIndices(
+    const Matrix& priorValue,
+    const size_t numPriorBBoxes,
+    const Matrix& gtValue,
+    const int* gtStartPosPtr,
+    const size_t seqNum,
+    const vector<vector<real>>& maxConfScore,
+    const size_t batchSize,
+    const real overlapThreshold,
+    const real negOverlapThreshold,
+    const size_t negPosRatio,
+    vector<vector<int>>* matchIndicesVecPtr,
+    vector<vector<int>>* negIndicesVecPtr);
+
+/**
+ * @brief Get max confidence score for each prior bbox
+ * @param confData Confidence scores, layout is
+ * class1 score | class2 score | ... | classN score ...
+ * @param batchSize Image number
+ * @param numPriorBBoxes Prior bbox number
+ * @param numClasses Classes number
+ * @param backgroundId Background id
+ * @param maxConfScoreVecPtr Output
+ */
+void getMaxConfidenceScores(const real* confData,
+                            const size_t batchSize,
+                            const size_t numPriorBBoxes,
+                            const size_t numClasses,
+                            const size_t backgroundId,
+                            vector<vector<real>>* maxConfScoreVecPtr);
+
+template <typename T>
+bool sortScorePairDescend(const pair<real, T>& pair1,
+                          const pair<real, T>& pair2);
+
+template <>
+bool sortScorePairDescend(const pair<real, NormalizedBBox>& pair1,
+                          const pair<real, NormalizedBBox>& pair2);
+
+/**
+ * @brief Do NMS for bboxes to remove duplicated bboxes
+ * @param bboxes BBoxes to apply NMS
+ * @param confScoreData Confidence scores
+ * @param classIdx Class to do NMS
+ * @param topK Number to keep
+ * @param confThreshold Low boundary of confidence score
+ * @param nmsThreshold Threshold of overlap
+ * @param numPriorBBoxes Total number of prior bboxes
+ * @param numClasses Total class number
+ * @param indices Indices of high quality bboxes
+ */
+void applyNMSFast(const vector<NormalizedBBox>& bboxes,
+                  const real* confScoreData,
+                  size_t classIdx,
+                  size_t topK,
+                  real confThreshold,
+                  real nmsThreshold,
+                  size_t numPriorBBoxes,
+                  size_t numClasses,
+                  vector<size_t>* indices);
+
+/**
+ * @brief Get detection results which satisfy requirements
+ * @param confData Confidence scores
+ * @param numPriorBBoxes Prior bbox number
+ * @param numClasses Class number
+ * @param backgroundId Background class
+ * @param batchSize Image number
+ * @param confThreshold Threshold of class confidence
+ * @param nmsTopK Used in NMS operation to keep top k bbox
+ * @param nmsThreshold Used in NMS, threshold of overlap
+ * @param keepTopK How many bboxes are kept in an image
+ * @param allDecodedBBoxes Decoded bboxes for all images
+ * @param allDetectionIndices Save detection bbox indices
+ * @return Total number of bboxes kept over all images
+ */
+size_t getDetectionIndices(
+    const real* confData,
+    const size_t numPriorBBoxes,
+    const size_t numClasses,
+    const size_t backgroundId,
+    const size_t batchSize,
+    const size_t confThreshold,
+    const size_t nmsTopK,
+    const real nmsThreshold,
+    const size_t keepTopK,
+    const vector<vector<NormalizedBBox>>& allDecodedBBoxes,
+    vector<map<size_t, vector<size_t>>>* allDetectionIndices);
+
+/**
+ * @brief Get detection results
+ * @param confData Confidence scores
+ * @param numKept Number of detections to output (rows of the output)
+ * @param numPriorBBoxes Prior bbox number
+ * @param numClasses Class number
+ * @param batchSize Image number
+ * @param allIndices Indices of predicted bboxes
+ * @param allDecodedBBoxes BBoxes decoded
+ * @param out Output matrix, each row is
+ * image number | label | confidence score | xMin | yMin | xMax | yMax
+ */
+void getDetectionOutput(const real* confData,
+                        const size_t numKept,
+                        const size_t numPriorBBoxes,
+                        const size_t numClasses,
+                        const size_t batchSize,
+                        const vector<map<size_t, vector<size_t>>>& allIndices,
+                        const vector<vector<NormalizedBBox>>& allDecodedBBoxes,
+                        Matrix& out);
+
+NormalizedBBox clipBBox(const NormalizedBBox& bbox);
+
+}  // namespace paddle