diff --git a/paddle/gserver/layers/DetectionOutputLayer.cpp b/paddle/gserver/layers/DetectionOutputLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2a4d7f8b5b25c1f83e55a11201a879ce2663c938 --- /dev/null +++ b/paddle/gserver/layers/DetectionOutputLayer.cpp @@ -0,0 +1,154 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "DetectionOutputLayer.h" + +namespace paddle { + +REGISTER_LAYER(detection_output, DetectionOutputLayer); + +bool DetectionOutputLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + auto& layerConf = config_.inputs(0).detection_output_conf(); + numClasses_ = layerConf.num_classes(); + inputNum_ = layerConf.input_num(); + nmsThreshold_ = layerConf.nms_threshold(); + confidenceThreshold_ = layerConf.confidence_threshold(); + nmsTopK_ = layerConf.nms_top_k(); + keepTopK_ = layerConf.keep_top_k(); + backgroundId_ = layerConf.background_id(); + return true; +} + +void DetectionOutputLayer::forward(PassType passType) { + Layer::forward(passType); + size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight(); + + locSizeSum_ = 0; + confSizeSum_ = 0; + for (size_t n = 0; n < inputNum_; ++n) { + const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n)); + const MatrixPtr inConf = getInputValue(*getConfInputLayer(n)); + locSizeSum_ += inLoc->getElementCnt(); + confSizeSum_ += inConf->getElementCnt(); + } + + Matrix::resizeOrCreate(locTmpBuffer_, 1, locSizeSum_, false, useGpu_); + Matrix::resizeOrCreate( + confTmpBuffer_, confSizeSum_ / numClasses_, numClasses_, false, useGpu_); + locBuffer_ = locTmpBuffer_; + confBuffer_ = confTmpBuffer_; + + size_t locOffset = 0; + size_t confOffset = 0; + auto& layerConf = config_.inputs(0).detection_output_conf(); + for (size_t n = 0; n < inputNum_; ++n) { + const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n)); + const MatrixPtr inConf = getInputValue(*getConfInputLayer(n)); + + size_t height = getInput(*getLocInputLayer(n)).getFrameHeight(); + if (!height) height = layerConf.height(); + size_t width = getInput(*getLocInputLayer(n)).getFrameWidth(); + if (!width) width = layerConf.width(); + locOffset += appendWithPermute(*inLoc, + height, + width, + locSizeSum_, + locOffset, + batchSize, + *locBuffer_, + kNCHWToNHWC); + confOffset += appendWithPermute(*inConf, + height, + width, + confSizeSum_, + confOffset, + batchSize, + *confBuffer_, + kNCHWToNHWC); + } + CHECK_EQ(locOffset, locSizeSum_ / batchSize); + CHECK_EQ(confOffset, confSizeSum_ / batchSize); + + MatrixPtr priorValue; + if (useGpu_) { + Matrix::resizeOrCreate(locCpuBuffer_, 1, locSizeSum_, false, false); + Matrix::resizeOrCreate( + confCpuBuffer_, confSizeSum_ / numClasses_, numClasses_, false, false); + MatrixPtr priorTmpValue = getInputValue(*getPriorBoxLayer()); + Matrix::resizeOrCreate( + priorCpuValue_, 1, priorTmpValue->getElementCnt(), false, false); + + locCpuBuffer_->copyFrom(*locTmpBuffer_); + 
confCpuBuffer_->copyFrom(*confTmpBuffer_); + priorCpuValue_->copyFrom(*priorTmpValue); + + locBuffer_ = locCpuBuffer_; + confBuffer_ = confCpuBuffer_; + priorValue = priorCpuValue_; + } else { + priorValue = getInputValue(*getPriorBoxLayer()); + } + confBuffer_->softmax(*confBuffer_); + + size_t numPriors = priorValue->getElementCnt() / 8; + vector> allDecodedBBoxes; + for (size_t n = 0; n < batchSize; ++n) { + vector decodedBBoxes; + for (size_t i = 0; i < numPriors; ++i) { + size_t priorOffset = i * 8; + size_t locPredOffset = n * numPriors * 4 + i * 4; + vector priorBBoxVec; + getBBoxFromPriorData( + priorValue->getData() + priorOffset, 1, priorBBoxVec); + vector> priorBBoxVar; + getBBoxVarFromPriorData( + priorValue->getData() + priorOffset, 1, priorBBoxVar); + vector locPredData; + for (size_t j = 0; j < 4; ++j) + locPredData.push_back(*(locBuffer_->getData() + locPredOffset + j)); + NormalizedBBox bbox = + decodeBBoxWithVar(priorBBoxVec[0], priorBBoxVar[0], locPredData); + decodedBBoxes.push_back(bbox); + } + allDecodedBBoxes.push_back(decodedBBoxes); + } + + vector>> allIndices; + size_t numKept = getDetectionIndices(confBuffer_->getData(), + numPriors, + numClasses_, + backgroundId_, + batchSize, + confidenceThreshold_, + nmsTopK_, + nmsThreshold_, + keepTopK_, + allDecodedBBoxes, + &allIndices); + + resetOutput(numKept, 7); + MatrixPtr outV = getOutputValue(); + getDetectionOutput(confBuffer_->getData(), + numKept, + numPriors, + numClasses_, + batchSize, + allIndices, + allDecodedBBoxes, + *outV); +} + +} // namespace paddle diff --git a/paddle/gserver/layers/DetectionOutputLayer.h b/paddle/gserver/layers/DetectionOutputLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..38271cb05408d21553e9a34757d16d46c08f2401 --- /dev/null +++ b/paddle/gserver/layers/DetectionOutputLayer.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "DetectionUtil.h" +#include "Layer.h" + +using std::vector; +using std::map; +using std::pair; + +namespace paddle { + +/** + * The detection output layer for a SSD detection task. This layer apply the + * Non-maximum suppression to the all predicted bounding box and keep the + * Top-K bounding boxes. + * - Input: This layer need three input layers: This first input layer + * is the priorbox layer. The rest two input layers are convolution + * layers for generating bbox location offset and the classification + * confidence. + * - Output: The predict bounding box location. 
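+ *           Each row of the output matrix holds seven values: the sample
+ *           index within the batch, the predicted label, the confidence
+ *           score and the decoded bbox coordinates (xmin, ymin, xmax, ymax).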
+ */ + +class DetectionOutputLayer : public Layer { +public: + explicit DetectionOutputLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + + void forward(PassType passType); + + void backward(const UpdateCallback& callback = nullptr) {} + +protected: + inline LayerPtr getPriorBoxLayer() { return inputLayers_[0]; } + + inline LayerPtr getLocInputLayer(size_t index) { + return inputLayers_[1 + index]; + } + + inline LayerPtr getConfInputLayer(size_t index) { + return inputLayers_[1 + inputNum_ + index]; + } + +private: + size_t numClasses_; // number of classes + size_t inputNum_; // number of input layers + real nmsThreshold_; + real confidenceThreshold_; + size_t nmsTopK_; + size_t keepTopK_; + size_t backgroundId_; + + size_t locSizeSum_; + size_t confSizeSum_; + + MatrixPtr locBuffer_; + MatrixPtr confBuffer_; + MatrixPtr locTmpBuffer_; + MatrixPtr confTmpBuffer_; + MatrixPtr priorCpuValue_; + MatrixPtr locCpuBuffer_; + MatrixPtr confCpuBuffer_; +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/MultiBoxLossLayer.cpp b/paddle/gserver/layers/MultiBoxLossLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..27a2cc3fa4a9ecb51430e95f940982d131aa4c4b --- /dev/null +++ b/paddle/gserver/layers/MultiBoxLossLayer.cpp @@ -0,0 +1,365 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "MultiBoxLossLayer.h" +#include +#include +#include "DataLayer.h" + +using std::vector; +using std::map; +using std::pair; + +namespace paddle { + +REGISTER_LAYER(multibox_loss, MultiBoxLossLayer); + +bool MultiBoxLossLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + + auto layerConf = config_.inputs(0).multibox_loss_conf(); + numClasses_ = layerConf.num_classes(); + inputNum_ = layerConf.input_num(); + overlapThreshold_ = layerConf.overlap_threshold(); + negPosRatio_ = layerConf.neg_pos_ratio(); + negOverlap_ = layerConf.neg_overlap(); + backgroundId_ = layerConf.background_id(); + return true; +} + +void MultiBoxLossLayer::forward(PassType passType) { + Layer::forward(passType); + size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight(); + resetOutput(batchSize, 1); + + // all location data and confidence score data + locSizeSum_ = 0; + confSizeSum_ = 0; + for (size_t n = 0; n < inputNum_; ++n) { + const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n)); + const MatrixPtr inConf = getInputValue(*getConfInputLayer(n)); + locSizeSum_ += inLoc->getElementCnt(); + confSizeSum_ += inConf->getElementCnt(); + } + + // locBuffer layout: + // | xmin1 | ymin1 | xmax1 | ymax1 | xmin2 ...... + Matrix::resizeOrCreate(locTmpBuffer_, 1, locSizeSum_, false, useGpu_); + locBuffer_ = locTmpBuffer_; + + // confBuffer layout: + // | class1 score | class2 score | ... |classN score | class1 score | ...... 
+ Matrix::resizeOrCreate(confTmpBuffer_, 1, confSizeSum_, false, useGpu_); + confBuffer_ = confTmpBuffer_; + + // concate location data and confidence score data + size_t locOffset = 0; + size_t confOffset = 0; + auto& layerConf = config_.inputs(0).multibox_loss_conf(); + for (size_t n = 0; n < inputNum_; ++n) { + const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n)); + const MatrixPtr inConf = getInputValue(*getConfInputLayer(n)); + size_t height = getInput(*getLocInputLayer(n)).getFrameHeight(); + if (!height) height = layerConf.height(); + size_t width = getInput(*getLocInputLayer(n)).getFrameWidth(); + if (!width) width = layerConf.width(); + locOffset += appendWithPermute(*inLoc, + height, + width, + locSizeSum_, + locOffset, + batchSize, + *locBuffer_, + kNCHWToNHWC); + confOffset += appendWithPermute(*inConf, + height, + width, + confSizeSum_, + confOffset, + batchSize, + *confBuffer_, + kNCHWToNHWC); + } + CHECK_EQ(locOffset, locSizeSum_ / batchSize); + CHECK_EQ(confOffset, confSizeSum_ / batchSize); + + // priorValue layout: + // | xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var + // | xmin2 | ...... + MatrixPtr priorValue; + + // labelValue layout: + // | class1_1 | xmin1_1 | ymin1_1 | xmax1_1 | ymax1_1 | difficult1_1 | ...... + MatrixPtr labelValue; + + // Copy data from GPU to CPU if use GPU + if (useGpu_) { + Matrix::resizeOrCreate(locCpuBuffer_, 1, locSizeSum_, false, false); + Matrix::resizeOrCreate(confCpuBuffer_, 1, confSizeSum_, false, false); + MatrixPtr priorTmpValue = getInputValue(*getPriorBoxLayer()); + Matrix::resizeOrCreate( + priorCpuValue_, 1, priorTmpValue->getElementCnt(), false, false); + MatrixPtr labelTmpValue = getInputValue(*getLabelLayer()); + Matrix::resizeOrCreate(labelCpuValue_, + labelTmpValue->getHeight(), + labelTmpValue->getWidth(), + false, + false); + + locCpuBuffer_->copyFrom(*locTmpBuffer_); + confCpuBuffer_->copyFrom(*confTmpBuffer_); + priorCpuValue_->copyFrom(*priorTmpValue); + labelCpuValue_->copyFrom(*labelTmpValue); + + locBuffer_ = locCpuBuffer_; + confBuffer_ = confCpuBuffer_; + priorValue = priorCpuValue_; + labelValue = labelCpuValue_; + } else { + priorValue = getInputValue(*getPriorBoxLayer()); + labelValue = getInputValue(*getLabelLayer()); + } + + // Get max scores for each prior bbox. 
Used in negative mining + vector> allMaxConfScore; + numPriors_ = priorValue->getElementCnt() / 8; + getMaxConfidenceScores(confBuffer_->getData(), + batchSize, + numPriors_, + numClasses_, + backgroundId_, + &allMaxConfScore); + + // Match prior bbox to groundtruth bbox + Argument label = getInput(*getLabelLayer()); + const int* labelIndex = label.sequenceStartPositions->getData(false); + size_t seqNum = label.getNumSequences(); + numMatches_ = 0; + numNegs_ = 0; + allMatchIndices_.clear(); + allNegIndices_.clear(); + + pair retPair = generateMatchIndices(*priorValue, + numPriors_, + *labelValue, + labelIndex, + seqNum, + allMaxConfScore, + batchSize, + overlapThreshold_, + negOverlap_, + negPosRatio_, + &allMatchIndices_, + &allNegIndices_); + numMatches_ = retPair.first; + numNegs_ = retPair.second; + + // BBox location L1 smooth loss + locLoss_ = 0.0; + if (numMatches_ >= 1) { + size_t count = 0; + MatrixPtr locLossOutput; + Matrix::resizeOrCreate(locLossOutput, numMatches_ * 4, 1, false, false); + Matrix::resizeOrCreate(locGTData_, numMatches_ * 4, 1, false, false); + Matrix::resizeOrCreate(locDiff_, numMatches_ * 4, 1, false, false); + locDiff_->zeroMem(); + vector locGTData; + + for (size_t n = 0; n < batchSize; ++n) { + for (size_t i = 0; i < numPriors_; ++i) { + if (allMatchIndices_[n][i] == -1) continue; // match none + size_t locOffset = + n * (locBuffer_->getElementCnt() / batchSize) + i * 4; + locDiff_->getData()[count++] = (locBuffer_->getData() + locOffset)[0]; + locDiff_->getData()[count++] = (locBuffer_->getData() + locOffset)[1]; + locDiff_->getData()[count++] = (locBuffer_->getData() + locOffset)[2]; + locDiff_->getData()[count++] = (locBuffer_->getData() + locOffset)[3]; + + const int gtIdx = allMatchIndices_[n][i]; + size_t priorOffset = i * 8; + vector priorBBoxVec; + getBBoxFromPriorData( + priorValue->getData() + priorOffset, 1, priorBBoxVec); + vector> priorBBoxVar; + getBBoxVarFromPriorData( + priorValue->getData() + priorOffset, 1, priorBBoxVar); + size_t labelOffset = (labelIndex[n] + gtIdx) * 6; + vector gtBBoxVec; + getBBoxFromLabelData(labelValue->getData() + labelOffset, 1, gtBBoxVec); + vector gtEncode; + encodeBBoxWithVar( + priorBBoxVec[0], priorBBoxVar[0], gtBBoxVec[0], gtEncode); + locGTData.insert(locGTData.end(), gtEncode.begin(), gtEncode.end()); + } + } + locGTData_->copyFrom(&locGTData[0], numMatches_ * 4); + locLossOutput->smoothL1(*locDiff_, *locGTData_, 0.0); + locLoss_ = locLossOutput->getSum() / numMatches_; + } + + // BBox confidence softmax loss + confLoss_ = 0; + numConf_ = numMatches_ + numNegs_; + if (numConf_ >= 1) { + Matrix::resizeOrCreate(confProb_, numConf_, numClasses_, false, false); + IVector::resizeOrCreate(confGTData_, numConf_, false); + confProb_->zeroMem(); + size_t count = 0; + + vector confPredData; + for (size_t n = 0; n < batchSize; ++n) { + for (size_t i = 0; i < numPriors_; ++i) { + if (allMatchIndices_[n][i] == -1) continue; + size_t labelOffset = (labelIndex[n] + allMatchIndices_[n][i]) * 6; + const int gtLabel = (labelValue->getData() + labelOffset)[0]; + confGTData_->getData()[count] = gtLabel; + size_t confOffset = n * numPriors_ * numClasses_ + i * numClasses_; + for (size_t j = 0; j < numClasses_; ++j) { + confProb_->getData()[count * numClasses_ + j] = + (confBuffer_->getData() + confOffset)[j]; + confPredData.push_back((confBuffer_->getData() + confOffset)[j]); + } + ++count; + } + // Negative mining samples + for (size_t i = 0; i < allNegIndices_[n].size(); ++i) { + confGTData_->getData()[count] = 
backgroundId_; + size_t confOffset = + n * numPriors_ * numClasses_ + allNegIndices_[n][i] * numClasses_; + for (size_t j = 0; j < numClasses_; ++j) { + confProb_->getData()[count * numClasses_ + j] = + (confBuffer_->getData() + confOffset)[j]; + confPredData.push_back((confBuffer_->getData() + confOffset)[j]); + } + count++; + } + } + confProb_->softmax(*confProb_); + MatrixPtr confLossOutput; + Matrix::resizeOrCreate(confLossOutput, numConf_, 1, false, false); + confLossOutput->oneHotCrossEntropy(*confProb_, *confGTData_); + confLoss_ = confLossOutput->getSum() / numMatches_; + } + real loss = locLoss_ + confLoss_; + MatrixPtr outV = getOutputValue(); + vector tmp(batchSize, loss); + outV->copyFrom(&tmp[0], batchSize); +} + +void MultiBoxLossLayer::backward(const UpdateCallback& callback) { + size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight(); + locBuffer_->zeroMem(); + confBuffer_->zeroMem(); + + // Back propagate on location prediction + if (numMatches_ >= 1) { + MatrixPtr locDiffBuffer; + Matrix::resizeOrCreate(locDiffBuffer, numMatches_ * 4, 1, false, false); + locDiffBuffer->smoothL1Bp(*locDiff_, *locGTData_, 0.0); + locDiff_->copyFrom(*locDiffBuffer); + // scale gradient + for (size_t i = 0; i < numMatches_ * 4; ++i) + locDiff_->getData()[i] *= (1. / numMatches_); + // Copy gradient back + size_t count = 0; + for (size_t n = 0; n < batchSize; ++n) + for (size_t i = 0; i < numPriors_; ++i) { + if (allMatchIndices_[n][i] == -1) continue; + real* locDiffData = locBuffer_->getData() + n * numPriors_ * 4 + i * 4; + locDiffData[0] = (locDiff_->getData() + count * 4)[0]; + locDiffData[1] = (locDiff_->getData() + count * 4)[1]; + locDiffData[2] = (locDiff_->getData() + count * 4)[2]; + locDiffData[3] = (locDiff_->getData() + count * 4)[3]; + ++count; + } + CHECK_EQ(count, numMatches_); + } + + if (numConf_ >= 1) { + for (size_t i = 0; i < numConf_; ++i) + confProb_->getData()[i * numClasses_ + confGTData_->getData()[i]] -= 1; + for (size_t i = 0; i < numConf_ * numClasses_; ++i) + confProb_->getData()[i] *= (1. 
/ numMatches_); + size_t count = 0; + for (size_t n = 0; n < batchSize; ++n) { + for (size_t i = 0; i < numPriors_; ++i) { + if (allMatchIndices_[n][i] == -1) continue; + real* confDiffData = confBuffer_->getData() + + n * numPriors_ * numClasses_ + i * numClasses_; + for (size_t j = 0; j < numClasses_; ++j) + confDiffData[j] = (confProb_->getData() + count * numClasses_)[j]; + ++count; + } + for (size_t i = 0; i < allNegIndices_[n].size(); ++i) { + int idx = allNegIndices_[n][i]; + real* confDiffData = confBuffer_->getData() + + n * numPriors_ * numClasses_ + idx * numClasses_; + for (size_t j = 0; j < numClasses_; ++j) + confDiffData[j] = (confProb_->getData() + count * numClasses_)[j]; + ++count; + } + } + CHECK_EQ(count, numConf_); + } + if (useGpu_) { + locTmpBuffer_->copyFrom(*locCpuBuffer_); + confTmpBuffer_->copyFrom(*confCpuBuffer_); + locBuffer_ = locTmpBuffer_; + confBuffer_ = confTmpBuffer_; + } + // copy back + size_t locOffset = 0; + size_t confOffset = 0; + auto layerConf = config_.inputs(0).multibox_loss_conf(); + for (size_t n = 0; n < inputNum_; ++n) { + const MatrixPtr inLocG = getInputGrad(*getLocInputLayer(n)); + const MatrixPtr inConfG = getInputGrad(*getConfInputLayer(n)); + size_t height = getInput(*getLocInputLayer(n)).getFrameHeight(); + if (!height) height = layerConf.height(); + size_t width = getInput(*getLocInputLayer(n)).getFrameWidth(); + if (!width) width = layerConf.width(); + + // NHWC to NCHW + MatrixPtr locGBuffer; + Matrix::resizeOrCreate( + locGBuffer, inLocG->getHeight(), inLocG->getWidth(), false, useGpu_); + MatrixPtr confGBuffer; + Matrix::resizeOrCreate( + confGBuffer, inConfG->getHeight(), inConfG->getWidth(), false, useGpu_); + + locOffset += decomposeWithPermute(*locBuffer_, + height, + width, + locSizeSum_, + locOffset, + batchSize, + *locGBuffer, + kNHWCToNCHW); + inLocG->add(*locGBuffer); + confOffset += decomposeWithPermute(*confBuffer_, + height, + width, + confSizeSum_, + confOffset, + batchSize, + *confGBuffer, + kNHWCToNCHW); + inConfG->add(*confGBuffer); + } + CHECK_EQ(locOffset, locSizeSum_ / batchSize); + CHECK_EQ(confOffset, confSizeSum_ / batchSize); +} + +} // namespace paddle diff --git a/paddle/gserver/layers/MultiBoxLossLayer.h b/paddle/gserver/layers/MultiBoxLossLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..9767fed7f1c13477a264d975921f3f5fac1ed83a --- /dev/null +++ b/paddle/gserver/layers/MultiBoxLossLayer.h @@ -0,0 +1,103 @@ +/* copyright (c) 2016 paddlepaddle authors. all rights reserve. + +licensed under the apache license, version 2.0 (the "license"); +you may not use this file except in compliance with the license. +you may obtain a copy of the license at + + http://www.apache.org/licenses/license-2.0 + +unless required by applicable law or agreed to in writing, software +distributed under the license is distributed on an "as is" basis, +without warranties or conditions of any kind, either express or implied. +see the license for the specific language governing permissions and +limitations under the license. */ + +#pragma once + +#include +#include "CostLayer.h" +#include "DataLayer.h" +#include "DetectionUtil.h" +#include "Layer.h" + +using std::vector; +using std::pair; + +namespace paddle { + +/** + * The multibox loss layer for a SSD detection task. + * The loss is composed by the location loss and the confidence loss. + * The location loss is a smooth L1 loss and the confidence loss is + * a softmax loss. 
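+ * Both losses are summed over the matched prior bboxes and normalized by
+ * the number of matches, i.e. loss = (location loss + confidence loss) /
+ * (number of matched prior bboxes), and the resulting scalar is replicated
+ * for every sample of the batch.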
+ * - Input: This layer need four input layers: This first input layer + * is the priorbox layer and the second layer is a label layer. + * The rest two input layers are convolution layers for generating + * bbox location offset and the classification confidence. + * - Output: The Single Shot Multibox Detection loss value. + * Reference: + * Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, + * Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector + */ + +class MultiBoxLossLayer : public CostLayer { +public: + explicit MultiBoxLossLayer(const LayerConfig& config) : CostLayer(config) {} + + bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + + void forward(PassType passType); + + void backward(const UpdateCallback& callback = nullptr); + + void forwardImp(Matrix& output, Argument& label, Matrix& cost) {} + + void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad) {} + +protected: + inline LayerPtr getPriorBoxLayer() { return inputLayers_[0]; } + inline LayerPtr getLabelLayer() { return inputLayers_[1]; } + inline LayerPtr getLocInputLayer(size_t index) { + return inputLayers_[2 + index]; + } + inline LayerPtr getConfInputLayer(size_t index) { + return inputLayers_[2 + inputNum_ + index]; + } + +protected: + size_t numClasses_; + real overlapThreshold_; + real negPosRatio_; + real negOverlap_; + size_t inputNum_; + size_t backgroundId_; + + real locLoss_; + real confLoss_; + + size_t numPriors_; + size_t numMatches_; + size_t numNegs_; + size_t numConf_; + size_t locSizeSum_; + size_t confSizeSum_; + + vector> allMatchIndices_; + vector> allNegIndices_; + MatrixPtr locGTData_; + IVectorPtr confGTData_; + + MatrixPtr locBuffer_; + MatrixPtr confBuffer_; + MatrixPtr locDiff_; + MatrixPtr confProb_; + + MatrixPtr labelCpuValue_; + MatrixPtr priorCpuValue_; + MatrixPtr locCpuBuffer_; + MatrixPtr confCpuBuffer_; + MatrixPtr locTmpBuffer_; + MatrixPtr confTmpBuffer_; +}; + +} // namespace paddle diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt index 3c4128b5b8a0ea420bd3027b9a36e5f75087c3cb..92f6cbcfe5a0e23c5939b1689a3e339367450387 100644 --- a/paddle/gserver/tests/CMakeLists.txt +++ b/paddle/gserver/tests/CMakeLists.txt @@ -45,6 +45,13 @@ add_unittest_without_exec(test_PriorBox add_test(NAME test_PriorBox COMMAND test_PriorBox) +################# test_DetectionOutput ####################### +add_unittest_without_exec(test_DetectionOutput + test_DetectionOutput.cpp + LayerGradUtil.cpp) + +add_test(NAME test_DetectionOutput + COMMAND test_DetectionOutput) ################# test_ConvUnify ####################### add_unittest_without_exec(test_ConvUnify test_ConvUnify.cpp diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp index a0b1cd471dd02fd20bb2247395bdb74651610bbf..e3591ba4df88f547e48bf07d4339d5f25db95e81 100644 --- a/paddle/gserver/tests/LayerGradUtil.cpp +++ b/paddle/gserver/tests/LayerGradUtil.cpp @@ -387,6 +387,31 @@ void initDataLayer(TestConfig testConf, data.value->sigmoid(*data.value); data.grad->zeroMem(); break; + case INPUT_SELF_DEFINE_DATA: { + size_t height = testConf.inputDefs[i].selfDefinedData->getHeight(); + size_t width = testConf.inputDefs[i].selfDefinedData->getWidth(); + CHECK_GT(static_cast(height), 0); + CHECK_GT(static_cast(width), 0); + data.value = Matrix::create(height, width, false, useGpu); + data.grad = Matrix::create(height, width, false, useGpu); + data.value->copyFrom(*testConf.inputDefs[i].selfDefinedData); 
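+ // The user-supplied matrix provides the forward value; the gradient
+ // buffer is zero-initialized.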
+ data.grad->zeroMem(); + + const std::vector& labelSeqStartPositions = + testConf.inputDefs[i].labelSeqStartPositions; + if (labelSeqStartPositions.size() != 0) { + CHECK(!sequenceStartPositions); + CHECK_GE(static_cast(labelSeqStartPositions.size()), 2); + + sequenceStartPositions = + ICpuGpuVector::create(labelSeqStartPositions.size(), useGpu); + sequenceStartPositions->copyFrom(labelSeqStartPositions.data(), + labelSeqStartPositions.size(), + useGpu); + data.sequenceStartPositions = sequenceStartPositions; + } + break; + } default: LOG(FATAL) << " unknown inputType "; return; diff --git a/paddle/gserver/tests/LayerGradUtil.h b/paddle/gserver/tests/LayerGradUtil.h index 9f68eb64d0b4ad27306d3b20387d74a7e438d910..18a6525a145fbf7539e8e84bd162a3b4345394dc 100644 --- a/paddle/gserver/tests/LayerGradUtil.h +++ b/paddle/gserver/tests/LayerGradUtil.h @@ -31,7 +31,8 @@ enum InputType { INPUT_SEQUENCE_LABEL, INPUT_SPARSE_NON_VALUE_DATA, INPUT_SPARSE_FLOAT_VALUE_DATA, - INPUT_DENSE_DIM_DATA, // using sequence length to init dense data + INPUT_DENSE_DIM_DATA, // using sequence length to init dense data + INPUT_SELF_DEFINE_DATA, // support customizing for input value }; struct ParaSparse { @@ -66,6 +67,7 @@ struct InputDef { bool isStatic; std::vector labelInitValue; std::vector labelSeqStartPositions; + MatrixPtr selfDefinedData; InputDef(InputType type, string nameIn, size_t dimIn, size_t sizeIn) { inputType = type; @@ -76,6 +78,20 @@ struct InputDef { isStatic = false; } + InputDef(InputType type, + string nameIn, + MatrixPtr selfDefinedData, + std::vector selfDefinedSeqStartPos = {}) + : labelSeqStartPositions(selfDefinedSeqStartPos), + selfDefinedData(selfDefinedData) { + inputType = type; + name = nameIn; + dim = 0; + sparse = {""}; + paraSize = 0; + isStatic = false; + } + InputDef(InputType type, string nameIn, size_t dimIn, diff --git a/paddle/gserver/tests/test_DetectionOutput.cpp b/paddle/gserver/tests/test_DetectionOutput.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8ec7a2845028a62c7656e21e2109000ee0af8c6a --- /dev/null +++ b/paddle/gserver/tests/test_DetectionOutput.cpp @@ -0,0 +1,191 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include + +#include "LayerGradUtil.h" +#include "paddle/testing/TestUtil.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +// Do one forward pass of priorBox layer and check to see if its output +// matches the given result +void doOneDetectionOutputTest(MatrixPtr& inputLoc, + MatrixPtr& inputConf, + MatrixPtr& inputPriorBox, + size_t feature_map_width, + size_t feature_map_height, + real nms_threshold, + bool use_gpu, + MatrixPtr& result) { + // Setting up the detection output layer + TestConfig configt; + configt.layerConfig.set_type("detection_output"); + LayerInputConfig* input = configt.layerConfig.add_inputs(); + configt.layerConfig.add_inputs(); + configt.layerConfig.add_inputs(); + + DetectionOutputConfig* detOutput = input->mutable_detection_output_conf(); + detOutput->set_width(feature_map_width); + detOutput->set_height(feature_map_height); + detOutput->set_nms_threshold(nms_threshold); + detOutput->set_num_classes(2); + detOutput->set_nms_top_k(20); + detOutput->set_keep_top_k(10); + detOutput->set_background_id(0); + detOutput->set_confidence_threshold(0.01); + detOutput->set_input_num(1); + configt.inputDefs.push_back({INPUT_DATA_TARGET, "priorbox", 32, 0}); + configt.inputDefs.push_back({INPUT_DATA, "input_loc", 16, 0}); + configt.inputDefs.push_back({INPUT_DATA, "input_conf", 8, 0}); + + // data layer initialize + std::vector dataLayers; + LayerMap layerMap; + vector datas; + initDataLayer( + configt, &dataLayers, &datas, &layerMap, "priorbox", 1, false, use_gpu); + + dataLayers[0]->getOutputValue()->copyFrom(*inputPriorBox); + dataLayers[1]->getOutputValue()->copyFrom(*inputLoc); + dataLayers[2]->getOutputValue()->copyFrom(*inputConf); + + // test layer initialize + std::vector parameters; + LayerPtr detectionOutputLayer; + initTestLayer(configt, &layerMap, ¶meters, &detectionOutputLayer); + detectionOutputLayer->forward(PASS_GC); + checkMatrixEqual(detectionOutputLayer->getOutputValue(), result); +} + +TEST(Layer, detectionOutputLayerFwd) { + bool useGpu = false; + // CPU case 1. + MatrixPtr inputLoc; + MatrixPtr inputConf; + MatrixPtr inputPriorBox; + MatrixPtr result, result2, result3, result4; + real nmsTreshold = 0.01; + real inputLocData[] = {0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1}; + real inputConfData[] = {0.1, 0.9, 0.2, 0.8, 0.3, 0.7, 0.4, 0.6}; + real inputPriorBoxData[] = {0.1, 0.1, 0.5, 0.5, 0.1, 0.1, 0.2, 0.2, + 0.2, 0.2, 0.6, 0.6, 0.1, 0.1, 0.2, 0.2, + 0.3, 0.3, 0.7, 0.7, 0.1, 0.1, 0.2, 0.2, + 0.4, 0.4, 0.8, 0.8, 0.1, 0.1, 0.2, 0.2}; + real resultData[] = { + 0, 1, 0.68997443, 0.099959746, 0.099959746, 0.50804031, 0.50804031}; + inputLoc = Matrix::create(1, 16, false, useGpu); + inputConf = Matrix::create(1, 8, false, useGpu); + inputPriorBox = Matrix::create(1, 32, false, useGpu); + result = Matrix::create(1, 7, false, useGpu); + inputLoc->setData(inputLocData); + inputConf->setData(inputConfData); + inputPriorBox->setData(inputPriorBoxData); + result->setData(resultData); + doOneDetectionOutputTest(inputLoc, + inputConf, + inputPriorBox, + /* feature_map_width */ 1, + /* feature_map_height */ 1, + nmsTreshold, + useGpu, + result); + + // CPU case 2. 
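+ // With a looser NMS threshold (0.2), a second partially-overlapping
+ // detection survives suppression, so two rows are expected in the output.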
+ nmsTreshold = 0.2; + result2 = Matrix::create(2, 7, false, useGpu); + real resultData2[] = {0, + 1, + 0.68997443, + 0.099959746, + 0.099959746, + 0.50804031, + 0.50804031, + 0, + 1, + 0.59868765, + 0.29995975, + 0.29995975, + 0.70804024, + 0.70804024}; + result2->setData(resultData2); + doOneDetectionOutputTest(inputLoc, + inputConf, + inputPriorBox, + /* feature_map_width */ 1, + /* feature_map_height */ 1, + nmsTreshold, + useGpu, + result2); + +#ifndef PADDLE_ONLY_CPU + // GPU case 1. + useGpu = true; + inputLoc = Matrix::create(1, 16, false, useGpu); + inputConf = Matrix::create(1, 8, false, useGpu); + inputPriorBox = Matrix::create(1, 32, false, useGpu); + inputLoc->copyFrom(inputLocData, 16); + inputConf->copyFrom(inputConfData, 8); + inputPriorBox->copyFrom(inputPriorBoxData, 32); + + nmsTreshold = 0.01; + result3 = Matrix::create(1, 7, false, useGpu); + result3->copyFrom(resultData, 7); + doOneDetectionOutputTest(inputLoc, + inputConf, + inputPriorBox, + /* feature_map_width */ 1, + /* feature_map_height */ 1, + nmsTreshold, + useGpu, + result3); + + // GPU case 2. + nmsTreshold = 0.2; + result4 = Matrix::create(2, 7, false, useGpu); + result4->copyFrom(resultData2, 14); + doOneDetectionOutputTest(inputLoc, + inputConf, + inputPriorBox, + /* feature_map_width */ 1, + /* feature_map_height */ 1, + nmsTreshold, + useGpu, + result4); +#endif +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 6adffcf53b7966bd6f3d02970e5f07cc9802f469..9c79bd19ee095fdad83122966fcd4d1c81a8f36a 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -1689,6 +1689,70 @@ TEST(Layer, smooth_l1) { } } +TEST(Layer, multibox_loss) { + TestConfig config; + config.layerConfig.set_type("multibox_loss"); + config.biasSize = 0; + LayerInputConfig* input = config.layerConfig.add_inputs(); + MultiBoxLossConfig* multiboxLoss = input->mutable_multibox_loss_conf(); + multiboxLoss->set_num_classes(21); + multiboxLoss->set_input_num(1); + multiboxLoss->set_overlap_threshold(0.5); + multiboxLoss->set_neg_pos_ratio(3); + multiboxLoss->set_neg_overlap(0.5); + multiboxLoss->set_background_id(0); + multiboxLoss->set_height(3); + multiboxLoss->set_width(3); + + size_t gtNum = 1; + MatrixPtr labelValue = Matrix::create(gtNum, 6, false, false); + labelValue->randomizeUniform(); + labelValue->add(-0.5); + labelValue->sigmoid(*labelValue); + real* labelData = labelValue->getData(); + size_t labelWidth = labelValue->getWidth(); + for (size_t i = 0; i < gtNum; ++i) { + *(labelData + i * labelWidth) = std::rand() % 20 + 1; + *(labelData + i * labelWidth + 1) = 0.400259; + *(labelData + i * labelWidth + 2) = 0.377857; + *(labelData + i * labelWidth + 3) = 0.525712; + *(labelData + i * labelWidth + 4) = 0.519368; + } + vector seqStartPositions(gtNum + 1, 0); + for (size_t i = 1; i <= gtNum; ++i) { + seqStartPositions[i] = i; + } + + // Ensure at lease one matched bbox + MatrixPtr priorValue = Matrix::create(1, 72, false, false); + priorValue->randomizeUniform(); + priorValue->add(-0.5); + priorValue->sigmoid(*priorValue); + real* priorData = priorValue->getData(); + *(priorData) = 0.424811; + *(priorData + 1) = 0.397059; + *(priorData + 2) = 0.538905; + *(priorData + 3) = 0.447091; + *(priorData + 4) = 0.425720; + *(priorData + 5) = 0.515228; + *(priorData + 6) = 0.519452; + *(priorData + 7) = 0.591065; + + 
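+ // The input order must match MultiBoxLossLayer: priorbox, label, then the
+ // location and confidence predictions.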
config.inputDefs.push_back( + {INPUT_SELF_DEFINE_DATA, "priorbox", priorValue, {}}); + config.inputDefs.push_back( + {INPUT_SELF_DEFINE_DATA, "label", labelValue, seqStartPositions}); + config.inputDefs.push_back({INPUT_DATA, "locPred", 36, 0}); + config.inputDefs.push_back({INPUT_DATA, "confPred", 189, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "multibox_loss", 1, false, useGpu, false); + } +} + TEST(Layer, TransLayer) { TestConfig config; const int height = 128; diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto index 29270829bbc3af6990aaf03a5228ef7f6a892a5c..3d01c23bf96e726e0bb8fb04d95947cfd2650d37 100644 --- a/proto/ModelConfig.proto +++ b/proto/ModelConfig.proto @@ -266,6 +266,29 @@ message PadConfig { repeated uint32 pad_w = 4; } +message MultiBoxLossConfig { + required uint32 num_classes = 1; + required float overlap_threshold = 2; + required float neg_pos_ratio = 3; + required float neg_overlap = 4; + required uint32 background_id = 5; + required uint32 input_num = 6; + optional uint32 height = 7 [default = 1]; + optional uint32 width = 8 [default = 1]; +} + +message DetectionOutputConfig { + required uint32 num_classes = 1; + required float nms_threshold = 2; + required uint32 nms_top_k = 3; + required uint32 background_id = 4; + required uint32 input_num = 5; + required uint32 keep_top_k = 6; + required float confidence_threshold = 7; + optional uint32 height = 8 [default = 1]; + optional uint32 width = 9 [default = 1]; +} + message LayerInputConfig { required string input_layer_name = 1; optional string input_parameter_name = 2; @@ -284,6 +307,8 @@ message LayerInputConfig { optional PriorBoxConfig priorbox_conf = 13; optional PadConfig pad_conf = 14; optional RowConvConfig row_conv_conf = 15; + optional MultiBoxLossConfig multibox_loss_conf = 16; + optional DetectionOutputConfig detection_output_conf = 17; } message LayerConfig { diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index fc2e3bbcde0e94b6325bd0ca1fd41e088df0b950..c46b335d9927fc6376bef68d918d0db01a1a7747 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -1676,6 +1676,52 @@ class PriorBoxLayer(LayerBase): self.config.size = size +@config_layer('multibox_loss') +class MultiBoxLossLayer(LayerBase): + def __init__(self, name, inputs, input_num, num_classes, overlap_threshold, + neg_pos_ratio, neg_overlap, background_id): + super(MultiBoxLossLayer, self).__init__(name, 'multibox_loss', 0, + inputs) + config_assert( + len(inputs) == (input_num * 2 + 2), + 'MultiBoxLossLayer does not have enough inputs') + config_assert(num_classes > background_id, + 'Classes number must greater than background ID') + self.config.inputs[0].multibox_loss_conf.num_classes = num_classes + self.config.inputs[ + 0].multibox_loss_conf.overlap_threshold = overlap_threshold + self.config.inputs[0].multibox_loss_conf.neg_pos_ratio = neg_pos_ratio + self.config.inputs[0].multibox_loss_conf.neg_overlap = neg_overlap + self.config.inputs[0].multibox_loss_conf.background_id = background_id + self.config.inputs[0].multibox_loss_conf.input_num = input_num + self.config.size = 1 + + +@config_layer('detection_output') +class DetectionOutputLayer(LayerBase): + def __init__(self, name, inputs, size, input_num, num_classes, + nms_threshold, nms_top_k, keep_top_k, confidence_threshold, + background_id): + 
super(DetectionOutputLayer, self).__init__(name, 'detection_output', 0, + inputs) + config_assert( + len(inputs) == (input_num * 2 + 1), + 'DetectionOutputLayer does not have enough inputs') + config_assert(num_classes > background_id, + 'Classes number must greater than background ID') + self.config.inputs[0].detection_output_conf.num_classes = num_classes + self.config.inputs[ + 0].detection_output_conf.nms_threshold = nms_threshold + self.config.inputs[0].detection_output_conf.nms_top_k = nms_top_k + self.config.inputs[0].detection_output_conf.keep_top_k = keep_top_k + self.config.inputs[ + 0].detection_output_conf.confidence_threshold = confidence_threshold + self.config.inputs[ + 0].detection_output_conf.background_id = background_id + self.config.inputs[0].detection_output_conf.input_num = input_num + self.config.size = size + + @config_layer('data') class DataLayer(LayerBase): def __init__(self, name, size, height=None, width=None, device=None): diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 2d8ddbb9007b241eb1986887d8ea6c2de8235c29..770559dc770254369c67db35bd1c36e1ce8f329a 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -115,6 +115,8 @@ __all__ = [ 'print_layer', 'priorbox_layer', 'cross_channel_norm_layer', + 'multibox_loss_layer', + 'detection_output_layer', 'spp_layer', 'pad_layer', 'eos_layer', @@ -195,6 +197,8 @@ class LayerType(object): PRINT_LAYER = 'print' PRIORBOX_LAYER = 'priorbox' + MULTIBOX_LOSS_LAYER = 'multibox_loss' + DETECTION_OUTPUT_LAYER = 'detection_output' CTC_LAYER = 'ctc' WARP_CTC_LAYER = 'warp_ctc' @@ -1052,6 +1056,163 @@ def priorbox_layer(input, size=size) +@wrap_name_default("multibox_loss") +def multibox_loss_layer(input_loc, + input_conf, + priorbox, + label, + num_classes, + overlap_threshold=0.5, + neg_pos_ratio=3.0, + neg_overlap=0.5, + background_id=0, + name=None): + """ + Compute the location loss and the confidence loss for ssd. + + :param name: The Layer Name. + :type name: basestring + :param input_loc: The input predict location. + :type input_loc: LayerOutput + :param input_conf: The input priorbox confidence. + :type input_conf: LayerOutput + :param priorbox: The input priorbox location and the variance. + :type priorbox: LayerOutput + :param label: The input label. + :type label: LayerOutput + :param num_classes: The number of the classification. + :type num_classes: int + :param overlap_threshold: The threshold of the overlap. + :type overlap_threshold: float + :param neg_pos_ratio: The ratio of the negative bbox to the positive bbox. + :type neg_pos_ratio: float + :param neg_overlap: The negative bbox overlap threshold. + :type neg_overlap: float + :param background_id: The background class index. + :type background_id: int + :return: LayerOutput + """ + input_loc_num = 0 + input_conf_num = 0 + + if isinstance(input_loc, LayerOutput): + input_loc = [input_loc] + assert isinstance(input_loc, collections.Sequence) # list or tuple + for each in input_loc: + assert isinstance(each, LayerOutput) + input_loc_num += 1 + + if isinstance(input_conf, LayerOutput): + input_conf = [input_conf] + assert isinstance(input_conf, collections.Sequence) # list or tuple + for each in input_conf: + assert isinstance(each, LayerOutput) + input_conf_num += 1 + # Check the input layer number. 
+ assert input_loc_num == input_conf_num + + inputs = [priorbox.name, label.name] + inputs.extend([l.name for l in input_loc]) + inputs.extend([l.name for l in input_conf]) + parents = [priorbox, label] + parents.extend(input_loc) + parents.extend(input_conf) + + Layer( + name=name, + type=LayerType.MULTIBOX_LOSS_LAYER, + inputs=inputs, + input_num=input_loc_num, + num_classes=num_classes, + overlap_threshold=overlap_threshold, + neg_pos_ratio=neg_pos_ratio, + neg_overlap=neg_overlap, + background_id=background_id) + return LayerOutput( + name, LayerType.MULTIBOX_LOSS_LAYER, parents=parents, size=1) + + +@wrap_name_default("detection_output") +def detection_output_layer(input_loc, + input_conf, + priorbox, + num_classes, + nms_threshold=0.45, + nms_top_k=400, + keep_top_k=200, + confidence_threshold=0.01, + background_id=0, + name=None): + """ + Apply the NMS to the output of network and compute the predict bounding + box location. + + :param name: The Layer Name. + :type name: basestring + :param input_loc: The input predict location. + :type input_loc: LayerOutput + :param input_conf: The input priorbox confidence. + :type input_conf: LayerOutput + :param priorbox: The input priorbox location and the variance. + :type priorbox: LayerOutput + :param num_classes: The number of the classification. + :type num_classes: int + :param nms_threshold: The Non-maximum suppression threshold. + :type nms_threshold: float + :param nms_top_k: The bbox number kept of the NMS's output + :type nms_top_k: int + :param keep_top_k: The bbox number kept of the layer's output + :type keep_top_k: int + :param confidence_threshold: The classification confidence threshold + :type confidence_threshold: float + :param background_id: The background class index. + :type background_id: int + :return: LayerOutput + """ + input_loc_num = 0 + input_conf_num = 0 + + if isinstance(input_loc, LayerOutput): + input_loc = [input_loc] + assert isinstance(input_loc, collections.Sequence) # list or tuple + for each in input_loc: + assert isinstance(each, LayerOutput) + input_loc_num += 1 + + if isinstance(input_conf, LayerOutput): + input_conf = [input_conf] + assert isinstance(input_conf, collections.Sequence) # list or tuple + for each in input_conf: + assert isinstance(each, LayerOutput) + input_conf_num += 1 + # Check the input layer number. + assert input_loc_num == input_conf_num + + inputs = [priorbox.name] + inputs.extend([l.name for l in input_loc]) + inputs.extend([l.name for l in input_conf]) + parents = [priorbox] + parents.extend(input_loc) + parents.extend(input_conf) + + size = keep_top_k * 7 + + Layer( + name=name, + type=LayerType.DETECTION_OUTPUT_LAYER, + inputs=inputs, + size=size, + input_num=input_loc_num, + num_classes=num_classes, + nms_threshold=nms_threshold, + nms_top_k=nms_top_k, + keep_top_k=keep_top_k, + confidence_threshold=confidence_threshold, + background_id=background_id) + return LayerOutput( + name, LayerType.DETECTION_OUTPUT_LAYER, parents=parents, size=size) + + @wrap_name_default("cross_channel_norm") def cross_channel_norm_layer(input, name=None, param_attr=None): """