From 65969dad641a95a1ac0f744b11c1166a173d169b Mon Sep 17 00:00:00 2001 From: yangyaming Date: Fri, 16 Jun 2017 16:29:08 +0800 Subject: [PATCH] Add DetectionOutputLayer and MultiBoxLossLayer. --- .../gserver/layers/DetectionOutputLayer.cpp | 154 ++++++++ paddle/gserver/layers/DetectionOutputLayer.h | 81 ++++ paddle/gserver/layers/MultiBoxLossLayer.cpp | 365 ++++++++++++++++++ paddle/gserver/layers/MultiBoxLossLayer.h | 103 +++++ paddle/gserver/tests/CMakeLists.txt | 7 + paddle/gserver/tests/LayerGradUtil.cpp | 25 ++ paddle/gserver/tests/LayerGradUtil.h | 18 +- paddle/gserver/tests/test_DetectionOutput.cpp | 191 +++++++++ paddle/gserver/tests/test_LayerGrad.cpp | 64 +++ proto/ModelConfig.proto | 25 ++ python/paddle/trainer/config_parser.py | 46 +++ .../paddle/trainer_config_helpers/layers.py | 161 ++++++++ 12 files changed, 1239 insertions(+), 1 deletion(-) create mode 100644 paddle/gserver/layers/DetectionOutputLayer.cpp create mode 100644 paddle/gserver/layers/DetectionOutputLayer.h create mode 100644 paddle/gserver/layers/MultiBoxLossLayer.cpp create mode 100644 paddle/gserver/layers/MultiBoxLossLayer.h create mode 100644 paddle/gserver/tests/test_DetectionOutput.cpp diff --git a/paddle/gserver/layers/DetectionOutputLayer.cpp b/paddle/gserver/layers/DetectionOutputLayer.cpp new file mode 100644 index 00000000000..2a4d7f8b5b2 --- /dev/null +++ b/paddle/gserver/layers/DetectionOutputLayer.cpp @@ -0,0 +1,154 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "DetectionOutputLayer.h" + +namespace paddle { + +REGISTER_LAYER(detection_output, DetectionOutputLayer); + +bool DetectionOutputLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + auto& layerConf = config_.inputs(0).detection_output_conf(); + numClasses_ = layerConf.num_classes(); + inputNum_ = layerConf.input_num(); + nmsThreshold_ = layerConf.nms_threshold(); + confidenceThreshold_ = layerConf.confidence_threshold(); + nmsTopK_ = layerConf.nms_top_k(); + keepTopK_ = layerConf.keep_top_k(); + backgroundId_ = layerConf.background_id(); + return true; +} + +void DetectionOutputLayer::forward(PassType passType) { + Layer::forward(passType); + size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight(); + + locSizeSum_ = 0; + confSizeSum_ = 0; + for (size_t n = 0; n < inputNum_; ++n) { + const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n)); + const MatrixPtr inConf = getInputValue(*getConfInputLayer(n)); + locSizeSum_ += inLoc->getElementCnt(); + confSizeSum_ += inConf->getElementCnt(); + } + + Matrix::resizeOrCreate(locTmpBuffer_, 1, locSizeSum_, false, useGpu_); + Matrix::resizeOrCreate( + confTmpBuffer_, confSizeSum_ / numClasses_, numClasses_, false, useGpu_); + locBuffer_ = locTmpBuffer_; + confBuffer_ = confTmpBuffer_; + + size_t locOffset = 0; + size_t confOffset = 0; + auto& layerConf = config_.inputs(0).detection_output_conf(); + for (size_t n = 0; n < inputNum_; ++n) { + const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n)); + const MatrixPtr inConf = getInputValue(*getConfInputLayer(n)); + + size_t height = getInput(*getLocInputLayer(n)).getFrameHeight(); + if (!height) height = layerConf.height(); + size_t width = getInput(*getLocInputLayer(n)).getFrameWidth(); + if (!width) width = layerConf.width(); + locOffset += appendWithPermute(*inLoc, + height, + 
width, + locSizeSum_, + locOffset, + batchSize, + *locBuffer_, + kNCHWToNHWC); + confOffset += appendWithPermute(*inConf, + height, + width, + confSizeSum_, + confOffset, + batchSize, + *confBuffer_, + kNCHWToNHWC); + } + CHECK_EQ(locOffset, locSizeSum_ / batchSize); + CHECK_EQ(confOffset, confSizeSum_ / batchSize); + + MatrixPtr priorValue; + if (useGpu_) { + Matrix::resizeOrCreate(locCpuBuffer_, 1, locSizeSum_, false, false); + Matrix::resizeOrCreate( + confCpuBuffer_, confSizeSum_ / numClasses_, numClasses_, false, false); + MatrixPtr priorTmpValue = getInputValue(*getPriorBoxLayer()); + Matrix::resizeOrCreate( + priorCpuValue_, 1, priorTmpValue->getElementCnt(), false, false); + + locCpuBuffer_->copyFrom(*locTmpBuffer_); + confCpuBuffer_->copyFrom(*confTmpBuffer_); + priorCpuValue_->copyFrom(*priorTmpValue); + + locBuffer_ = locCpuBuffer_; + confBuffer_ = confCpuBuffer_; + priorValue = priorCpuValue_; + } else { + priorValue = getInputValue(*getPriorBoxLayer()); + } + confBuffer_->softmax(*confBuffer_); + + size_t numPriors = priorValue->getElementCnt() / 8; + vector> allDecodedBBoxes; + for (size_t n = 0; n < batchSize; ++n) { + vector decodedBBoxes; + for (size_t i = 0; i < numPriors; ++i) { + size_t priorOffset = i * 8; + size_t locPredOffset = n * numPriors * 4 + i * 4; + vector priorBBoxVec; + getBBoxFromPriorData( + priorValue->getData() + priorOffset, 1, priorBBoxVec); + vector> priorBBoxVar; + getBBoxVarFromPriorData( + priorValue->getData() + priorOffset, 1, priorBBoxVar); + vector locPredData; + for (size_t j = 0; j < 4; ++j) + locPredData.push_back(*(locBuffer_->getData() + locPredOffset + j)); + NormalizedBBox bbox = + decodeBBoxWithVar(priorBBoxVec[0], priorBBoxVar[0], locPredData); + decodedBBoxes.push_back(bbox); + } + allDecodedBBoxes.push_back(decodedBBoxes); + } + + vector>> allIndices; + size_t numKept = getDetectionIndices(confBuffer_->getData(), + numPriors, + numClasses_, + backgroundId_, + batchSize, + confidenceThreshold_, + nmsTopK_, + 
nmsThreshold_, + keepTopK_, + allDecodedBBoxes, + &allIndices); + + resetOutput(numKept, 7); + MatrixPtr outV = getOutputValue(); + getDetectionOutput(confBuffer_->getData(), + numKept, + numPriors, + numClasses_, + batchSize, + allIndices, + allDecodedBBoxes, + *outV); +} + +} // namespace paddle diff --git a/paddle/gserver/layers/DetectionOutputLayer.h b/paddle/gserver/layers/DetectionOutputLayer.h new file mode 100644 index 00000000000..38271cb0540 --- /dev/null +++ b/paddle/gserver/layers/DetectionOutputLayer.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "DetectionUtil.h" +#include "Layer.h" + +using std::vector; +using std::map; +using std::pair; + +namespace paddle { + +/** + * The detection output layer for an SSD detection task. This layer applies + * non-maximum suppression to all predicted bounding boxes and keeps the + * top-K bounding boxes. + * - Input: This layer needs three input layers: the first input layer + * is the priorbox layer. The remaining two input layers are convolution + * layers for generating the bbox location offsets and the classification + * confidences. + * - Output: The predicted bounding box locations.
+ */ + +class DetectionOutputLayer : public Layer { +public: + explicit DetectionOutputLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + + void forward(PassType passType); + + void backward(const UpdateCallback& callback = nullptr) {} + +protected: + inline LayerPtr getPriorBoxLayer() { return inputLayers_[0]; } + + inline LayerPtr getLocInputLayer(size_t index) { + return inputLayers_[1 + index]; + } + + inline LayerPtr getConfInputLayer(size_t index) { + return inputLayers_[1 + inputNum_ + index]; + } + +private: + size_t numClasses_; // number of classes + size_t inputNum_; // number of input layers + real nmsThreshold_; + real confidenceThreshold_; + size_t nmsTopK_; + size_t keepTopK_; + size_t backgroundId_; + + size_t locSizeSum_; + size_t confSizeSum_; + + MatrixPtr locBuffer_; + MatrixPtr confBuffer_; + MatrixPtr locTmpBuffer_; + MatrixPtr confTmpBuffer_; + MatrixPtr priorCpuValue_; + MatrixPtr locCpuBuffer_; + MatrixPtr confCpuBuffer_; +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/MultiBoxLossLayer.cpp b/paddle/gserver/layers/MultiBoxLossLayer.cpp new file mode 100644 index 00000000000..27a2cc3fa4a --- /dev/null +++ b/paddle/gserver/layers/MultiBoxLossLayer.cpp @@ -0,0 +1,365 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "MultiBoxLossLayer.h" +#include +#include +#include "DataLayer.h" + +using std::vector; +using std::map; +using std::pair; + +namespace paddle { + +REGISTER_LAYER(multibox_loss, MultiBoxLossLayer); + +bool MultiBoxLossLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + + auto layerConf = config_.inputs(0).multibox_loss_conf(); + numClasses_ = layerConf.num_classes(); + inputNum_ = layerConf.input_num(); + overlapThreshold_ = layerConf.overlap_threshold(); + negPosRatio_ = layerConf.neg_pos_ratio(); + negOverlap_ = layerConf.neg_overlap(); + backgroundId_ = layerConf.background_id(); + return true; +} + +void MultiBoxLossLayer::forward(PassType passType) { + Layer::forward(passType); + size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight(); + resetOutput(batchSize, 1); + + // all location data and confidence score data + locSizeSum_ = 0; + confSizeSum_ = 0; + for (size_t n = 0; n < inputNum_; ++n) { + const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n)); + const MatrixPtr inConf = getInputValue(*getConfInputLayer(n)); + locSizeSum_ += inLoc->getElementCnt(); + confSizeSum_ += inConf->getElementCnt(); + } + + // locBuffer layout: + // | xmin1 | ymin1 | xmax1 | ymax1 | xmin2 ...... + Matrix::resizeOrCreate(locTmpBuffer_, 1, locSizeSum_, false, useGpu_); + locBuffer_ = locTmpBuffer_; + + // confBuffer layout: + // | class1 score | class2 score | ... |classN score | class1 score | ...... 
+ Matrix::resizeOrCreate(confTmpBuffer_, 1, confSizeSum_, false, useGpu_); + confBuffer_ = confTmpBuffer_; + + // concatenate location data and confidence score data + size_t locOffset = 0; + size_t confOffset = 0; + auto& layerConf = config_.inputs(0).multibox_loss_conf(); + for (size_t n = 0; n < inputNum_; ++n) { + const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n)); + const MatrixPtr inConf = getInputValue(*getConfInputLayer(n)); + size_t height = getInput(*getLocInputLayer(n)).getFrameHeight(); + if (!height) height = layerConf.height(); + size_t width = getInput(*getLocInputLayer(n)).getFrameWidth(); + if (!width) width = layerConf.width(); + locOffset += appendWithPermute(*inLoc, + height, + width, + locSizeSum_, + locOffset, + batchSize, + *locBuffer_, + kNCHWToNHWC); + confOffset += appendWithPermute(*inConf, + height, + width, + confSizeSum_, + confOffset, + batchSize, + *confBuffer_, + kNCHWToNHWC); + } + CHECK_EQ(locOffset, locSizeSum_ / batchSize); + CHECK_EQ(confOffset, confSizeSum_ / batchSize); + + // priorValue layout: + // | xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var + // | xmin2 | ...... + MatrixPtr priorValue; + + // labelValue layout: + // | class1_1 | xmin1_1 | ymin1_1 | xmax1_1 | ymax1_1 | difficult1_1 | ......
+ MatrixPtr labelValue; + + // Copy data from GPU to CPU if use GPU + if (useGpu_) { + Matrix::resizeOrCreate(locCpuBuffer_, 1, locSizeSum_, false, false); + Matrix::resizeOrCreate(confCpuBuffer_, 1, confSizeSum_, false, false); + MatrixPtr priorTmpValue = getInputValue(*getPriorBoxLayer()); + Matrix::resizeOrCreate( + priorCpuValue_, 1, priorTmpValue->getElementCnt(), false, false); + MatrixPtr labelTmpValue = getInputValue(*getLabelLayer()); + Matrix::resizeOrCreate(labelCpuValue_, + labelTmpValue->getHeight(), + labelTmpValue->getWidth(), + false, + false); + + locCpuBuffer_->copyFrom(*locTmpBuffer_); + confCpuBuffer_->copyFrom(*confTmpBuffer_); + priorCpuValue_->copyFrom(*priorTmpValue); + labelCpuValue_->copyFrom(*labelTmpValue); + + locBuffer_ = locCpuBuffer_; + confBuffer_ = confCpuBuffer_; + priorValue = priorCpuValue_; + labelValue = labelCpuValue_; + } else { + priorValue = getInputValue(*getPriorBoxLayer()); + labelValue = getInputValue(*getLabelLayer()); + } + + // Get max scores for each prior bbox. 
Used in negative mining + vector> allMaxConfScore; + numPriors_ = priorValue->getElementCnt() / 8; + getMaxConfidenceScores(confBuffer_->getData(), + batchSize, + numPriors_, + numClasses_, + backgroundId_, + &allMaxConfScore); + + // Match prior bbox to groundtruth bbox + Argument label = getInput(*getLabelLayer()); + const int* labelIndex = label.sequenceStartPositions->getData(false); + size_t seqNum = label.getNumSequences(); + numMatches_ = 0; + numNegs_ = 0; + allMatchIndices_.clear(); + allNegIndices_.clear(); + + pair retPair = generateMatchIndices(*priorValue, + numPriors_, + *labelValue, + labelIndex, + seqNum, + allMaxConfScore, + batchSize, + overlapThreshold_, + negOverlap_, + negPosRatio_, + &allMatchIndices_, + &allNegIndices_); + numMatches_ = retPair.first; + numNegs_ = retPair.second; + + // BBox location L1 smooth loss + locLoss_ = 0.0; + if (numMatches_ >= 1) { + size_t count = 0; + MatrixPtr locLossOutput; + Matrix::resizeOrCreate(locLossOutput, numMatches_ * 4, 1, false, false); + Matrix::resizeOrCreate(locGTData_, numMatches_ * 4, 1, false, false); + Matrix::resizeOrCreate(locDiff_, numMatches_ * 4, 1, false, false); + locDiff_->zeroMem(); + vector locGTData; + + for (size_t n = 0; n < batchSize; ++n) { + for (size_t i = 0; i < numPriors_; ++i) { + if (allMatchIndices_[n][i] == -1) continue; // match none + size_t locOffset = + n * (locBuffer_->getElementCnt() / batchSize) + i * 4; + locDiff_->getData()[count++] = (locBuffer_->getData() + locOffset)[0]; + locDiff_->getData()[count++] = (locBuffer_->getData() + locOffset)[1]; + locDiff_->getData()[count++] = (locBuffer_->getData() + locOffset)[2]; + locDiff_->getData()[count++] = (locBuffer_->getData() + locOffset)[3]; + + const int gtIdx = allMatchIndices_[n][i]; + size_t priorOffset = i * 8; + vector priorBBoxVec; + getBBoxFromPriorData( + priorValue->getData() + priorOffset, 1, priorBBoxVec); + vector> priorBBoxVar; + getBBoxVarFromPriorData( + priorValue->getData() + priorOffset, 1, 
priorBBoxVar); + size_t labelOffset = (labelIndex[n] + gtIdx) * 6; + vector gtBBoxVec; + getBBoxFromLabelData(labelValue->getData() + labelOffset, 1, gtBBoxVec); + vector gtEncode; + encodeBBoxWithVar( + priorBBoxVec[0], priorBBoxVar[0], gtBBoxVec[0], gtEncode); + locGTData.insert(locGTData.end(), gtEncode.begin(), gtEncode.end()); + } + } + locGTData_->copyFrom(&locGTData[0], numMatches_ * 4); + locLossOutput->smoothL1(*locDiff_, *locGTData_, 0.0); + locLoss_ = locLossOutput->getSum() / numMatches_; + } + + // BBox confidence softmax loss + confLoss_ = 0; + numConf_ = numMatches_ + numNegs_; + if (numConf_ >= 1) { + Matrix::resizeOrCreate(confProb_, numConf_, numClasses_, false, false); + IVector::resizeOrCreate(confGTData_, numConf_, false); + confProb_->zeroMem(); + size_t count = 0; + + vector confPredData; + for (size_t n = 0; n < batchSize; ++n) { + for (size_t i = 0; i < numPriors_; ++i) { + if (allMatchIndices_[n][i] == -1) continue; + size_t labelOffset = (labelIndex[n] + allMatchIndices_[n][i]) * 6; + const int gtLabel = (labelValue->getData() + labelOffset)[0]; + confGTData_->getData()[count] = gtLabel; + size_t confOffset = n * numPriors_ * numClasses_ + i * numClasses_; + for (size_t j = 0; j < numClasses_; ++j) { + confProb_->getData()[count * numClasses_ + j] = + (confBuffer_->getData() + confOffset)[j]; + confPredData.push_back((confBuffer_->getData() + confOffset)[j]); + } + ++count; + } + // Negative mining samples + for (size_t i = 0; i < allNegIndices_[n].size(); ++i) { + confGTData_->getData()[count] = backgroundId_; + size_t confOffset = + n * numPriors_ * numClasses_ + allNegIndices_[n][i] * numClasses_; + for (size_t j = 0; j < numClasses_; ++j) { + confProb_->getData()[count * numClasses_ + j] = + (confBuffer_->getData() + confOffset)[j]; + confPredData.push_back((confBuffer_->getData() + confOffset)[j]); + } + count++; + } + } + confProb_->softmax(*confProb_); + MatrixPtr confLossOutput; + Matrix::resizeOrCreate(confLossOutput, numConf_, 
1, false, false); + confLossOutput->oneHotCrossEntropy(*confProb_, *confGTData_); + confLoss_ = confLossOutput->getSum() / numMatches_; + } + real loss = locLoss_ + confLoss_; + MatrixPtr outV = getOutputValue(); + vector tmp(batchSize, loss); + outV->copyFrom(&tmp[0], batchSize); +} + +void MultiBoxLossLayer::backward(const UpdateCallback& callback) { + size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight(); + locBuffer_->zeroMem(); + confBuffer_->zeroMem(); + + // Back propagate on location prediction + if (numMatches_ >= 1) { + MatrixPtr locDiffBuffer; + Matrix::resizeOrCreate(locDiffBuffer, numMatches_ * 4, 1, false, false); + locDiffBuffer->smoothL1Bp(*locDiff_, *locGTData_, 0.0); + locDiff_->copyFrom(*locDiffBuffer); + // scale gradient + for (size_t i = 0; i < numMatches_ * 4; ++i) + locDiff_->getData()[i] *= (1. / numMatches_); + // Copy gradient back + size_t count = 0; + for (size_t n = 0; n < batchSize; ++n) + for (size_t i = 0; i < numPriors_; ++i) { + if (allMatchIndices_[n][i] == -1) continue; + real* locDiffData = locBuffer_->getData() + n * numPriors_ * 4 + i * 4; + locDiffData[0] = (locDiff_->getData() + count * 4)[0]; + locDiffData[1] = (locDiff_->getData() + count * 4)[1]; + locDiffData[2] = (locDiff_->getData() + count * 4)[2]; + locDiffData[3] = (locDiff_->getData() + count * 4)[3]; + ++count; + } + CHECK_EQ(count, numMatches_); + } + + if (numConf_ >= 1) { + for (size_t i = 0; i < numConf_; ++i) + confProb_->getData()[i * numClasses_ + confGTData_->getData()[i]] -= 1; + for (size_t i = 0; i < numConf_ * numClasses_; ++i) + confProb_->getData()[i] *= (1. 
/ numMatches_); + size_t count = 0; + for (size_t n = 0; n < batchSize; ++n) { + for (size_t i = 0; i < numPriors_; ++i) { + if (allMatchIndices_[n][i] == -1) continue; + real* confDiffData = confBuffer_->getData() + + n * numPriors_ * numClasses_ + i * numClasses_; + for (size_t j = 0; j < numClasses_; ++j) + confDiffData[j] = (confProb_->getData() + count * numClasses_)[j]; + ++count; + } + for (size_t i = 0; i < allNegIndices_[n].size(); ++i) { + int idx = allNegIndices_[n][i]; + real* confDiffData = confBuffer_->getData() + + n * numPriors_ * numClasses_ + idx * numClasses_; + for (size_t j = 0; j < numClasses_; ++j) + confDiffData[j] = (confProb_->getData() + count * numClasses_)[j]; + ++count; + } + } + CHECK_EQ(count, numConf_); + } + if (useGpu_) { + locTmpBuffer_->copyFrom(*locCpuBuffer_); + confTmpBuffer_->copyFrom(*confCpuBuffer_); + locBuffer_ = locTmpBuffer_; + confBuffer_ = confTmpBuffer_; + } + // copy back + size_t locOffset = 0; + size_t confOffset = 0; + auto layerConf = config_.inputs(0).multibox_loss_conf(); + for (size_t n = 0; n < inputNum_; ++n) { + const MatrixPtr inLocG = getInputGrad(*getLocInputLayer(n)); + const MatrixPtr inConfG = getInputGrad(*getConfInputLayer(n)); + size_t height = getInput(*getLocInputLayer(n)).getFrameHeight(); + if (!height) height = layerConf.height(); + size_t width = getInput(*getLocInputLayer(n)).getFrameWidth(); + if (!width) width = layerConf.width(); + + // NHWC to NCHW + MatrixPtr locGBuffer; + Matrix::resizeOrCreate( + locGBuffer, inLocG->getHeight(), inLocG->getWidth(), false, useGpu_); + MatrixPtr confGBuffer; + Matrix::resizeOrCreate( + confGBuffer, inConfG->getHeight(), inConfG->getWidth(), false, useGpu_); + + locOffset += decomposeWithPermute(*locBuffer_, + height, + width, + locSizeSum_, + locOffset, + batchSize, + *locGBuffer, + kNHWCToNCHW); + inLocG->add(*locGBuffer); + confOffset += decomposeWithPermute(*confBuffer_, + height, + width, + confSizeSum_, + confOffset, + batchSize, + *confGBuffer, 
+ kNHWCToNCHW); + inConfG->add(*confGBuffer); + } + CHECK_EQ(locOffset, locSizeSum_ / batchSize); + CHECK_EQ(confOffset, confSizeSum_ / batchSize); +} + +} // namespace paddle diff --git a/paddle/gserver/layers/MultiBoxLossLayer.h b/paddle/gserver/layers/MultiBoxLossLayer.h new file mode 100644 index 00000000000..9767fed7f1c --- /dev/null +++ b/paddle/gserver/layers/MultiBoxLossLayer.h @@ -0,0 +1,103 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "CostLayer.h" +#include "DataLayer.h" +#include "DetectionUtil.h" +#include "Layer.h" + +using std::vector; +using std::pair; + +namespace paddle { + +/** + * The multibox loss layer for an SSD detection task. + * The loss is composed of the location loss and the confidence loss. + * The location loss is a smooth L1 loss and the confidence loss is + * a softmax loss. + * - Input: This layer needs four input layers: the first input layer + * is the priorbox layer and the second layer is a label layer. + * The remaining two input layers are convolution layers for generating + * the bbox location offsets and the classification confidences. + * - Output: The Single Shot Multibox Detection loss value. + * Reference: + * Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, + * Cheng-Yang Fu, Alexander C. Berg.
SSD: Single Shot MultiBox Detector + */ + +class MultiBoxLossLayer : public CostLayer { +public: + explicit MultiBoxLossLayer(const LayerConfig& config) : CostLayer(config) {} + + bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + + void forward(PassType passType); + + void backward(const UpdateCallback& callback = nullptr); + + void forwardImp(Matrix& output, Argument& label, Matrix& cost) {} + + void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad) {} + +protected: + inline LayerPtr getPriorBoxLayer() { return inputLayers_[0]; } + inline LayerPtr getLabelLayer() { return inputLayers_[1]; } + inline LayerPtr getLocInputLayer(size_t index) { + return inputLayers_[2 + index]; + } + inline LayerPtr getConfInputLayer(size_t index) { + return inputLayers_[2 + inputNum_ + index]; + } + +protected: + size_t numClasses_; + real overlapThreshold_; + real negPosRatio_; + real negOverlap_; + size_t inputNum_; + size_t backgroundId_; + + real locLoss_; + real confLoss_; + + size_t numPriors_; + size_t numMatches_; + size_t numNegs_; + size_t numConf_; + size_t locSizeSum_; + size_t confSizeSum_; + + vector> allMatchIndices_; + vector> allNegIndices_; + MatrixPtr locGTData_; + IVectorPtr confGTData_; + + MatrixPtr locBuffer_; + MatrixPtr confBuffer_; + MatrixPtr locDiff_; + MatrixPtr confProb_; + + MatrixPtr labelCpuValue_; + MatrixPtr priorCpuValue_; + MatrixPtr locCpuBuffer_; + MatrixPtr confCpuBuffer_; + MatrixPtr locTmpBuffer_; + MatrixPtr confTmpBuffer_; +}; + +} // namespace paddle diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt index 3c4128b5b8a..92f6cbcfe5a 100644 --- a/paddle/gserver/tests/CMakeLists.txt +++ b/paddle/gserver/tests/CMakeLists.txt @@ -45,6 +45,13 @@ add_unittest_without_exec(test_PriorBox add_test(NAME test_PriorBox COMMAND test_PriorBox) +################# test_DetectionOutput ####################### +add_unittest_without_exec(test_DetectionOutput + 
test_DetectionOutput.cpp + LayerGradUtil.cpp) + +add_test(NAME test_DetectionOutput + COMMAND test_DetectionOutput) ################# test_ConvUnify ####################### add_unittest_without_exec(test_ConvUnify test_ConvUnify.cpp diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp index a0b1cd471dd..e3591ba4df8 100644 --- a/paddle/gserver/tests/LayerGradUtil.cpp +++ b/paddle/gserver/tests/LayerGradUtil.cpp @@ -387,6 +387,31 @@ void initDataLayer(TestConfig testConf, data.value->sigmoid(*data.value); data.grad->zeroMem(); break; + case INPUT_SELF_DEFINE_DATA: { + size_t height = testConf.inputDefs[i].selfDefinedData->getHeight(); + size_t width = testConf.inputDefs[i].selfDefinedData->getWidth(); + CHECK_GT(static_cast(height), 0); + CHECK_GT(static_cast(width), 0); + data.value = Matrix::create(height, width, false, useGpu); + data.grad = Matrix::create(height, width, false, useGpu); + data.value->copyFrom(*testConf.inputDefs[i].selfDefinedData); + data.grad->zeroMem(); + + const std::vector& labelSeqStartPositions = + testConf.inputDefs[i].labelSeqStartPositions; + if (labelSeqStartPositions.size() != 0) { + CHECK(!sequenceStartPositions); + CHECK_GE(static_cast(labelSeqStartPositions.size()), 2); + + sequenceStartPositions = + ICpuGpuVector::create(labelSeqStartPositions.size(), useGpu); + sequenceStartPositions->copyFrom(labelSeqStartPositions.data(), + labelSeqStartPositions.size(), + useGpu); + data.sequenceStartPositions = sequenceStartPositions; + } + break; + } default: LOG(FATAL) << " unknown inputType "; return; diff --git a/paddle/gserver/tests/LayerGradUtil.h b/paddle/gserver/tests/LayerGradUtil.h index 9f68eb64d0b..18a6525a145 100644 --- a/paddle/gserver/tests/LayerGradUtil.h +++ b/paddle/gserver/tests/LayerGradUtil.h @@ -31,7 +31,8 @@ enum InputType { INPUT_SEQUENCE_LABEL, INPUT_SPARSE_NON_VALUE_DATA, INPUT_SPARSE_FLOAT_VALUE_DATA, - INPUT_DENSE_DIM_DATA, // using sequence length to init dense data + 
INPUT_DENSE_DIM_DATA, // using sequence length to init dense data + INPUT_SELF_DEFINE_DATA, // support customizing for input value }; struct ParaSparse { @@ -66,6 +67,7 @@ struct InputDef { bool isStatic; std::vector labelInitValue; std::vector labelSeqStartPositions; + MatrixPtr selfDefinedData; InputDef(InputType type, string nameIn, size_t dimIn, size_t sizeIn) { inputType = type; @@ -76,6 +78,20 @@ struct InputDef { isStatic = false; } + InputDef(InputType type, + string nameIn, + MatrixPtr selfDefinedData, + std::vector selfDefinedSeqStartPos = {}) + : labelSeqStartPositions(selfDefinedSeqStartPos), + selfDefinedData(selfDefinedData) { + inputType = type; + name = nameIn; + dim = 0; + sparse = {""}; + paraSize = 0; + isStatic = false; + } + InputDef(InputType type, string nameIn, size_t dimIn, diff --git a/paddle/gserver/tests/test_DetectionOutput.cpp b/paddle/gserver/tests/test_DetectionOutput.cpp new file mode 100644 index 00000000000..8ec7a284502 --- /dev/null +++ b/paddle/gserver/tests/test_DetectionOutput.cpp @@ -0,0 +1,191 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include + +#include "LayerGradUtil.h" +#include "paddle/testing/TestUtil.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +// Do one forward pass of the detection output layer and check that its output +// matches the given result +void doOneDetectionOutputTest(MatrixPtr& inputLoc, + MatrixPtr& inputConf, + MatrixPtr& inputPriorBox, + size_t feature_map_width, + size_t feature_map_height, + real nms_threshold, + bool use_gpu, + MatrixPtr& result) { + // Setting up the detection output layer + TestConfig configt; + configt.layerConfig.set_type("detection_output"); + LayerInputConfig* input = configt.layerConfig.add_inputs(); + configt.layerConfig.add_inputs(); + configt.layerConfig.add_inputs(); + + DetectionOutputConfig* detOutput = input->mutable_detection_output_conf(); + detOutput->set_width(feature_map_width); + detOutput->set_height(feature_map_height); + detOutput->set_nms_threshold(nms_threshold); + detOutput->set_num_classes(2); + detOutput->set_nms_top_k(20); + detOutput->set_keep_top_k(10); + detOutput->set_background_id(0); + detOutput->set_confidence_threshold(0.01); + detOutput->set_input_num(1); + configt.inputDefs.push_back({INPUT_DATA_TARGET, "priorbox", 32, 0}); + configt.inputDefs.push_back({INPUT_DATA, "input_loc", 16, 0}); + configt.inputDefs.push_back({INPUT_DATA, "input_conf", 8, 0}); + + // data layer initialize + std::vector dataLayers; + LayerMap layerMap; + vector datas; + initDataLayer( + configt, &dataLayers, &datas, &layerMap, "priorbox", 1, false, use_gpu); + + dataLayers[0]->getOutputValue()->copyFrom(*inputPriorBox); + dataLayers[1]->getOutputValue()->copyFrom(*inputLoc); + dataLayers[2]->getOutputValue()->copyFrom(*inputConf); + + // test layer initialize + std::vector parameters; + LayerPtr detectionOutputLayer; + initTestLayer(configt, &layerMap, &parameters, &detectionOutputLayer); + detectionOutputLayer->forward(PASS_GC); + checkMatrixEqual(detectionOutputLayer->getOutputValue(), result);
+} + +TEST(Layer, detectionOutputLayerFwd) { + bool useGpu = false; + // CPU case 1. + MatrixPtr inputLoc; + MatrixPtr inputConf; + MatrixPtr inputPriorBox; + MatrixPtr result, result2, result3, result4; + real nmsTreshold = 0.01; + real inputLocData[] = {0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1}; + real inputConfData[] = {0.1, 0.9, 0.2, 0.8, 0.3, 0.7, 0.4, 0.6}; + real inputPriorBoxData[] = {0.1, 0.1, 0.5, 0.5, 0.1, 0.1, 0.2, 0.2, + 0.2, 0.2, 0.6, 0.6, 0.1, 0.1, 0.2, 0.2, + 0.3, 0.3, 0.7, 0.7, 0.1, 0.1, 0.2, 0.2, + 0.4, 0.4, 0.8, 0.8, 0.1, 0.1, 0.2, 0.2}; + real resultData[] = { + 0, 1, 0.68997443, 0.099959746, 0.099959746, 0.50804031, 0.50804031}; + inputLoc = Matrix::create(1, 16, false, useGpu); + inputConf = Matrix::create(1, 8, false, useGpu); + inputPriorBox = Matrix::create(1, 32, false, useGpu); + result = Matrix::create(1, 7, false, useGpu); + inputLoc->setData(inputLocData); + inputConf->setData(inputConfData); + inputPriorBox->setData(inputPriorBoxData); + result->setData(resultData); + doOneDetectionOutputTest(inputLoc, + inputConf, + inputPriorBox, + /* feature_map_width */ 1, + /* feature_map_height */ 1, + nmsTreshold, + useGpu, + result); + + // CPU case 2. + nmsTreshold = 0.2; + result2 = Matrix::create(2, 7, false, useGpu); + real resultData2[] = {0, + 1, + 0.68997443, + 0.099959746, + 0.099959746, + 0.50804031, + 0.50804031, + 0, + 1, + 0.59868765, + 0.29995975, + 0.29995975, + 0.70804024, + 0.70804024}; + result2->setData(resultData2); + doOneDetectionOutputTest(inputLoc, + inputConf, + inputPriorBox, + /* feature_map_width */ 1, + /* feature_map_height */ 1, + nmsTreshold, + useGpu, + result2); + +#ifndef PADDLE_ONLY_CPU + // GPU case 1. 
+ useGpu = true; + inputLoc = Matrix::create(1, 16, false, useGpu); + inputConf = Matrix::create(1, 8, false, useGpu); + inputPriorBox = Matrix::create(1, 32, false, useGpu); + inputLoc->copyFrom(inputLocData, 16); + inputConf->copyFrom(inputConfData, 8); + inputPriorBox->copyFrom(inputPriorBoxData, 32); + + nmsTreshold = 0.01; + result3 = Matrix::create(1, 7, false, useGpu); + result3->copyFrom(resultData, 7); + doOneDetectionOutputTest(inputLoc, + inputConf, + inputPriorBox, + /* feature_map_width */ 1, + /* feature_map_height */ 1, + nmsTreshold, + useGpu, + result3); + + // GPU case 2. + nmsTreshold = 0.2; + result4 = Matrix::create(2, 7, false, useGpu); + result4->copyFrom(resultData2, 14); + doOneDetectionOutputTest(inputLoc, + inputConf, + inputPriorBox, + /* feature_map_width */ 1, + /* feature_map_height */ 1, + nmsTreshold, + useGpu, + result4); +#endif +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 6adffcf53b7..9c79bd19ee0 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -1689,6 +1689,70 @@ TEST(Layer, smooth_l1) { } } +TEST(Layer, multibox_loss) { + TestConfig config; + config.layerConfig.set_type("multibox_loss"); + config.biasSize = 0; + LayerInputConfig* input = config.layerConfig.add_inputs(); + MultiBoxLossConfig* multiboxLoss = input->mutable_multibox_loss_conf(); + multiboxLoss->set_num_classes(21); + multiboxLoss->set_input_num(1); + multiboxLoss->set_overlap_threshold(0.5); + multiboxLoss->set_neg_pos_ratio(3); + multiboxLoss->set_neg_overlap(0.5); + multiboxLoss->set_background_id(0); + multiboxLoss->set_height(3); + multiboxLoss->set_width(3); + + size_t gtNum = 1; + MatrixPtr labelValue = Matrix::create(gtNum, 6, false, false); + labelValue->randomizeUniform(); + labelValue->add(-0.5); + 
labelValue->sigmoid(*labelValue); + real* labelData = labelValue->getData(); + size_t labelWidth = labelValue->getWidth(); + for (size_t i = 0; i < gtNum; ++i) { + *(labelData + i * labelWidth) = std::rand() % 20 + 1; + *(labelData + i * labelWidth + 1) = 0.400259; + *(labelData + i * labelWidth + 2) = 0.377857; + *(labelData + i * labelWidth + 3) = 0.525712; + *(labelData + i * labelWidth + 4) = 0.519368; + } + vector<int> seqStartPositions(gtNum + 1, 0); + for (size_t i = 1; i <= gtNum; ++i) { + seqStartPositions[i] = i; + } + + // Ensure at least one matched bbox + MatrixPtr priorValue = Matrix::create(1, 72, false, false); + priorValue->randomizeUniform(); + priorValue->add(-0.5); + priorValue->sigmoid(*priorValue); + real* priorData = priorValue->getData(); + *(priorData) = 0.424811; + *(priorData + 1) = 0.397059; + *(priorData + 2) = 0.538905; + *(priorData + 3) = 0.447091; + *(priorData + 4) = 0.425720; + *(priorData + 5) = 0.515228; + *(priorData + 6) = 0.519452; + *(priorData + 7) = 0.591065; + + config.inputDefs.push_back( + {INPUT_SELF_DEFINE_DATA, "priorbox", priorValue, {}}); + config.inputDefs.push_back( + {INPUT_SELF_DEFINE_DATA, "label", labelValue, seqStartPositions}); + config.inputDefs.push_back({INPUT_DATA, "locPred", 36, 0}); + config.inputDefs.push_back({INPUT_DATA, "confPred", 189, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "multibox_loss", 1, false, useGpu, false); + } +} + TEST(Layer, TransLayer) { TestConfig config; const int height = 128; diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto index 29270829bbc..3d01c23bf96 100644 --- a/proto/ModelConfig.proto +++ b/proto/ModelConfig.proto @@ -266,6 +266,29 @@ message PadConfig { repeated uint32 pad_w = 4; } +message MultiBoxLossConfig { + required uint32 num_classes = 1; + required float overlap_threshold = 2; + required float neg_pos_ratio = 3; + required 
float neg_overlap = 4; + required uint32 background_id = 5; + required uint32 input_num = 6; + optional uint32 height = 7 [default = 1]; + optional uint32 width = 8 [default = 1]; +} + +message DetectionOutputConfig { + required uint32 num_classes = 1; + required float nms_threshold = 2; + required uint32 nms_top_k = 3; + required uint32 background_id = 4; + required uint32 input_num = 5; + required uint32 keep_top_k = 6; + required float confidence_threshold = 7; + optional uint32 height = 8 [default = 1]; + optional uint32 width = 9 [default = 1]; +} + message LayerInputConfig { required string input_layer_name = 1; optional string input_parameter_name = 2; @@ -284,6 +307,8 @@ message LayerInputConfig { optional PriorBoxConfig priorbox_conf = 13; optional PadConfig pad_conf = 14; optional RowConvConfig row_conv_conf = 15; + optional MultiBoxLossConfig multibox_loss_conf = 16; + optional DetectionOutputConfig detection_output_conf = 17; } message LayerConfig { diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index fc2e3bbcde0..c46b335d992 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -1676,6 +1676,52 @@ class PriorBoxLayer(LayerBase): self.config.size = size +@config_layer('multibox_loss') +class MultiBoxLossLayer(LayerBase): + def __init__(self, name, inputs, input_num, num_classes, overlap_threshold, + neg_pos_ratio, neg_overlap, background_id): + super(MultiBoxLossLayer, self).__init__(name, 'multibox_loss', 0, + inputs) + config_assert( + len(inputs) == (input_num * 2 + 2), + 'MultiBoxLossLayer does not have enough inputs') + config_assert(num_classes > background_id, + 'Classes number must greater than background ID') + self.config.inputs[0].multibox_loss_conf.num_classes = num_classes + self.config.inputs[ + 0].multibox_loss_conf.overlap_threshold = overlap_threshold + self.config.inputs[0].multibox_loss_conf.neg_pos_ratio = neg_pos_ratio + 
self.config.inputs[0].multibox_loss_conf.neg_overlap = neg_overlap + self.config.inputs[0].multibox_loss_conf.background_id = background_id + self.config.inputs[0].multibox_loss_conf.input_num = input_num + self.config.size = 1 + + +@config_layer('detection_output') +class DetectionOutputLayer(LayerBase): + def __init__(self, name, inputs, size, input_num, num_classes, + nms_threshold, nms_top_k, keep_top_k, confidence_threshold, + background_id): + super(DetectionOutputLayer, self).__init__(name, 'detection_output', 0, + inputs) + config_assert( + len(inputs) == (input_num * 2 + 1), + 'DetectionOutputLayer does not have enough inputs') + config_assert(num_classes > background_id, + 'Classes number must greater than background ID') + self.config.inputs[0].detection_output_conf.num_classes = num_classes + self.config.inputs[ + 0].detection_output_conf.nms_threshold = nms_threshold + self.config.inputs[0].detection_output_conf.nms_top_k = nms_top_k + self.config.inputs[0].detection_output_conf.keep_top_k = keep_top_k + self.config.inputs[ + 0].detection_output_conf.confidence_threshold = confidence_threshold + self.config.inputs[ + 0].detection_output_conf.background_id = background_id + self.config.inputs[0].detection_output_conf.input_num = input_num + self.config.size = size + + @config_layer('data') class DataLayer(LayerBase): def __init__(self, name, size, height=None, width=None, device=None): diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 2d8ddbb9007..770559dc770 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -115,6 +115,8 @@ __all__ = [ 'print_layer', 'priorbox_layer', 'cross_channel_norm_layer', + 'multibox_loss_layer', + 'detection_output_layer', 'spp_layer', 'pad_layer', 'eos_layer', @@ -195,6 +197,8 @@ class LayerType(object): PRINT_LAYER = 'print' PRIORBOX_LAYER = 'priorbox' + MULTIBOX_LOSS_LAYER = 'multibox_loss' + 
DETECTION_OUTPUT_LAYER = 'detection_output' CTC_LAYER = 'ctc' WARP_CTC_LAYER = 'warp_ctc' @@ -1052,6 +1056,163 @@ def priorbox_layer(input, size=size) +@wrap_name_default("multibox_loss") +def multibox_loss_layer(input_loc, + input_conf, + priorbox, + label, + num_classes, + overlap_threshold=0.5, + neg_pos_ratio=3.0, + neg_overlap=0.5, + background_id=0, + name=None): + """ + Compute the location loss and the confidence loss for ssd. + + :param name: The Layer Name. + :type name: basestring + :param input_loc: The input predict location. + :type input_loc: LayerOutput + :param input_conf: The input priorbox confidence. + :type input_conf: LayerOutput + :param priorbox: The input priorbox location and the variance. + :type priorbox: LayerOutput + :param label: The input label. + :type label: LayerOutput + :param num_classes: The number of the classification. + :type num_classes: int + :param overlap_threshold: The threshold of the overlap. + :type overlap_threshold: float + :param neg_pos_ratio: The ratio of the negative bbox to the positive bbox. + :type neg_pos_ratio: float + :param neg_overlap: The negative bbox overlap threshold. + :type neg_overlap: float + :param background_id: The background class index. + :type background_id: int + :return: LayerOutput + """ + input_loc_num = 0 + input_conf_num = 0 + + if isinstance(input_loc, LayerOutput): + input_loc = [input_loc] + assert isinstance(input_loc, collections.Sequence) # list or tuple + for each in input_loc: + assert isinstance(each, LayerOutput) + input_loc_num += 1 + + if isinstance(input_conf, LayerOutput): + input_conf = [input_conf] + assert isinstance(input_conf, collections.Sequence) # list or tuple + for each in input_conf: + assert isinstance(each, LayerOutput) + input_conf_num += 1 + # Check the input layer number. 
+ assert input_loc_num == input_conf_num + + inputs = [priorbox.name, label.name] + inputs.extend([l.name for l in input_loc]) + inputs.extend([l.name for l in input_conf]) + parents = [priorbox, label] + parents.extend(input_loc) + parents.extend(input_conf) + + Layer( + name=name, + type=LayerType.MULTIBOX_LOSS_LAYER, + inputs=inputs, + input_num=input_loc_num, + num_classes=num_classes, + overlap_threshold=overlap_threshold, + neg_pos_ratio=neg_pos_ratio, + neg_overlap=neg_overlap, + background_id=background_id) + return LayerOutput( + name, LayerType.MULTIBOX_LOSS_LAYER, parents=parents, size=1) + + +@wrap_name_default("detection_output") +def detection_output_layer(input_loc, + input_conf, + priorbox, + num_classes, + nms_threshold=0.45, + nms_top_k=400, + keep_top_k=200, + confidence_threshold=0.01, + background_id=0, + name=None): + """ + Apply the NMS to the output of network and compute the predict bounding + box location. + + :param name: The Layer Name. + :type name: basestring + :param input_loc: The input predict location. + :type input_loc: LayerOutput + :param input_conf: The input priorbox confidence. + :type input_conf: LayerOutput + :param priorbox: The input priorbox location and the variance. + :type priorbox: LayerOutput + :param num_classes: The number of the classification. + :type num_classes: int + :param nms_threshold: The Non-maximum suppression threshold. + :type nms_threshold: float + :param nms_top_k: The bbox number kept of the NMS's output + :type nms_top_k: int + :param keep_top_k: The bbox number kept of the layer's output + :type keep_top_k: int + :param confidence_threshold: The classification confidence threshold + :type confidence_threshold: float + :param background_id: The background class index. 
+ :type background_id: int + :return: LayerOutput + """ + input_loc_num = 0 + input_conf_num = 0 + + if isinstance(input_loc, LayerOutput): + input_loc = [input_loc] + assert isinstance(input_loc, collections.Sequence) # list or tuple + for each in input_loc: + assert isinstance(each, LayerOutput) + input_loc_num += 1 + + if isinstance(input_conf, LayerOutput): + input_conf = [input_conf] + assert isinstance(input_conf, collections.Sequence) # list or tuple + for each in input_conf: + assert isinstance(each, LayerOutput) + input_conf_num += 1 + # Check the input layer number. + assert input_loc_num == input_conf_num + + inputs = [priorbox.name] + inputs.extend([l.name for l in input_loc]) + inputs.extend([l.name for l in input_conf]) + parents = [priorbox] + parents.extend(input_loc) + parents.extend(input_conf) + + size = keep_top_k * 7 + + Layer( + name=name, + type=LayerType.DETECTION_OUTPUT_LAYER, + inputs=inputs, + size=size, + input_num=input_loc_num, + num_classes=num_classes, + nms_threshold=nms_threshold, + nms_top_k=nms_top_k, + keep_top_k=keep_top_k, + confidence_threshold=confidence_threshold, + background_id=background_id) + return LayerOutput( + name, LayerType.DETECTION_OUTPUT_LAYER, parents=parents, size=size) + + @wrap_name_default("cross_channel_norm") def cross_channel_norm_layer(input, name=None, param_attr=None): """ -- GitLab