提交 7bb627d3 编写于 作者: Tao Luo 提交者: GitHub

Merge pull request #409 from luotao1/conv

Support rectangle input for CNN
...@@ -60,14 +60,12 @@ bool BatchNormBaseLayer::init(const LayerMap& layerMap, ...@@ -60,14 +60,12 @@ bool BatchNormBaseLayer::init(const LayerMap& layerMap,
void BatchNormBaseLayer::calFeatureMapSize() { void BatchNormBaseLayer::calFeatureMapSize() {
const ImageConfig& conf = config_.inputs(0).image_conf(); const ImageConfig& conf = config_.inputs(0).image_conf();
if (inputLayers_[0]->getOutput().getFrameHeight() == 0 &&
inputLayers_[0]->getOutput().getFrameWidth() == 0) {
imgSize_ = conf.img_size();
imageH_ = imgSize_;
imageW_ = imgSize_;
} else {
imageH_ = inputLayers_[0]->getOutput().getFrameHeight(); imageH_ = inputLayers_[0]->getOutput().getFrameHeight();
imageW_ = inputLayers_[0]->getOutput().getFrameWidth(); imageW_ = inputLayers_[0]->getOutput().getFrameWidth();
if (imageH_ == 0 && imageW_ == 0) {
imageH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
imageW_ = conf.img_size();
} else {
getOutput().setFrameHeight(imageH_); getOutput().setFrameHeight(imageH_);
getOutput().setFrameWidth(imageW_); getOutput().setFrameWidth(imageW_);
} }
......
...@@ -77,9 +77,8 @@ protected: ...@@ -77,9 +77,8 @@ protected:
MatrixPtr savedMean_; MatrixPtr savedMean_;
MatrixPtr savedInvVar_; MatrixPtr savedInvVar_;
/// Height or width of input image feature, now height is equal to width. /// Height or width of input image feature.
/// imgSize is 1 if the input is fully-connected layer. /// Both of them are 1 if the input is fully-connected layer.
int imgSize_;
int imageH_; int imageH_;
int imageW_; int imageW_;
/// Height * Width. /// Height * Width.
......
...@@ -26,15 +26,15 @@ size_t BilinearInterpLayer::getSize() { ...@@ -26,15 +26,15 @@ size_t BilinearInterpLayer::getSize() {
const BilinearInterpConfig& conf = config_.inputs(0).bilinear_interp_conf(); const BilinearInterpConfig& conf = config_.inputs(0).bilinear_interp_conf();
if (inImgH_ == 0) { if (inImgH_ == 0) {
inImgH_ = conf.img_size_y(); inImgH_ = conf.image_conf().img_size_y();
} }
if (inImgW_ == 0) { if (inImgW_ == 0) {
inImgW_ = conf.img_size_x(); inImgW_ = conf.image_conf().img_size();
} }
outImgH_ = conf.out_size_y(); outImgH_ = conf.out_size_y();
outImgW_ = conf.out_size_x(); outImgW_ = conf.out_size_x();
numChannels_ = conf.num_channels(); numChannels_ = conf.image_conf().channels();
CHECK(outImgH_ > 0 && outImgW_ > 0); CHECK(outImgH_ > 0 && outImgW_ > 0);
CHECK(inImgH_ > 0 && inImgW_ > 0); CHECK(inImgH_ > 0 && inImgW_ > 0);
......
...@@ -38,11 +38,12 @@ bool ConvBaseLayer::init(const LayerMap& layerMap, ...@@ -38,11 +38,12 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
filterSizeY_.push_back(conf.filter_size_y()); filterSizeY_.push_back(conf.filter_size_y());
filterPixels_.push_back(filterSize_.back() * filterSizeY_.back()); filterPixels_.push_back(filterSize_.back() * filterSizeY_.back());
channels_.push_back(conf.channels()); channels_.push_back(conf.channels());
imgSizeH_.push_back(conf.img_size()); imgSizeH_.push_back(conf.has_img_size_y() ? conf.img_size_y()
: conf.img_size());
imgSizeW_.push_back(conf.img_size()); imgSizeW_.push_back(conf.img_size());
groups_.push_back(conf.groups()); groups_.push_back(conf.groups());
filterChannels_.push_back(conf.filter_channels()); filterChannels_.push_back(conf.filter_channels());
outputH_.push_back(conf.output_x()); outputH_.push_back(conf.has_output_y() ? conf.output_y() : conf.output_x());
outputW_.push_back(conf.output_x()); outputW_.push_back(conf.output_x());
} }
...@@ -91,16 +92,19 @@ size_t ConvBaseLayer::calOutputSize() { ...@@ -91,16 +92,19 @@ size_t ConvBaseLayer::calOutputSize() {
for (size_t i = 0; i < inputLayers_.size(); i++) { for (size_t i = 0; i < inputLayers_.size(); i++) {
inH.push_back(inputLayers_[i]->getOutput().getFrameHeight()); inH.push_back(inputLayers_[i]->getOutput().getFrameHeight());
inW.push_back(inputLayers_[i]->getOutput().getFrameWidth()); inW.push_back(inputLayers_[i]->getOutput().getFrameWidth());
const ConvConfig& conf = config_.inputs(i).conv_conf();
if (isDeconv_) { if (isDeconv_) {
if (inH[i] == 0) inH[i] = config_.inputs(i).conv_conf().output_x(); if (inH[i] == 0)
if (inW[i] == 0) inW[i] = config_.inputs(i).conv_conf().output_x(); inH[i] = conf.has_output_y() ? conf.output_y() : conf.output_x();
if (inW[i] == 0) inW[i] = conf.output_x();
outH.push_back(imageSize( outH.push_back(imageSize(
inH[i], filterSizeY_[i], paddingY_[i], strideY_[i], caffeMode_)); inH[i], filterSizeY_[i], paddingY_[i], strideY_[i], caffeMode_));
outW.push_back(imageSize( outW.push_back(imageSize(
inW[i], filterSize_[i], padding_[i], stride_[i], caffeMode_)); inW[i], filterSize_[i], padding_[i], stride_[i], caffeMode_));
} else { } else {
if (inH[i] == 0) inH[i] = config_.inputs(i).conv_conf().img_size(); if (inH[i] == 0)
if (inW[i] == 0) inW[i] = config_.inputs(i).conv_conf().img_size(); inH[i] = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
if (inW[i] == 0) inW[i] = conf.img_size();
outH.push_back(outputSize( outH.push_back(outputSize(
inH[i], filterSizeY_[i], paddingY_[i], strideY_[i], caffeMode_)); inH[i], filterSizeY_[i], paddingY_[i], strideY_[i], caffeMode_));
outW.push_back(outputSize( outW.push_back(outputSize(
......
...@@ -93,9 +93,9 @@ private: ...@@ -93,9 +93,9 @@ private:
bool caffeMode_; bool caffeMode_;
int inputOffset_, outputOffset_, weightOffset_; int inputOffset_, outputOffset_, weightOffset_;
int numFilters_; int numFilters_;
int padding_, stride_, filterSize_, channels_, imgSize_; int padding_, stride_, filterSize_, channels_, imgSize_, imgSizeY_;
int paddingY_, strideY_, filterSizeY_; int paddingY_, strideY_, filterSizeY_;
int imgPixels_, filterPixels_, filterChannels_, outputX_, outputs_; int imgPixels_, filterPixels_, filterChannels_, outputX_, outputY_, outputs_;
/// Following member variables are same with CudnnConvLayer. /// Following member variables are same with CudnnConvLayer.
/// There is no explanation here. /// There is no explanation here.
...@@ -144,7 +144,7 @@ void ConvOperator::allocConvWorkSpace(size_t maxWorkSpace) { ...@@ -144,7 +144,7 @@ void ConvOperator::allocConvWorkSpace(size_t maxWorkSpace) {
void ConvOperator::reshape(int batchSize) { void ConvOperator::reshape(int batchSize) {
imageH_ = ins_[0]->getFrameHeight(); imageH_ = ins_[0]->getFrameHeight();
imageW_ = ins_[0]->getFrameWidth(); imageW_ = ins_[0]->getFrameWidth();
if (imageH_ == 0) imageH_ = imgSize_; if (imageH_ == 0) imageH_ = imgSizeY_;
if (imageW_ == 0) imageW_ = imgSize_; if (imageW_ == 0) imageW_ = imgSize_;
outputH_ = outputSize(imageH_, filterSizeY_, paddingY_, strideY_, caffeMode_); outputH_ = outputSize(imageH_, filterSizeY_, paddingY_, strideY_, caffeMode_);
outputW_ = outputSize(imageW_, filterSize_, padding_, stride_, caffeMode_); outputW_ = outputSize(imageW_, filterSize_, padding_, stride_, caffeMode_);
...@@ -182,7 +182,10 @@ void ConvOperator::computeConvSizes() { ...@@ -182,7 +182,10 @@ void ConvOperator::computeConvSizes() {
hl_create_tensor_descriptor(&inputDesc_); hl_create_tensor_descriptor(&inputDesc_);
int outputX = int outputX =
outputSize(imgSize_, filterSize_, padding_, stride_, caffeMode_); outputSize(imgSize_, filterSize_, padding_, stride_, caffeMode_);
int outputY =
outputSize(imgSizeY_, filterSizeY_, paddingY_, strideY_, caffeMode_);
CHECK_EQ(outputX, outputX_); CHECK_EQ(outputX, outputX_);
CHECK_EQ(outputY, outputY_);
hl_create_tensor_descriptor(&outputDesc_); hl_create_tensor_descriptor(&outputDesc_);
hl_create_convolution_descriptor(&convDesc_, hl_create_convolution_descriptor(&convDesc_,
inputDesc_, inputDesc_,
...@@ -236,10 +239,12 @@ void ConvOperator::getConvParams() { ...@@ -236,10 +239,12 @@ void ConvOperator::getConvParams() {
filterPixels_ = filterSize_ * filterSizeY_; filterPixels_ = filterSize_ * filterSizeY_;
channels_ = conf.channels(); channels_ = conf.channels();
imgSize_ = conf.img_size(); imgSize_ = conf.img_size();
imgPixels_ = imgSize_ * imgSize_; imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
imgPixels_ = imgSize_ * imgSizeY_;
CHECK_EQ(conf.groups(), 1U); CHECK_EQ(conf.groups(), 1U);
filterChannels_ = conf.filter_channels(); filterChannels_ = conf.filter_channels();
outputX_ = conf.output_x(); outputX_ = conf.output_x();
outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
outputs_ = outputX_ * outputX_; outputs_ = outputX_ * outputX_;
} }
......
...@@ -46,7 +46,7 @@ void ConvProjection::getConvParams() { ...@@ -46,7 +46,7 @@ void ConvProjection::getConvParams() {
filterH_ = conf.filter_size_y(); filterH_ = conf.filter_size_y();
filterW_ = conf.filter_size(); filterW_ = conf.filter_size();
configImgH_ = conf.img_size(); configImgH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
configImgW_ = conf.img_size(); configImgW_ = conf.img_size();
channels_ = conf.channels(); channels_ = conf.channels();
...@@ -58,9 +58,11 @@ void ConvProjection::getConvParams() { ...@@ -58,9 +58,11 @@ void ConvProjection::getConvParams() {
} }
void ConvProjection::initCudnn() { void ConvProjection::initCudnn() {
hl_create_filter_descriptor( hl_create_filter_descriptor(&filterDesc_,
&filterDesc_, channels_ / groups_, numFilters_ / groups_, channels_ / groups_,
filterH_, filterW_); numFilters_ / groups_,
filterH_,
filterW_);
hl_create_tensor_descriptor(&inputDesc_); hl_create_tensor_descriptor(&inputDesc_);
hl_create_tensor_descriptor(&outputDesc_); hl_create_tensor_descriptor(&outputDesc_);
hl_create_convolution_descriptor(&convDesc_, hl_create_convolution_descriptor(&convDesc_,
......
...@@ -49,8 +49,13 @@ void DataLayer::copyDataToOutput(Argument& output) { ...@@ -49,8 +49,13 @@ void DataLayer::copyDataToOutput(Argument& output) {
output.ids->copyFrom(*data_.ids); output.ids->copyFrom(*data_.ids);
} }
} }
if (config_.height() && config_.width()) {
output.setFrameHeight(config_.height());
output.setFrameWidth(config_.width());
} else {
output.setFrameHeight(data_.getFrameHeight());
output.setFrameHeight(data_.getFrameHeight()); output.setFrameHeight(data_.getFrameHeight());
output.setFrameWidth(data_.getFrameWidth()); }
output.cpuSequenceDims = data_.cpuSequenceDims; output.cpuSequenceDims = data_.cpuSequenceDims;
output.sequenceStartPositions = data_.sequenceStartPositions; output.sequenceStartPositions = data_.sequenceStartPositions;
output.subSequenceStartPositions = data_.subSequenceStartPositions; output.subSequenceStartPositions = data_.subSequenceStartPositions;
......
...@@ -29,16 +29,18 @@ bool ExpandConvBaseLayer::init(const LayerMap &layerMap, ...@@ -29,16 +29,18 @@ bool ExpandConvBaseLayer::init(const LayerMap &layerMap,
* meaning as in conv, we need to swap channels_ and numFilters here for * meaning as in conv, we need to swap channels_ and numFilters here for
* convTrans, and in other functions too. * convTrans, and in other functions too.
* */ * */
int channel;
int numFilters;
/* Initialize the projection */ /* Initialize the projection */
for (auto &inputConfig : config_.inputs()) { for (auto &inputConfig : config_.inputs()) {
const ConvConfig &conf = inputConfig.conv_conf(); const ConvConfig &conf = inputConfig.conv_conf();
numFilters = isDeconv_ ? conf.channels() : numFilters_; int numFilters = isDeconv_ ? conf.channels() : numFilters_;
subM_.push_back(numFilters / conf.groups()); subM_.push_back(numFilters / conf.groups());
subN_.push_back(conf.output_x() * conf.output_x()); subN_.push_back(conf.output_x() *
channel = isDeconv_ ? numFilters_ : conf.channels(); (conf.has_output_y() ? conf.output_y() : conf.output_x()));
subK_.push_back(channel * conf.filter_size() * conf.filter_size() / int channel = isDeconv_ ? numFilters_ : conf.channels();
subK_.push_back(
channel * conf.filter_size() *
(conf.has_filter_size_y() ? conf.filter_size_y() : conf.filter_size()) /
conf.groups()); conf.groups());
/* Consistent caffe mode for multiple input */ /* Consistent caffe mode for multiple input */
caffeMode_ = conf.caffe_mode(); caffeMode_ = conf.caffe_mode();
...@@ -116,11 +118,11 @@ void ExpandConvBaseLayer::expandOneFrame(MatrixPtr image, ...@@ -116,11 +118,11 @@ void ExpandConvBaseLayer::expandOneFrame(MatrixPtr image,
imgSizeH_[inIdx], imgSizeH_[inIdx],
imgSizeW_[inIdx], imgSizeW_[inIdx],
channel, channel,
filterSizeY_[inIdx],
filterSize_[inIdx], filterSize_[inIdx],
filterSize_[inIdx], strideY_[inIdx],
stride_[inIdx], stride_[inIdx],
stride_[inIdx], paddingY_[inIdx],
padding_[inIdx],
padding_[inIdx], padding_[inIdx],
outputH_[inIdx], outputH_[inIdx],
outputW_[inIdx]); outputW_[inIdx]);
...@@ -208,11 +210,11 @@ void ExpandConvBaseLayer::bpropActs(MatrixPtr out, ...@@ -208,11 +210,11 @@ void ExpandConvBaseLayer::bpropActs(MatrixPtr out,
imgSizeH_[inpIdx], imgSizeH_[inpIdx],
imgSizeW_[inpIdx], imgSizeW_[inpIdx],
channel, channel,
filterSizeY_[inpIdx],
filterSize_[inpIdx], filterSize_[inpIdx],
filterSize_[inpIdx], strideY_[inpIdx],
stride_[inpIdx],
stride_[inpIdx], stride_[inpIdx],
padding_[inpIdx], paddingY_[inpIdx],
padding_[inpIdx], padding_[inpIdx],
outputH_[inpIdx], outputH_[inpIdx],
outputW_[inpIdx], outputW_[inpIdx],
......
...@@ -25,10 +25,10 @@ size_t MaxOutLayer::getSize() { ...@@ -25,10 +25,10 @@ size_t MaxOutLayer::getSize() {
imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight(); imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth(); imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
if (imgSizeH_ == 0) { if (imgSizeH_ == 0) {
imgSizeH_ = maxoutConf.img_size_y(); imgSizeH_ = maxoutConf.image_conf().img_size_y();
} }
if (imgSizeW_ == 0) { if (imgSizeW_ == 0) {
imgSizeW_ = maxoutConf.img_size_x(); imgSizeW_ = maxoutConf.image_conf().img_size();
} }
featLen_ = imgSizeH_ * imgSizeW_; featLen_ = imgSizeH_ * imgSizeW_;
...@@ -50,7 +50,7 @@ bool MaxOutLayer::init(const LayerMap& layerMap, ...@@ -50,7 +50,7 @@ bool MaxOutLayer::init(const LayerMap& layerMap,
const MaxOutConfig& conf = config_.inputs(0).maxout_conf(); const MaxOutConfig& conf = config_.inputs(0).maxout_conf();
groups_ = conf.groups(); groups_ = conf.groups();
channels_ = conf.channels(); channels_ = conf.image_conf().channels();
CHECK_EQ(channels_ % groups_, 0UL); CHECK_EQ(channels_ % groups_, 0UL);
outputChannels_ = channels_ / groups_; outputChannels_ = channels_ / groups_;
......
...@@ -48,6 +48,9 @@ bool ResponseNormLayer::init(const LayerMap& layerMap, ...@@ -48,6 +48,9 @@ bool ResponseNormLayer::init(const LayerMap& layerMap,
outputX_ = conf.output_x(); outputX_ = conf.output_x();
imgSize_ = conf.img_size(); imgSize_ = conf.img_size();
denoms_ = NULL; denoms_ = NULL;
outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
return true; return true;
} }
......
...@@ -49,7 +49,7 @@ public: ...@@ -49,7 +49,7 @@ public:
*/ */
class ResponseNormLayer : public NormLayer { class ResponseNormLayer : public NormLayer {
protected: protected:
size_t channels_, size_, outputX_, imgSize_; size_t channels_, size_, outputX_, imgSize_, outputY_, imgSizeY_;
float scale_, pow_; float scale_, pow_;
MatrixPtr denoms_; MatrixPtr denoms_;
......
...@@ -23,7 +23,7 @@ size_t CMRProjectionNormLayer::getSize() { ...@@ -23,7 +23,7 @@ size_t CMRProjectionNormLayer::getSize() {
imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight(); imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth(); imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
if (imgSizeH_ == 0) { if (imgSizeH_ == 0) {
imgSizeH_ = imgSize_; imgSizeH_ = imgSizeY_;
} }
if (imgSizeW_ == 0) { if (imgSizeW_ == 0) {
imgSizeW_ = imgSize_; imgSizeW_ = imgSize_;
......
...@@ -56,14 +56,14 @@ ProjectionConfig SpatialPyramidPoolLayer::getConfig(size_t imgSizeW, ...@@ -56,14 +56,14 @@ ProjectionConfig SpatialPyramidPoolLayer::getConfig(size_t imgSizeW,
size_t SpatialPyramidPoolLayer::getSize() { size_t SpatialPyramidPoolLayer::getSize() {
CHECK_EQ(inputLayers_.size(), 1UL); CHECK_EQ(inputLayers_.size(), 1UL);
size_t layerSize = 0; size_t layerSize = 0;
const SppConfig& sppConf = config_.inputs(0).spp_conf(); const ImageConfig& conf = config_.inputs(0).spp_conf().image_conf();
imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight(); imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth(); imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
if (imgSizeH_ == 0) { if (imgSizeH_ == 0) {
imgSizeH_ = sppConf.has_img_size_y() ? sppConf.img_size_y() : imgSizeW_; imgSizeH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
} }
if (imgSizeW_ == 0) { if (imgSizeW_ == 0) {
imgSizeW_ = sppConf.img_size(); imgSizeW_ = conf.img_size();
} }
size_t outputH = 1; size_t outputH = 1;
...@@ -82,9 +82,10 @@ bool SpatialPyramidPoolLayer::init(const LayerMap& layerMap, ...@@ -82,9 +82,10 @@ bool SpatialPyramidPoolLayer::init(const LayerMap& layerMap,
pyramidHeight_ = sppConf.pyramid_height(); pyramidHeight_ = sppConf.pyramid_height();
poolType_ = sppConf.pool_type(); poolType_ = sppConf.pool_type();
channels_ = sppConf.channels(); const ImageConfig& imageConf = sppConf.image_conf();
imgSizeW_ = sppConf.img_size(); channels_ = imageConf.channels();
imgSizeH_ = sppConf.has_img_size_y() ? sppConf.img_size_y() : imgSizeW_; imgSizeW_ = imageConf.img_size();
imgSizeH_ = imageConf.has_img_size_y() ? imageConf.img_size_y() : imgSizeW_;
poolProjections_.reserve(pyramidHeight_); poolProjections_.reserve(pyramidHeight_);
projCol_.reserve(pyramidHeight_); projCol_.reserve(pyramidHeight_);
projOutput_.resize(pyramidHeight_); projOutput_.resize(pyramidHeight_);
......
...@@ -28,7 +28,6 @@ maxpool = img_pool_layer(input=conv, ...@@ -28,7 +28,6 @@ maxpool = img_pool_layer(input=conv,
stride_y=2, stride_y=2,
padding=1, padding=1,
padding_y=2, padding_y=2,
img_width=16,
pool_type=MaxPooling(), pool_type=MaxPooling(),
) )
avgpool = img_pool_layer(input=conv, avgpool = img_pool_layer(input=conv,
...@@ -39,7 +38,6 @@ avgpool = img_pool_layer(input=conv, ...@@ -39,7 +38,6 @@ avgpool = img_pool_layer(input=conv,
stride_y=2, stride_y=2,
padding=1, padding=1,
padding_y=2, padding_y=2,
img_width=16,
pool_type=AvgPooling(), pool_type=AvgPooling(),
) )
......
...@@ -202,11 +202,10 @@ void testProjectionConv(size_t groups) { ...@@ -202,11 +202,10 @@ void testProjectionConv(size_t groups) {
conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS); conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS);
conf.set_output_size(output_x * output_y * NUM_FILTERS); conf.set_output_size(output_x * output_y * NUM_FILTERS);
testProjectionGrad( testProjectionGrad(conf,
conf,
INPUT_DATA, INPUT_DATA,
/* parameterSize */ NUM_FILTERS * CHANNELS * FILTER_SIZE * FILTER_SIZE_Y /* parameterSize */ NUM_FILTERS * CHANNELS * FILTER_SIZE *
/ groups, FILTER_SIZE_Y / groups,
/* batchSize */ 100, /* batchSize */ 100,
true, true,
false, false,
...@@ -229,9 +228,10 @@ TEST(Layer, BilinearInterpLayer) { ...@@ -229,9 +228,10 @@ TEST(Layer, BilinearInterpLayer) {
LayerInputConfig* input = config.layerConfig.add_inputs(); LayerInputConfig* input = config.layerConfig.add_inputs();
BilinearInterpConfig* bilinear = input->mutable_bilinear_interp_conf(); BilinearInterpConfig* bilinear = input->mutable_bilinear_interp_conf();
bilinear->set_img_size_x(32); ImageConfig* image = bilinear->mutable_image_conf();
bilinear->set_img_size_y(32); image->set_img_size(32);
bilinear->set_num_channels(4); image->set_img_size_y(32);
image->set_channels(4);
for (auto useGpu : {false, true}) { for (auto useGpu : {false, true}) {
for (auto outSize : {32, 64}) { for (auto outSize : {32, 64}) {
...@@ -354,7 +354,7 @@ void testConvLayer(const string& type, bool trans, bool useGpu) { ...@@ -354,7 +354,7 @@ void testConvLayer(const string& type, bool trans, bool useGpu) {
config.layerConfig.set_partial_sum(1); config.layerConfig.set_partial_sum(1);
config.layerConfig.set_shared_biases(true); config.layerConfig.set_shared_biases(true);
config.inputDefs.push_back({INPUT_DATA, "layer_0", 768, 288}); config.inputDefs.push_back({INPUT_DATA, "layer_0", 384, 288});
LayerInputConfig* input = config.layerConfig.add_inputs(); LayerInputConfig* input = config.layerConfig.add_inputs();
ConvConfig* conv = input->mutable_conv_conf(); ConvConfig* conv = input->mutable_conv_conf();
conv->set_filter_size(2); conv->set_filter_size(2);
...@@ -367,12 +367,18 @@ void testConvLayer(const string& type, bool trans, bool useGpu) { ...@@ -367,12 +367,18 @@ void testConvLayer(const string& type, bool trans, bool useGpu) {
conv->set_groups(1); conv->set_groups(1);
conv->set_filter_channels(conv->channels() / conv->groups()); conv->set_filter_channels(conv->channels() / conv->groups());
conv->set_img_size(16); conv->set_img_size(16);
conv->set_img_size_y(8);
conv->set_output_x(outputSize(conv->img_size(), conv->set_output_x(outputSize(conv->img_size(),
conv->filter_size(), conv->filter_size(),
conv->padding(), conv->padding(),
conv->stride(), conv->stride(),
/* caffeMode */ true)); /* caffeMode */ true));
config.layerConfig.set_size(conv->output_x() * conv->output_x() * conv->set_output_y(outputSize(conv->img_size_y(),
conv->filter_size_y(),
conv->padding_y(),
conv->stride_y(),
/* caffeMode */ true));
config.layerConfig.set_size(conv->output_x() * conv->output_y() *
config.layerConfig.num_filters()); config.layerConfig.num_filters());
testLayerGrad(config, "conv", 100, trans, useGpu); testLayerGrad(config, "conv", 100, trans, useGpu);
...@@ -472,10 +478,11 @@ TEST(Layer, maxoutLayer) { ...@@ -472,10 +478,11 @@ TEST(Layer, maxoutLayer) {
config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0}); config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0});
LayerInputConfig* input = config.layerConfig.add_inputs(); LayerInputConfig* input = config.layerConfig.add_inputs();
MaxOutConfig* maxout = input->mutable_maxout_conf(); MaxOutConfig* maxout = input->mutable_maxout_conf();
ImageConfig* image = maxout->mutable_image_conf();
maxout->set_img_size_x(32); image->set_img_size(32);
maxout->set_img_size_y(32); image->set_img_size_y(32);
maxout->set_channels(4); image->set_channels(4);
maxout->set_groups(2); maxout->set_groups(2);
for (auto useGpu : {false, true}) { for (auto useGpu : {false, true}) {
...@@ -987,7 +994,7 @@ void testNormLayer(const string& normType, bool trans, bool useGpu) { ...@@ -987,7 +994,7 @@ void testNormLayer(const string& normType, bool trans, bool useGpu) {
config.layerConfig.set_type("norm"); config.layerConfig.set_type("norm");
config.layerConfig.set_active_type("relu"); config.layerConfig.set_active_type("relu");
config.inputDefs.push_back({INPUT_DATA, "layer_0", 3136, 0}); config.inputDefs.push_back({INPUT_DATA, "layer_0", 1568, 0});
LayerInputConfig* input = config.layerConfig.add_inputs(); LayerInputConfig* input = config.layerConfig.add_inputs();
NormConfig* norm = input->mutable_norm_conf(); NormConfig* norm = input->mutable_norm_conf();
norm->set_norm_type(normType); norm->set_norm_type(normType);
...@@ -997,7 +1004,9 @@ void testNormLayer(const string& normType, bool trans, bool useGpu) { ...@@ -997,7 +1004,9 @@ void testNormLayer(const string& normType, bool trans, bool useGpu) {
norm->set_pow(0.75); norm->set_pow(0.75);
norm->set_blocked(0); norm->set_blocked(0);
norm->set_img_size(14); norm->set_img_size(14);
norm->set_img_size_y(7);
norm->set_output_x(norm->img_size()); norm->set_output_x(norm->img_size());
norm->set_output_y(norm->img_size_y());
if (norm->norm_type() == "cmrnorm" || if (norm->norm_type() == "cmrnorm" ||
norm->norm_type() == "cmrnorm-projection") { norm->norm_type() == "cmrnorm-projection") {
norm->set_scale(norm->scale() / norm->size()); norm->set_scale(norm->scale() / norm->size());
...@@ -1005,7 +1014,7 @@ void testNormLayer(const string& normType, bool trans, bool useGpu) { ...@@ -1005,7 +1014,7 @@ void testNormLayer(const string& normType, bool trans, bool useGpu) {
norm->set_scale(norm->scale() / (norm->size() * norm->size())); norm->set_scale(norm->scale() / (norm->size() * norm->size()));
} }
config.layerConfig.set_size(norm->output_x() * norm->output_x() * config.layerConfig.set_size(norm->output_x() * norm->output_y() *
norm->channels()); norm->channels());
config.biasSize = 0; config.biasSize = 0;
...@@ -1106,11 +1115,12 @@ void testSppLayer(const string& poolType, ...@@ -1106,11 +1115,12 @@ void testSppLayer(const string& poolType,
SppConfig* sppConfig = input->mutable_spp_conf(); SppConfig* sppConfig = input->mutable_spp_conf();
sppConfig->set_pool_type(poolType); sppConfig->set_pool_type(poolType);
sppConfig->set_pyramid_height(pyramidHeight); sppConfig->set_pyramid_height(pyramidHeight);
sppConfig->set_channels(16); ImageConfig* imageConfig = sppConfig->mutable_image_conf();
sppConfig->set_img_size(10); imageConfig->set_channels(16);
sppConfig->set_img_size_y(20); imageConfig->set_img_size(10);
imageConfig->set_img_size_y(20);
int outputSize = (std::pow(4, sppConfig->pyramid_height()) - 1) / (4 - 1); int outputSize = (std::pow(4, sppConfig->pyramid_height()) - 1) / (4 - 1);
config.layerConfig.set_size(outputSize * sppConfig->channels()); config.layerConfig.set_size(outputSize * imageConfig->channels());
testLayerGrad(config, "spp", 100, trans, useGpu); testLayerGrad(config, "spp", 100, trans, useGpu);
} }
...@@ -1420,13 +1430,15 @@ void testBatchNormLayer(const string& type, bool trans, bool useGpu) { ...@@ -1420,13 +1430,15 @@ void testBatchNormLayer(const string& type, bool trans, bool useGpu) {
TestConfig config; TestConfig config;
const int CHANNELS = 10; const int CHANNELS = 10;
const int IMG_SIZE = 16; const int IMG_SIZE = 16;
const int IMG_SIZE_Y = 8;
size_t size = CHANNELS * IMG_SIZE * IMG_SIZE_Y;
config.layerConfig.set_type(type); config.layerConfig.set_type(type);
config.layerConfig.set_size(CHANNELS * IMG_SIZE * IMG_SIZE); config.layerConfig.set_size(size);
config.layerConfig.set_active_type("sigmoid"); config.layerConfig.set_active_type("sigmoid");
config.biasSize = CHANNELS; config.biasSize = CHANNELS;
config.inputDefs.push_back({INPUT_DATA, config.inputDefs.push_back({INPUT_DATA,
"layer_0", "layer_0",
/* dim= */ IMG_SIZE * IMG_SIZE * CHANNELS, /* dim= */ size,
/* paraSize= */ CHANNELS}); /* paraSize= */ CHANNELS});
config.inputDefs.push_back({INPUT_DATA, "layer_1_running_mean", 1, CHANNELS}); config.inputDefs.push_back({INPUT_DATA, "layer_1_running_mean", 1, CHANNELS});
...@@ -1441,6 +1453,7 @@ void testBatchNormLayer(const string& type, bool trans, bool useGpu) { ...@@ -1441,6 +1453,7 @@ void testBatchNormLayer(const string& type, bool trans, bool useGpu) {
ImageConfig* img_conf = input->mutable_image_conf(); ImageConfig* img_conf = input->mutable_image_conf();
img_conf->set_channels(CHANNELS); img_conf->set_channels(CHANNELS);
img_conf->set_img_size(IMG_SIZE); img_conf->set_img_size(IMG_SIZE);
img_conf->set_img_size_y(IMG_SIZE_Y);
testLayerGrad(config, testLayerGrad(config,
"batch_norm", "batch_norm",
...@@ -1467,6 +1480,7 @@ TEST(Operator, conv) { ...@@ -1467,6 +1480,7 @@ TEST(Operator, conv) {
const int FILTER_SIZE_Y = 3; const int FILTER_SIZE_Y = 3;
const int CHANNELS = 3; const int CHANNELS = 3;
const int IMAGE_SIZE = 16; const int IMAGE_SIZE = 16;
const int IMAGE_SIZE_Y = 8;
OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs(); OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs();
operatorConf.set_type("conv"); operatorConf.set_type("conv");
ConvConfig* conv = operatorConf.mutable_conv_conf(); ConvConfig* conv = operatorConf.mutable_conv_conf();
...@@ -1481,19 +1495,22 @@ TEST(Operator, conv) { ...@@ -1481,19 +1495,22 @@ TEST(Operator, conv) {
conv->set_groups(1); conv->set_groups(1);
conv->set_filter_channels(conv->channels() / conv->groups()); conv->set_filter_channels(conv->channels() / conv->groups());
conv->set_img_size(IMAGE_SIZE); conv->set_img_size(IMAGE_SIZE);
int output_x = outputSize(conv->img_size(), conv->set_img_size_y(IMAGE_SIZE_Y);
conv->set_output_x(outputSize(conv->img_size(),
conv->filter_size(), conv->filter_size(),
conv->padding(), conv->padding(),
conv->stride(), conv->stride(),
/* caffeMode */ true); /* caffeMode */ true));
conv->set_output_x(output_x); conv->set_output_y(outputSize(conv->img_size_y(),
config.layerConfig.set_size(output_x * output_x * conv->filter_size_y(),
config.layerConfig.num_filters()); conv->padding_y(),
config.layerConfig.set_size(conv->output_x() * conv->output_x() * conv->stride_y(),
/* caffeMode */ true));
config.layerConfig.set_size(conv->output_x() * conv->output_y() *
NUM_FILTERS); NUM_FILTERS);
config.inputDefs.push_back( config.inputDefs.push_back(
{INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE * CHANNELS, 0}); {INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE_Y * CHANNELS, 0});
config.inputDefs.push_back( config.inputDefs.push_back(
{INPUT_DATA, {INPUT_DATA,
"layer_1", "layer_1",
......
...@@ -225,6 +225,8 @@ void Argument::resizeAndCopyFrom(const Argument& src, ...@@ -225,6 +225,8 @@ void Argument::resizeAndCopyFrom(const Argument& src,
} }
resizeAndCopy(udp, src.udp, useGpu, stream); resizeAndCopy(udp, src.udp, useGpu, stream);
resizeAndCopy(strs, src.strs, useGpu, stream); resizeAndCopy(strs, src.strs, useGpu, stream);
frameWidth = src.frameWidth;
frameHeight = src.frameHeight;
} }
int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t Argument::resizeAndCopyFrom(const Argument& src,
......
...@@ -59,7 +59,6 @@ pool = img_pool_layer(input=fc2, ...@@ -59,7 +59,6 @@ pool = img_pool_layer(input=fc2,
padding_y=2, padding_y=2,
stride=2, stride=2,
stride_y=3, stride_y=3,
img_width=3,
pool_type=CudnnAvgPooling()) pool_type=CudnnAvgPooling())
concat = concat_layer(input=[fc3, fc4]) concat = concat_layer(input=[fc3, fc4])
......
...@@ -77,6 +77,12 @@ message ConvConfig { ...@@ -77,6 +77,12 @@ message ConvConfig {
required uint32 filter_size_y = 10; required uint32 filter_size_y = 10;
required uint32 padding_y = 11; required uint32 padding_y = 11;
required uint32 stride_y = 12; required uint32 stride_y = 12;
// if not set, use output_x
optional uint32 output_y = 13;
// if not set, use img_size
optional uint32 img_size_y = 14;
} }
message PoolConfig { message PoolConfig {
...@@ -122,11 +128,9 @@ message PoolConfig { ...@@ -122,11 +128,9 @@ message PoolConfig {
} }
message SppConfig { message SppConfig {
required string pool_type = 1; required ImageConfig image_conf = 1;
required uint32 pyramid_height = 2; required string pool_type = 2;
required uint32 channels = 3; required uint32 pyramid_height = 3;
required uint32 img_size = 4;
optional uint32 img_size_y = 5;
} }
message NormConfig { message NormConfig {
...@@ -156,6 +160,12 @@ message NormConfig { ...@@ -156,6 +160,12 @@ message NormConfig {
// fixed window: shared a fixed window for each value // fixed window: shared a fixed window for each value
// sliding window: have a different window for each value // sliding window: have a different window for each value
optional bool blocked = 8; optional bool blocked = 8;
// if not set, use output_x
optional uint32 output_y = 9;
// if not set, use img_size
optional uint32 img_size_y = 10;
} }
message BlockExpandConfig { message BlockExpandConfig {
...@@ -180,12 +190,8 @@ message BlockExpandConfig { ...@@ -180,12 +190,8 @@ message BlockExpandConfig {
} }
message MaxOutConfig { message MaxOutConfig {
required uint32 channels = 1; required ImageConfig image_conf = 1;
required uint32 groups = 2; required uint32 groups = 2;
// The size of input feature map.
required uint32 img_size_x = 3;
required uint32 img_size_y = 4;
} }
message ProjectionConfig { message ProjectionConfig {
...@@ -226,12 +232,10 @@ message OperatorConfig { ...@@ -226,12 +232,10 @@ message OperatorConfig {
message BilinearInterpConfig { message BilinearInterpConfig {
// The size of input feature map. // The size of input feature map.
optional uint32 img_size_x = 1; required ImageConfig image_conf = 1;
optional uint32 img_size_y = 2;
// The size of output feature map. // The size of output feature map.
required uint32 out_size_x = 3; required uint32 out_size_x = 2;
required uint32 out_size_y = 4; required uint32 out_size_y = 3;
required uint32 num_channels = 5;
} }
message ImageConfig { message ImageConfig {
...@@ -241,6 +245,7 @@ message ImageConfig { ...@@ -241,6 +245,7 @@ message ImageConfig {
// The size of input feature map. // The size of input feature map.
required uint32 img_size = 8; required uint32 img_size = 8;
required uint32 img_size_y = 9;
} }
message LayerInputConfig { message LayerInputConfig {
...@@ -414,6 +419,9 @@ sinclude(`ModelConfigLayer.proto.m4') ...@@ -414,6 +419,9 @@ sinclude(`ModelConfigLayer.proto.m4')
// to string and reinterpreted in the user's own layer implementation. // to string and reinterpreted in the user's own layer implementation.
optional string user_arg = 49; optional string user_arg = 49;
// to indicate rectangle image data
optional uint64 height = 50;
optional uint64 width = 51;
} }
message EvaluatorConfig { message EvaluatorConfig {
......
...@@ -138,7 +138,14 @@ def init_config_environment( ...@@ -138,7 +138,14 @@ def init_config_environment(
g_root_submodel=None, g_root_submodel=None,
g_submodel_map={}, g_submodel_map={},
g_submodel_stack=[], g_submodel_stack=[],
g_add_submodel_suffix=False, ): g_add_submodel_suffix=False,
# Whether the current layer should pass on the image height and width.
# The default is True; it becomes False once a recurrent_layer_group is
# encountered, because the image is then converted into a sequence: the
# image height becomes the sequence length, and the image width becomes
# the feature length of each timestep.
g_pass_height_width=True, ):
for k, v in locals().iteritems(): for k, v in locals().iteritems():
globals()[k] = copy.deepcopy(v) globals()[k] = copy.deepcopy(v)
...@@ -686,9 +693,9 @@ class ConvProjection(Projection): ...@@ -686,9 +693,9 @@ class ConvProjection(Projection):
parse_conv(conv_conf, input_layer_name, self.proj_conf.conv_conf, parse_conv(conv_conf, input_layer_name, self.proj_conf.conv_conf,
num_filters) num_filters)
# TODO: support rectangle input self.proj_conf.output_size = self.proj_conf.conv_conf.output_x * \
self.proj_conf.output_size = (self.proj_conf.conv_conf.output_x self.proj_conf.conv_conf.output_y * \
**2) * num_filters num_filters
def calc_output_size(self, input_layer_config): def calc_output_size(self, input_layer_config):
return self.proj_conf.output_size return self.proj_conf.output_size
...@@ -764,8 +771,9 @@ class ConvOperator(Operator): ...@@ -764,8 +771,9 @@ class ConvOperator(Operator):
parse_conv(conv_conf, parse_conv(conv_conf,
MakeLayerNameInSubmodel(input_layer_names[0]), MakeLayerNameInSubmodel(input_layer_names[0]),
self.operator_conf.conv_conf, num_filters) self.operator_conf.conv_conf, num_filters)
self.operator_conf.output_size = (self.operator_conf.conv_conf.output_x self.operator_conf.output_size = self.operator_conf.conv_conf.output_x * \
**2) * num_filters self.operator_conf.conv_conf.output_y * \
num_filters
config_assert(len(input_layer_names) == 2, "Conv is binary operator") config_assert(len(input_layer_names) == 2, "Conv is binary operator")
...@@ -800,14 +808,12 @@ class Conv(Cfg): ...@@ -800,14 +808,12 @@ class Conv(Cfg):
config_assert(output_x <= 0) config_assert(output_x <= 0)
# please refer to the comments in proto/ModelConfig.proto
@config_class @config_class
class BilinearInterp(Cfg): class BilinearInterp(Cfg):
def __init__(self, out_size_x=None, out_size_y=None, num_channels=None): def __init__(self, out_size_x=None, out_size_y=None, channels=None):
self.add_keys(locals()) self.add_keys(locals())
# please refer to the comments in proto/ModelConfig.proto
@config_class @config_class
class Pool(Cfg): class Pool(Cfg):
def __init__( def __init__(
...@@ -825,14 +831,12 @@ class Pool(Cfg): ...@@ -825,14 +831,12 @@ class Pool(Cfg):
self.add_keys(locals()) self.add_keys(locals())
# please refer to the comments in proto/ModelConfig.proto
@config_class @config_class
class SpatialPyramidPool(Cfg): class SpatialPyramidPool(Cfg):
def __init__(self, pool_type, pyramid_height, channels, img_width=None): def __init__(self, pool_type, pyramid_height, channels):
self.add_keys(locals()) self.add_keys(locals())
# please refer to the comments in proto/ModelConfig.proto
@config_class @config_class
class Norm(Cfg): class Norm(Cfg):
def __init__(self, def __init__(self,
...@@ -847,7 +851,6 @@ class Norm(Cfg): ...@@ -847,7 +851,6 @@ class Norm(Cfg):
self.add_keys(locals()) self.add_keys(locals())
# please refer to the comments in proto/ModelConfig.proto
@config_class @config_class
class Image(Cfg): class Image(Cfg):
def __init__(self, channels, img_size=None): def __init__(self, channels, img_size=None):
...@@ -1054,18 +1057,8 @@ def TestData(data_config, async_load_data=None): ...@@ -1054,18 +1057,8 @@ def TestData(data_config, async_load_data=None):
g_config.test_data_config.async_load_data = async_load_data g_config.test_data_config.async_load_data = async_load_data
def parse_bilinear(bilinear, input_layer_name, bilinear_conf): #caffe_mode: compute the output size using floor instead of ceil,
bilinear_conf.out_size_x = bilinear.out_size_x # which is consistent with caffe and CuDNN's convention.
bilinear_conf.out_size_y = bilinear.out_size_y
bilinear_conf.num_channels = bilinear.num_channels
'''
caffe_mode: compute the output size using floor instead of ceil,
which is consistent with caffe and CuDNN's convention.
'''
def cnn_output_size(img_size, filter_size, padding, stride, caffe_mode): def cnn_output_size(img_size, filter_size, padding, stride, caffe_mode):
output = (2 * padding + img_size - filter_size) / float(stride) output = (2 * padding + img_size - filter_size) / float(stride)
if caffe_mode: if caffe_mode:
...@@ -1074,20 +1067,34 @@ def cnn_output_size(img_size, filter_size, padding, stride, caffe_mode): ...@@ -1074,20 +1067,34 @@ def cnn_output_size(img_size, filter_size, padding, stride, caffe_mode):
return 1 + int(math.ceil(output)) return 1 + int(math.ceil(output))
''' #calculate image_size based on output_size for de-convolution (ConvTransLayer).
calculate image_size based on output_size for convolution. #It is the reverse function of cnn_output_size
It is the reverse function of cnn_output_size
'''
def cnn_image_size(output_size, filter_size, padding, stride, caffe_mode): def cnn_image_size(output_size, filter_size, padding, stride, caffe_mode):
if caffe_mode:
img_size = (output_size - 1) * stride + filter_size - 2 * padding img_size = (output_size - 1) * stride + filter_size - 2 * padding
else: if not caffe_mode:
img_size = (output_size - 2) * stride + filter_size - 2 * padding + 1 img_size = img_size + 1
return img_size return img_size
def get_img_size(input_layer_name, channels):
input = g_layer_map[input_layer_name]
img_pixels = input.size / channels
img_size = input.width if input.width > 0 else int(img_pixels**0.5)
img_size_y = input.height if input.height > 0 else int(img_pixels /
img_size)
config_assert(
img_size * img_size_y == img_pixels,
"Input layer %s: Incorrect input image size %d * %d for input image pixels %d"
% (input_layer_name, img_size, img_size_y, img_pixels))
return img_size, img_size_y
def parse_bilinear(bilinear, input_layer_name, bilinear_conf):
parse_image(bilinear, input_layer_name, bilinear_conf.image_conf)
bilinear_conf.out_size_x = bilinear.out_size_x
bilinear_conf.out_size_y = bilinear.out_size_y
def parse_pool(pool, input_layer_name, pool_conf): def parse_pool(pool, input_layer_name, pool_conf):
pool_conf.pool_type = pool.pool_type pool_conf.pool_type = pool.pool_type
config_assert(pool.pool_type in [ config_assert(pool.pool_type in [
...@@ -1103,14 +1110,8 @@ def parse_pool(pool, input_layer_name, pool_conf): ...@@ -1103,14 +1110,8 @@ def parse_pool(pool, input_layer_name, pool_conf):
pool_conf.size_y = default(pool.size_y, pool_conf.size_x) pool_conf.size_y = default(pool.size_y, pool_conf.size_x)
pool_conf.stride_y = default(pool.stride_y, pool_conf.stride) pool_conf.stride_y = default(pool.stride_y, pool_conf.stride)
img_pixels = g_layer_map[input_layer_name].size / pool.channels pool_conf.img_size, pool_conf.img_size_y = \
# the img_width may be removed, get_img_size(input_layer_name, pool.channels)
# and it can be calculated automatically later.
pool_conf.img_size = default(pool.img_width, int(img_pixels**0.5))
pool_conf.img_size_y = img_pixels / pool_conf.img_size
config_assert(pool_conf.img_size * pool_conf.img_size_y == img_pixels,
"Incorrect input image size %d for input image pixels %d" %
(pool_conf.img_size, img_pixels))
config_assert(not pool.start, "start is deprecated in pooling.") config_assert(not pool.start, "start is deprecated in pooling.")
...@@ -1126,29 +1127,18 @@ def parse_pool(pool, input_layer_name, pool_conf): ...@@ -1126,29 +1127,18 @@ def parse_pool(pool, input_layer_name, pool_conf):
def parse_spp(spp, input_layer_name, spp_conf): def parse_spp(spp, input_layer_name, spp_conf):
parse_image(spp, input_layer_name, spp_conf.image_conf)
spp_conf.pool_type = spp.pool_type spp_conf.pool_type = spp.pool_type
config_assert(spp.pool_type in ['max-projection', 'avg-projection'], config_assert(spp.pool_type in ['max-projection', 'avg-projection'],
"pool-type %s is not in " "pool-type %s is not in "
"['max-projection', 'avg-projection']" % spp.pool_type) "['max-projection', 'avg-projection']" % spp.pool_type)
spp_conf.pyramid_height = spp.pyramid_height spp_conf.pyramid_height = spp.pyramid_height
spp_conf.channels = spp.channels
img_pixels = g_layer_map[input_layer_name].size / spp_conf.channels
spp_conf.img_size = default(spp.img_width, int(img_pixels**0.5))
spp_conf.img_size_y = img_pixels / spp_conf.img_size
config_assert(spp_conf.img_size * spp_conf.img_size_y == img_pixels,
"Incorrect input image size %d for input image pixels %d" %
(spp_conf.img_size, img_pixels))
def parse_image(image, input_layer_name, image_conf): def parse_image(image, input_layer_name, image_conf):
image_conf.channels = image.channels image_conf.channels = image.channels
image_pixels = g_layer_map[input_layer_name].size / image_conf.channels image_conf.img_size, image_conf.img_size_y = \
image_conf.img_size = int(image_pixels**0.5) get_img_size(input_layer_name, image_conf.channels)
config_assert((image_conf.img_size**2) == image_pixels,
"Incorrect input image size %d for input image pixels %d" %
(image_conf.img_size, image_pixels))
def parse_norm(norm, input_layer_name, norm_conf): def parse_norm(norm, input_layer_name, norm_conf):
...@@ -1162,24 +1152,18 @@ def parse_norm(norm, input_layer_name, norm_conf): ...@@ -1162,24 +1152,18 @@ def parse_norm(norm, input_layer_name, norm_conf):
norm_conf.pow = norm.pow norm_conf.pow = norm.pow
norm_conf.blocked = norm.blocked norm_conf.blocked = norm.blocked
img_pixels = g_layer_map[input_layer_name].size / norm.channels norm_conf.img_size, norm_conf.img_size_y = \
norm_conf.img_size = int(img_pixels**0.5) get_img_size(input_layer_name, norm.channels)
config_assert((norm_conf.img_size**2) == img_pixels,
"Incorrect input image size %d for input image pixels %d" %
(norm_conf.img_size, img_pixels))
norm_conf.output_x = norm_conf.img_size norm_conf.output_x = norm_conf.img_size
norm_conf.output_y = norm_conf.img_size_y
if norm.norm_type in ['cmrnorm-projection']: if norm.norm_type in ['cmrnorm-projection']:
norm_conf.scale /= norm.size norm_conf.scale /= norm.size
else: else:
norm_conf.scale /= norm.size**2 norm_conf.scale /= norm.size**2
''' #caffe_mode: compute the output size using floor instead of ceil,
caffe_mode: compute the output size using floor instead of ceil, # which is consistent with caffe and CuDNN's convention.
which is consistent with caffe and CuDNN's convention.
'''
def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False): def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False):
conv_conf.filter_size = conv.filter_size conv_conf.filter_size = conv.filter_size
conv_conf.filter_size_y = conv.filter_size_y conv_conf.filter_size_y = conv.filter_size_y
...@@ -1193,33 +1177,24 @@ def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False): ...@@ -1193,33 +1177,24 @@ def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False):
if not trans: if not trans:
conv_conf.filter_channels = conv.channels / conv.groups conv_conf.filter_channels = conv.channels / conv.groups
conv_conf.img_size, conv_conf.img_size_y = \
img_pixels = g_layer_map[input_layer_name].size / conv.channels get_img_size(input_layer_name, conv.channels)
print('channels=%d size=%d' % (conv.channels,
g_layer_map[input_layer_name].size))
conv_conf.img_size = int(img_pixels**0.5)
config_assert((conv_conf.img_size**2) == img_pixels, (
"Input layer %s: Incorrect input image size %d for input " +
"image pixels %d") %
(input_layer_name, conv_conf.img_size, img_pixels))
conv_conf.output_x = cnn_output_size( conv_conf.output_x = cnn_output_size(
conv_conf.img_size, conv_conf.filter_size, conv_conf.padding, conv_conf.img_size, conv_conf.filter_size, conv_conf.padding,
conv_conf.stride, conv_conf.caffe_mode) conv_conf.stride, conv_conf.caffe_mode)
conv_conf.output_y = cnn_output_size(
conv_conf.img_size_y, conv_conf.filter_size_y, conv_conf.padding_y,
conv_conf.stride_y, conv_conf.caffe_mode)
else: else:
conv_conf.filter_channels = num_filters / conv.groups conv_conf.filter_channels = num_filters / conv.groups
conv_conf.output_x, conv_conf.output_y = \
outputSize = g_layer_map[input_layer_name].size / conv.channels get_img_size(input_layer_name, conv.channels)
print('channels=%d size=%d' % (conv.channels,
g_layer_map[input_layer_name].size))
conv_conf.output_x = int(outputSize**0.5)
config_assert((conv_conf.output_x**2) == outputSize, (
"Input layer %s: Incorrect input image size %d for input " +
"image pixels %d") %
(input_layer_name, conv_conf.output_x, outputSize))
conv_conf.img_size = cnn_image_size( conv_conf.img_size = cnn_image_size(
conv_conf.output_x, conv_conf.filter_size, conv_conf.padding, conv_conf.output_x, conv_conf.filter_size, conv_conf.padding,
conv_conf.stride, conv_conf.caffe_mode) conv_conf.stride, conv_conf.caffe_mode)
conv_conf.img_size_y = cnn_image_size(
conv_conf.output_y, conv_conf.filter_size_y, conv_conf.padding_y,
conv_conf.stride_y, conv_conf.caffe_mode)
def parse_block_expand(block_expand, input_layer_name, block_expand_conf): def parse_block_expand(block_expand, input_layer_name, block_expand_conf):
...@@ -1248,10 +1223,8 @@ def parse_block_expand(block_expand, input_layer_name, block_expand_conf): ...@@ -1248,10 +1223,8 @@ def parse_block_expand(block_expand, input_layer_name, block_expand_conf):
def parse_maxout(maxout, input_layer_name, maxout_conf): def parse_maxout(maxout, input_layer_name, maxout_conf):
maxout_conf.channels = maxout.channels parse_image(maxout, input_layer_name, maxout_conf.image_conf)
maxout_conf.groups = maxout.groups maxout_conf.groups = maxout.groups
maxout_conf.img_size_x = maxout.img_size_x
maxout_conf.img_size_y = maxout.img_size_y
# Define an evaluator # Define an evaluator
...@@ -1378,6 +1351,12 @@ class LayerBase(object): ...@@ -1378,6 +1351,12 @@ class LayerBase(object):
g_current_submodel.layer_names.append(self.config.name) g_current_submodel.layer_names.append(self.config.name)
if self.config.type != 'data' and g_pass_height_width:
height = self.get_input_layer(0).height
width = self.get_input_layer(0).width
if height and width:
self.set_layer_height_width(height, width)
def get_input_layer(self, input_index): def get_input_layer(self, input_index):
return g_layer_map[self.config.inputs[input_index].input_layer_name] return g_layer_map[self.config.inputs[input_index].input_layer_name]
...@@ -1495,6 +1474,23 @@ class LayerBase(object): ...@@ -1495,6 +1474,23 @@ class LayerBase(object):
'Different inputs result in' + 'Different inputs result in' +
'different layer size at layer %s' % self.config.name) 'different layer size at layer %s' % self.config.name)
def set_layer_height_width(self, height, width):
self.config.height = height
self.config.width = width
def set_cnn_layer(self,
input_layer_name,
height,
width,
channels,
is_print=True):
size = height * width * channels
self.set_layer_size(size)
self.set_layer_height_width(height, width)
if is_print:
print("output for %s: c = %d, h = %d, w = %d, size = %d" %
(input_layer_name, channels, height, width, size))
@config_layer('multi_class_cross_entropy_with_selfnorm') @config_layer('multi_class_cross_entropy_with_selfnorm')
class MultiClassCrossEntropySelfNormCostLayer(LayerBase): class MultiClassCrossEntropySelfNormCostLayer(LayerBase):
...@@ -1584,9 +1580,11 @@ class PrintLayer(LayerBase): ...@@ -1584,9 +1580,11 @@ class PrintLayer(LayerBase):
@config_layer('data') @config_layer('data')
class DataLayer(LayerBase): class DataLayer(LayerBase):
def __init__(self, name, size, device=None): def __init__(self, name, size, height=None, width=None, device=None):
super(DataLayer, self).__init__( super(DataLayer, self).__init__(
name, 'data', size, inputs=[], device=device) name, 'data', size, inputs=[], device=device)
if height and width:
self.set_layer_height_width(height, width)
''' '''
...@@ -1685,14 +1683,13 @@ class ConvLayerBase(LayerBase): ...@@ -1685,14 +1683,13 @@ class ConvLayerBase(LayerBase):
for input_index in xrange(len(self.inputs)): for input_index in xrange(len(self.inputs)):
input_layer = self.get_input_layer(input_index) input_layer = self.get_input_layer(input_index)
parse_conv(self.inputs[input_index].conv, input_layer.name,
self.config.inputs[input_index].conv_conf, num_filters)
conv_conf = self.config.inputs[input_index].conv_conf conv_conf = self.config.inputs[input_index].conv_conf
parse_conv(self.inputs[input_index].conv, input_layer.name,
conv_conf, num_filters)
psize = self.calc_parameter_size(conv_conf) psize = self.calc_parameter_size(conv_conf)
print("output size for %s is %d " % (name, conv_conf.output_x))
self.create_input_parameter(input_index, psize) self.create_input_parameter(input_index, psize)
self.set_layer_size( self.set_cnn_layer(name, conv_conf.output_y, conv_conf.output_x,
(conv_conf.output_x**2) * self.config.num_filters) self.config.num_filters)
psize = self.config.size psize = self.config.size
if shared_biases: if shared_biases:
...@@ -1779,10 +1776,11 @@ class NormLayer(LayerBase): ...@@ -1779,10 +1776,11 @@ class NormLayer(LayerBase):
name, 'norm', 0, inputs=inputs, device=device) name, 'norm', 0, inputs=inputs, device=device)
for input_index in xrange(len(self.inputs)): for input_index in xrange(len(self.inputs)):
input_layer = self.get_input_layer(input_index) input_layer = self.get_input_layer(input_index)
parse_norm(self.inputs[input_index].norm, input_layer.name,
self.config.inputs[input_index].norm_conf)
norm_conf = self.config.inputs[input_index].norm_conf norm_conf = self.config.inputs[input_index].norm_conf
self.set_layer_size((norm_conf.output_x**2) * norm_conf.channels) parse_norm(self.inputs[input_index].norm, input_layer.name,
norm_conf)
self.set_cnn_layer(name, norm_conf.output_y, norm_conf.output_x,
norm_conf.channels, False)
@config_layer('pool') @config_layer('pool')
...@@ -1792,13 +1790,11 @@ class PoolLayer(LayerBase): ...@@ -1792,13 +1790,11 @@ class PoolLayer(LayerBase):
name, 'pool', 0, inputs=inputs, device=device) name, 'pool', 0, inputs=inputs, device=device)
for input_index in xrange(len(self.inputs)): for input_index in xrange(len(self.inputs)):
input_layer = self.get_input_layer(input_index) input_layer = self.get_input_layer(input_index)
parse_pool(self.inputs[input_index].pool, input_layer.name,
self.config.inputs[input_index].pool_conf)
pool_conf = self.config.inputs[input_index].pool_conf pool_conf = self.config.inputs[input_index].pool_conf
print("output size for %s is %d*%d " % (name, pool_conf.output_y, parse_pool(self.inputs[input_index].pool, input_layer.name,
pool_conf.output_x)) pool_conf)
self.set_layer_size( self.set_cnn_layer(name, pool_conf.output_y, pool_conf.output_x,
(pool_conf.output_x * pool_conf.output_y) * pool_conf.channels) pool_conf.channels)
@config_layer('spp') @config_layer('spp')
...@@ -1808,12 +1804,10 @@ class SpatialPyramidPoolLayer(LayerBase): ...@@ -1808,12 +1804,10 @@ class SpatialPyramidPoolLayer(LayerBase):
name, 'spp', 0, inputs=inputs, device=device) name, 'spp', 0, inputs=inputs, device=device)
for input_index in xrange(len(self.inputs)): for input_index in xrange(len(self.inputs)):
input_layer = self.get_input_layer(input_index) input_layer = self.get_input_layer(input_index)
parse_spp(self.inputs[input_index].spp, input_layer.name,
self.config.inputs[input_index].spp_conf)
spp_conf = self.config.inputs[input_index].spp_conf spp_conf = self.config.inputs[input_index].spp_conf
output_size = (pow(4, spp_conf.pyramid_height) - 1) / (4 - 1) parse_spp(self.inputs[input_index].spp, input_layer.name, spp_conf)
print("output size for %s is %d " % (name, output_size)) output_x = (pow(4, spp_conf.pyramid_height) - 1) / (4 - 1)
self.set_layer_size(output_size * spp_conf.channels) self.set_cnn_layer(name, 1, output_x, spp_conf.image_conf.channels)
@config_layer('batch_norm') @config_layer('batch_norm')
...@@ -1875,10 +1869,10 @@ class BatchNormLayer(LayerBase): ...@@ -1875,10 +1869,10 @@ class BatchNormLayer(LayerBase):
self.config.moving_average_fraction = moving_average_fraction self.config.moving_average_fraction = moving_average_fraction
input_layer = self.get_input_layer(0) input_layer = self.get_input_layer(0)
parse_image(self.inputs[0].image, input_layer.name,
self.config.inputs[0].image_conf)
image_conf = self.config.inputs[0].image_conf image_conf = self.config.inputs[0].image_conf
self.set_layer_size((image_conf.img_size**2) * image_conf.channels) parse_image(self.inputs[0].image, input_layer.name, image_conf)
self.set_cnn_layer(name, image_conf.img_size_y, image_conf.img_size,
image_conf.channels)
psize = self.calc_parameter_size(image_conf) psize = self.calc_parameter_size(image_conf)
dims = [1, psize] dims = [1, psize]
...@@ -1936,11 +1930,11 @@ class MaxOutLayer(LayerBase): ...@@ -1936,11 +1930,11 @@ class MaxOutLayer(LayerBase):
super(MaxOutLayer, self).__init__( super(MaxOutLayer, self).__init__(
name, 'maxout', 0, inputs=inputs, **xargs) name, 'maxout', 0, inputs=inputs, **xargs)
input_layer = self.get_input_layer(0) input_layer = self.get_input_layer(0)
parse_maxout(self.inputs[0].maxout, input_layer.name,
self.config.inputs[0].maxout_conf)
maxout_conf = self.config.inputs[0].maxout_conf maxout_conf = self.config.inputs[0].maxout_conf
self.set_layer_size(g_layer_map[input_layer.name].size / parse_maxout(self.inputs[0].maxout, input_layer.name, maxout_conf)
maxout_conf.groups) out_channels = maxout_conf.image_conf.channels / maxout_conf.groups
self.set_cnn_layer(name, g_layer_map[input_layer.name].height,
g_layer_map[input_layer.name].width, out_channels)
# key: cost type # key: cost type
...@@ -2520,11 +2514,10 @@ class BilinearInterpLayer(LayerBase): ...@@ -2520,11 +2514,10 @@ class BilinearInterpLayer(LayerBase):
super(BilinearInterpLayer, self).__init__( super(BilinearInterpLayer, self).__init__(
name, 'bilinear_interp', 0, inputs=inputs, **xargs) name, 'bilinear_interp', 0, inputs=inputs, **xargs)
input_layer = self.get_input_layer(0) input_layer = self.get_input_layer(0)
parse_bilinear(self.inputs[0].bilinear_interp, input_layer.name, conf = self.config.inputs[0].bilinear_interp_conf
self.config.inputs[0].bilinear_interp_conf) parse_bilinear(self.inputs[0].bilinear_interp, input_layer.name, conf)
conf = self.inputs[0].bilinear_interp self.set_cnn_layer(name, conf.out_size_y, conf.out_size_x,
self.set_layer_size(conf.out_size_x * conf.out_size_y * conf.image_conf.channels)
conf.num_channels)
@config_layer('sum_to_one_norm') @config_layer('sum_to_one_norm')
...@@ -2997,6 +2990,8 @@ class CTCLayer(LayerBase): ...@@ -2997,6 +2990,8 @@ class CTCLayer(LayerBase):
@config_layer('recurrent_layer_group') @config_layer('recurrent_layer_group')
class RecurrentLayerGroup(LayerBase): class RecurrentLayerGroup(LayerBase):
def __init__(self, name, device=None): def __init__(self, name, device=None):
global g_pass_height_width
g_pass_height_width = False
super(RecurrentLayerGroup, self).__init__( super(RecurrentLayerGroup, self).__init__(
name, 'recurrent_layer_group', 0, inputs=[], device=device) name, 'recurrent_layer_group', 0, inputs=[], device=device)
......
...@@ -766,7 +766,7 @@ def mixed_layer(size=0, ...@@ -766,7 +766,7 @@ def mixed_layer(size=0,
@layer_support() @layer_support()
def data_layer(name, size, layer_attr=None): def data_layer(name, size, height=None, width=None, layer_attr=None):
""" """
Define DataLayer For NeuralNetwork. Define DataLayer For NeuralNetwork.
...@@ -781,6 +781,10 @@ def data_layer(name, size, layer_attr=None): ...@@ -781,6 +781,10 @@ def data_layer(name, size, layer_attr=None):
:type name: basestring :type name: basestring
:param size: Size of this data layer. :param size: Size of this data layer.
:type size: int :type size: int
:param height: Height of this data layer, used for image
:type height: int|None
:param width: Width of this data layer, used for image
:type width: int|None
:param layer_attr: Extra Layer Attribute. :param layer_attr: Extra Layer Attribute.
:type layer_attr: ExtraLayerAttribute. :type layer_attr: ExtraLayerAttribute.
:return: LayerOutput object. :return: LayerOutput object.
...@@ -790,6 +794,8 @@ def data_layer(name, size, layer_attr=None): ...@@ -790,6 +794,8 @@ def data_layer(name, size, layer_attr=None):
type=LayerType.DATA, type=LayerType.DATA,
name=name, name=name,
size=size, size=size,
height=height,
width=width,
**ExtraLayerAttribute.to_kwargs(layer_attr)) **ExtraLayerAttribute.to_kwargs(layer_attr))
return LayerOutput(name, LayerType.DATA, size=size) return LayerOutput(name, LayerType.DATA, size=size)
...@@ -1483,7 +1489,7 @@ def bilinear_interp_layer(input, ...@@ -1483,7 +1489,7 @@ def bilinear_interp_layer(input,
bilinear_interp=BilinearInterp( bilinear_interp=BilinearInterp(
out_size_x=out_size_x, out_size_x=out_size_x,
out_size_y=out_size_y, out_size_y=out_size_y,
num_channels=num_channels)), channels=num_channels)),
type=LayerType.BILINEAR_INTERP_LAYER, type=LayerType.BILINEAR_INTERP_LAYER,
**ExtraLayerAttribute.to_kwargs(layer_attr)) **ExtraLayerAttribute.to_kwargs(layer_attr))
return LayerOutput( return LayerOutput(
...@@ -1923,8 +1929,7 @@ def img_pool_layer(input, ...@@ -1923,8 +1929,7 @@ def img_pool_layer(input,
layer_attr=None, layer_attr=None,
pool_size_y=None, pool_size_y=None,
stride_y=None, stride_y=None,
padding_y=None, padding_y=None):
img_width=None):
""" """
Image pooling Layer. Image pooling Layer.
...@@ -1955,9 +1960,6 @@ def img_pool_layer(input, ...@@ -1955,9 +1960,6 @@ def img_pool_layer(input,
:type stride_y: int|None :type stride_y: int|None
:param layer_attr: Extra Layer attribute. :param layer_attr: Extra Layer attribute.
:type layer_attr: ExtraLayerAttribute :type layer_attr: ExtraLayerAttribute
:param img_width: the width of input feature map. If it is None, the input feature
map should be square.
:type img_width: int|None
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
""" """
...@@ -1993,8 +1995,7 @@ def img_pool_layer(input, ...@@ -1993,8 +1995,7 @@ def img_pool_layer(input,
padding=padding, padding=padding,
size_y=pool_size_y, size_y=pool_size_y,
stride_y=stride_y, stride_y=stride_y,
padding_y=padding_y, padding_y=padding_y))
img_width=img_width))
], ],
**ExtraLayerAttribute.to_kwargs(layer_attr)) **ExtraLayerAttribute.to_kwargs(layer_attr))
return LayerOutput( return LayerOutput(
...@@ -2012,7 +2013,6 @@ def spp_layer(input, ...@@ -2012,7 +2013,6 @@ def spp_layer(input,
num_channels=None, num_channels=None,
pool_type=None, pool_type=None,
pyramid_height=None, pyramid_height=None,
img_width=None,
layer_attr=None): layer_attr=None):
""" """
Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition. Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition.
...@@ -2029,9 +2029,6 @@ def spp_layer(input, ...@@ -2029,9 +2029,6 @@ def spp_layer(input,
:type scale: BasePoolingType :type scale: BasePoolingType
:param pyramid_height: pyramid height. :param pyramid_height: pyramid height.
:type pyramid_height: int :type pyramid_height: int
:param img_width: the width of input feature map. If it is None, the input feature
map should be square.
:type img_width: int|None
:param layer_attr: Extra Layer Attribute. :param layer_attr: Extra Layer Attribute.
:type layer_attr: ExtraLayerAttribute :type layer_attr: ExtraLayerAttribute
:return: LayerOutput object. :return: LayerOutput object.
...@@ -2058,8 +2055,7 @@ def spp_layer(input, ...@@ -2058,8 +2055,7 @@ def spp_layer(input,
spp=SpatialPyramidPool( spp=SpatialPyramidPool(
pool_type=type_name, pool_type=type_name,
channels=num_channels, channels=num_channels,
pyramid_height=pyramid_height, pyramid_height=pyramid_height)),
img_width=img_width)),
**ExtraLayerAttribute.to_kwargs(layer_attr)) **ExtraLayerAttribute.to_kwargs(layer_attr))
return LayerOutput( return LayerOutput(
name, name,
......
...@@ -26,11 +26,15 @@ layers { ...@@ -26,11 +26,15 @@ layers {
filter_size_y: 32 filter_size_y: 32
padding_y: 1 padding_y: 1
stride_y: 1 stride_y: 1
output_y: 227
img_size_y: 256
} }
} }
bias_parameter_name: "___conv_0__.wbias" bias_parameter_name: "___conv_0__.wbias"
num_filters: 64 num_filters: 64
shared_biases: true shared_biases: true
height: 227
width: 227
} }
layers { layers {
name: "__batch_norm_0__" name: "__batch_norm_0__"
...@@ -43,6 +47,7 @@ layers { ...@@ -43,6 +47,7 @@ layers {
image_conf { image_conf {
channels: 64 channels: 64
img_size: 227 img_size: 227
img_size_y: 227
} }
} }
inputs { inputs {
...@@ -55,6 +60,8 @@ layers { ...@@ -55,6 +60,8 @@ layers {
} }
bias_parameter_name: "___batch_norm_0__.wbias" bias_parameter_name: "___batch_norm_0__.wbias"
moving_average_fraction: 0.9 moving_average_fraction: 0.9
height: 227
width: 227
} }
layers { layers {
name: "__crmnorm_0__" name: "__crmnorm_0__"
...@@ -72,8 +79,12 @@ layers { ...@@ -72,8 +79,12 @@ layers {
output_x: 227 output_x: 227
img_size: 227 img_size: 227
blocked: false blocked: false
output_y: 227
img_size_y: 227
} }
} }
height: 227
width: 227
} }
layers { layers {
name: "__pool_0__" name: "__pool_0__"
...@@ -97,6 +108,8 @@ layers { ...@@ -97,6 +108,8 @@ layers {
padding_y: 0 padding_y: 0
} }
} }
height: 196
width: 196
} }
parameters { parameters {
name: "___conv_0__.w0" name: "___conv_0__.w0"
......
...@@ -26,6 +26,8 @@ layers { ...@@ -26,6 +26,8 @@ layers {
filter_size_y: 32 filter_size_y: 32
padding_y: 1 padding_y: 1
stride_y: 1 stride_y: 1
output_y: 227
img_size_y: 256
} }
} }
bias_parameter_name: "___conv_0__.wbias" bias_parameter_name: "___conv_0__.wbias"
...@@ -43,6 +45,7 @@ layers { ...@@ -43,6 +45,7 @@ layers {
image_conf { image_conf {
channels: 64 channels: 64
img_size: 256 img_size: 256
img_size_y: 256
} }
} }
inputs { inputs {
...@@ -55,6 +58,8 @@ layers { ...@@ -55,6 +58,8 @@ layers {
} }
bias_parameter_name: "___batch_norm_0__.wbias" bias_parameter_name: "___batch_norm_0__.wbias"
moving_average_fraction: 0.9 moving_average_fraction: 0.9
height: 256
width: 256
} }
layers { layers {
name: "__crmnorm_0__" name: "__crmnorm_0__"
...@@ -72,8 +77,12 @@ layers { ...@@ -72,8 +77,12 @@ layers {
output_x: 256 output_x: 256
img_size: 256 img_size: 256
blocked: false blocked: false
output_y: 256
img_size_y: 256
} }
} }
height: 256
width: 256
} }
layers { layers {
name: "__pool_0__" name: "__pool_0__"
...@@ -97,6 +106,8 @@ layers { ...@@ -97,6 +106,8 @@ layers {
padding_y: 0 padding_y: 0
} }
} }
height: 225
width: 225
} }
parameters { parameters {
name: "___conv_0__.w0" name: "___conv_0__.w0"
......
...@@ -177,6 +177,8 @@ layers { ...@@ -177,6 +177,8 @@ layers {
filter_size_y: 3 filter_size_y: 3
padding_y: 0 padding_y: 0
stride_y: 1 stride_y: 1
output_y: 30
img_size_y: 32
} }
num_filters: 64 num_filters: 64
} }
......
...@@ -26,11 +26,15 @@ layers { ...@@ -26,11 +26,15 @@ layers {
filter_size_y: 3 filter_size_y: 3
padding_y: 1 padding_y: 1
stride_y: 1 stride_y: 1
output_y: 48
img_size_y: 48
} }
} }
bias_parameter_name: "___conv_0__.wbias" bias_parameter_name: "___conv_0__.wbias"
num_filters: 16 num_filters: 16
shared_biases: true shared_biases: true
height: 48
width: 48
} }
layers { layers {
name: "__bilinear_interp_layer_0__" name: "__bilinear_interp_layer_0__"
...@@ -40,11 +44,17 @@ layers { ...@@ -40,11 +44,17 @@ layers {
inputs { inputs {
input_layer_name: "__conv_0__" input_layer_name: "__conv_0__"
bilinear_interp_conf { bilinear_interp_conf {
image_conf {
channels: 16
img_size: 48
img_size_y: 48
}
out_size_x: 64 out_size_x: 64
out_size_y: 64 out_size_y: 64
num_channels: 16
} }
} }
height: 64
width: 64
} }
layers { layers {
name: "__pool_0__" name: "__pool_0__"
...@@ -55,19 +65,21 @@ layers { ...@@ -55,19 +65,21 @@ layers {
input_layer_name: "__bilinear_interp_layer_0__" input_layer_name: "__bilinear_interp_layer_0__"
pool_conf { pool_conf {
pool_type: "max-projection" pool_type: "max-projection"
channels: 4 channels: 16
size_x: 2 size_x: 2
stride: 2 stride: 2
output_x: 64 output_x: 32
img_size: 128 img_size: 64
padding: 0 padding: 0
size_y: 2 size_y: 2
stride_y: 2 stride_y: 2
output_y: 64 output_y: 32
img_size_y: 128 img_size_y: 64
padding_y: 0 padding_y: 0
} }
} }
height: 32
width: 32
} }
layers { layers {
name: "__fc_layer_0__" name: "__fc_layer_0__"
...@@ -78,6 +90,8 @@ layers { ...@@ -78,6 +90,8 @@ layers {
input_layer_name: "__pool_0__" input_layer_name: "__pool_0__"
input_parameter_name: "___fc_layer_0__.w0" input_parameter_name: "___fc_layer_0__.w0"
} }
height: 32
width: 32
} }
parameters { parameters {
name: "___conv_0__.w0" name: "___conv_0__.w0"
......
...@@ -4,6 +4,8 @@ layers { ...@@ -4,6 +4,8 @@ layers {
type: "data" type: "data"
size: 2304 size: 2304
active_type: "" active_type: ""
height: 48
width: 48
} }
layers { layers {
name: "__conv_0__" name: "__conv_0__"
...@@ -26,11 +28,15 @@ layers { ...@@ -26,11 +28,15 @@ layers {
filter_size_y: 3 filter_size_y: 3
padding_y: 1 padding_y: 1
stride_y: 1 stride_y: 1
output_y: 48
img_size_y: 48
} }
} }
bias_parameter_name: "___conv_0__.wbias" bias_parameter_name: "___conv_0__.wbias"
num_filters: 16 num_filters: 16
shared_biases: true shared_biases: true
height: 48
width: 48
} }
layers { layers {
name: "__maxout_layer_0__" name: "__maxout_layer_0__"
...@@ -40,12 +46,16 @@ layers { ...@@ -40,12 +46,16 @@ layers {
inputs { inputs {
input_layer_name: "__conv_0__" input_layer_name: "__conv_0__"
maxout_conf { maxout_conf {
image_conf {
channels: 16 channels: 16
img_size: 48
img_size_y: 48
}
groups: 2 groups: 2
img_size_x: 0
img_size_y: 0
} }
} }
height: 48
width: 48
} }
layers { layers {
name: "__pool_0__" name: "__pool_0__"
...@@ -69,48 +79,58 @@ layers { ...@@ -69,48 +79,58 @@ layers {
padding_y: 0 padding_y: 0
} }
} }
height: 24
width: 24
} }
layers { layers {
name: "__conv_1__" name: "__conv_1__"
type: "exconv" type: "exconv"
size: 18432 size: 73728
active_type: "" active_type: ""
inputs { inputs {
input_layer_name: "__pool_0__" input_layer_name: "__pool_0__"
input_parameter_name: "___conv_1__.w0" input_parameter_name: "___conv_1__.w0"
conv_conf { conv_conf {
filter_size: 3 filter_size: 3
channels: 32 channels: 8
stride: 1 stride: 1
padding: 1 padding: 1
groups: 1 groups: 1
filter_channels: 32 filter_channels: 8
output_x: 12 output_x: 24
img_size: 12 img_size: 24
caffe_mode: true caffe_mode: true
filter_size_y: 3 filter_size_y: 3
padding_y: 1 padding_y: 1
stride_y: 1 stride_y: 1
output_y: 24
img_size_y: 24
} }
} }
bias_parameter_name: "___conv_1__.wbias" bias_parameter_name: "___conv_1__.wbias"
num_filters: 128 num_filters: 128
shared_biases: true shared_biases: true
height: 24
width: 24
} }
layers { layers {
name: "__maxout_layer_1__" name: "__maxout_layer_1__"
type: "maxout" type: "maxout"
size: 9216 size: 18432
active_type: "" active_type: ""
inputs { inputs {
input_layer_name: "__conv_0__" input_layer_name: "__conv_1__"
maxout_conf { maxout_conf {
image_conf {
channels: 128 channels: 128
img_size: 24
img_size_y: 24
}
groups: 4 groups: 4
img_size_x: 0
img_size_y: 0
} }
} }
height: 24
width: 24
} }
layers { layers {
name: "__block_expand_layer_0__" name: "__block_expand_layer_0__"
...@@ -118,7 +138,7 @@ layers { ...@@ -118,7 +138,7 @@ layers {
size: 192 size: 192
active_type: "" active_type: ""
inputs { inputs {
input_layer_name: "__maxout_layer_0__" input_layer_name: "__maxout_layer_1__"
block_expand_conf { block_expand_conf {
channels: 32 channels: 32
stride_x: 1 stride_x: 1
...@@ -133,6 +153,8 @@ layers { ...@@ -133,6 +153,8 @@ layers {
img_size_y: 0 img_size_y: 0
} }
} }
height: 24
width: 24
} }
layers { layers {
name: "__fc_layer_0__" name: "__fc_layer_0__"
...@@ -143,6 +165,8 @@ layers { ...@@ -143,6 +165,8 @@ layers {
input_layer_name: "__block_expand_layer_0__" input_layer_name: "__block_expand_layer_0__"
input_parameter_name: "___fc_layer_0__.w0" input_parameter_name: "___fc_layer_0__.w0"
} }
height: 24
width: 24
} }
parameters { parameters {
name: "___conv_0__.w0" name: "___conv_0__.w0"
...@@ -164,9 +188,9 @@ parameters { ...@@ -164,9 +188,9 @@ parameters {
} }
parameters { parameters {
name: "___conv_1__.w0" name: "___conv_1__.w0"
size: 36864 size: 9216
initial_mean: 0.0 initial_mean: 0.0
initial_std: 0.0833333333333 initial_std: 0.166666666667
initial_strategy: 0 initial_strategy: 0
initial_smart: false initial_smart: false
} }
......
...@@ -4,6 +4,8 @@ layers { ...@@ -4,6 +4,8 @@ layers {
type: "data" type: "data"
size: 3200 size: 3200
active_type: "" active_type: ""
height: 20
width: 10
} }
layers { layers {
name: "__spp_0__" name: "__spp_0__"
...@@ -13,13 +15,17 @@ layers { ...@@ -13,13 +15,17 @@ layers {
inputs { inputs {
input_layer_name: "data" input_layer_name: "data"
spp_conf { spp_conf {
pool_type: "max-projection" image_conf {
pyramid_height: 2
channels: 16 channels: 16
img_size: 10 img_size: 10
img_size_y: 20 img_size_y: 20
} }
pool_type: "max-projection"
pyramid_height: 2
}
} }
height: 1
width: 5
} }
input_layer_names: "data" input_layer_names: "data"
output_layer_names: "__spp_0__" output_layer_names: "__spp_0__"
......
...@@ -17,7 +17,7 @@ bilinear = bilinear_interp_layer(input=conv, out_size_x=64, out_size_y=64) ...@@ -17,7 +17,7 @@ bilinear = bilinear_interp_layer(input=conv, out_size_x=64, out_size_y=64)
pool = img_pool_layer( pool = img_pool_layer(
input=bilinear, input=bilinear,
num_channels=4, num_channels=16,
pool_size=2, pool_size=2,
stride=2, stride=2,
pool_type=MaxPooling()) pool_type=MaxPooling())
......
...@@ -2,7 +2,7 @@ from paddle.trainer_config_helpers import * ...@@ -2,7 +2,7 @@ from paddle.trainer_config_helpers import *
settings(batch_size=1000, learning_rate=1e-5) settings(batch_size=1000, learning_rate=1e-5)
data = data_layer(name='data', size=2304) data = data_layer(name='data', size=2304, height=48, width=48)
conv = img_conv_layer( conv = img_conv_layer(
input=data, input=data,
...@@ -21,16 +21,21 @@ pool = img_pool_layer( ...@@ -21,16 +21,21 @@ pool = img_pool_layer(
conv2 = img_conv_layer( conv2 = img_conv_layer(
input=pool, input=pool,
filter_size=3, filter_size=3,
num_channels=32, num_channels=8,
num_filters=128, num_filters=128,
padding=1, padding=1,
act=LinearActivation(), act=LinearActivation(),
bias_attr=True) bias_attr=True)
maxout2 = maxout_layer(input=conv, num_channels=128, groups=4) maxout2 = maxout_layer(input=conv2, num_channels=128, groups=4)
block = block_expand_layer( block = block_expand_layer(
input=maxout, num_channels=32, stride_x=1, stride_y=1, block_x=1, block_y=6) input=maxout2,
num_channels=32,
stride_x=1,
stride_y=1,
block_x=1,
block_y=6)
fc = fc_layer(input=block, size=384, bias_attr=False) fc = fc_layer(input=block, size=384, bias_attr=False)
......
...@@ -2,13 +2,9 @@ from paddle.trainer_config_helpers import * ...@@ -2,13 +2,9 @@ from paddle.trainer_config_helpers import *
settings(batch_size=100, learning_rate=1e-5) settings(batch_size=100, learning_rate=1e-5)
data = data_layer(name='data', size=3200) data = data_layer(name='data', size=3200, height=20, width=10)
spp = spp_layer( spp = spp_layer(
input=data, input=data, pyramid_height=2, num_channels=16, pool_type=MaxPooling())
pyramid_height=2,
num_channels=16,
pool_type=MaxPooling(),
img_width=10)
outputs(spp) outputs(spp)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册