From 496d64ebdb3f0c81f4684b9f48fdef90f9d94547 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Wed, 16 Nov 2016 17:53:31 +0800 Subject: [PATCH] Support rectangle input for CNN --- paddle/gserver/layers/BatchNormBaseLayer.cpp | 13 +- paddle/gserver/layers/BatchNormBaseLayer.h | 5 +- paddle/gserver/layers/BilinearInterpLayer.cpp | 6 +- paddle/gserver/layers/ConvBaseLayer.cpp | 15 +- paddle/gserver/layers/ConvOperator.cpp | 13 +- paddle/gserver/layers/ConvProjection.cpp | 2 +- paddle/gserver/layers/DataLayer.cpp | 4 +- paddle/gserver/layers/ExpandConvBaseLayer.cpp | 30 +-- paddle/gserver/layers/MaxOutLayer.cpp | 6 +- paddle/gserver/layers/NormLayer.cpp | 3 + paddle/gserver/layers/NormLayer.h | 2 +- paddle/gserver/layers/NormProjectionLayer.cpp | 2 +- .../layers/SpatialPyramidPoolLayer.cpp | 13 +- paddle/gserver/tests/img_pool_a.conf | 2 - paddle/gserver/tests/test_LayerGrad.cpp | 63 +++-- paddle/parameter/Argument.cpp | 2 + paddle/trainer/tests/test_config.conf | 1 - proto/ModelConfig.proto.m4 | 40 +-- python/paddle/trainer/config_parser.py | 238 +++++++++--------- .../paddle/trainer_config_helpers/layers.py | 26 +- .../configs/protostr/img_layers.protostr | 13 + .../protostr/img_trans_layers.protostr | 11 + .../configs/protostr/projections.protostr | 2 + .../protostr/test_bilinear_interp.protostr | 26 +- .../configs/protostr/test_maxout.protostr | 56 +++-- .../configs/protostr/test_spp_layer.protostr | 12 +- .../tests/configs/test_bilinear_interp.py | 2 +- .../tests/configs/test_maxout.py | 13 +- .../tests/configs/test_spp_layer.py | 8 +- 29 files changed, 360 insertions(+), 269 deletions(-) diff --git a/paddle/gserver/layers/BatchNormBaseLayer.cpp b/paddle/gserver/layers/BatchNormBaseLayer.cpp index 8052b35ec..7bf4c1fd5 100644 --- a/paddle/gserver/layers/BatchNormBaseLayer.cpp +++ b/paddle/gserver/layers/BatchNormBaseLayer.cpp @@ -61,15 +61,10 @@ bool BatchNormBaseLayer::init(const LayerMap& layerMap, void BatchNormBaseLayer::calFeatureMapSize() { const ImageConfig& conf = config_.inputs(0).image_conf(); - if (inputLayers_[0]->getOutput().getFrameHeight() == 0 && - inputLayers_[0]->getOutput().getFrameWidth() == 0) { - imgSize_ = conf.img_size(); - imageH_ = imgSize_; - imageW_ = imgSize_; - } else { - imageH_ = inputLayers_[0]->getOutput().getFrameHeight(); - imageW_ = inputLayers_[0]->getOutput().getFrameWidth(); - } + imageH_ = inputLayers_[0]->getOutput().getFrameHeight(); + imageW_ = inputLayers_[0]->getOutput().getFrameWidth(); + if (imageH_ == 0) imageH_ = conf.img_size_y(); + if (imageW_ == 0) imageW_ = conf.img_size(); imgPixels_ = imageH_ * imageW_; getOutput().setFrameHeight(imageH_); getOutput().setFrameWidth(imageW_); diff --git a/paddle/gserver/layers/BatchNormBaseLayer.h b/paddle/gserver/layers/BatchNormBaseLayer.h index 2302d1a8e..4ea493b5f 100644 --- a/paddle/gserver/layers/BatchNormBaseLayer.h +++ b/paddle/gserver/layers/BatchNormBaseLayer.h @@ -78,9 +78,8 @@ protected: MatrixPtr savedMean_; MatrixPtr savedInvVar_; - /// Height or width of input image feature, now height is equal to width. - /// imgSize is 1 if the input is fully-connected layer. - int imgSize_; + /// Height or width of input image feature. + /// Both of them are 1 if the input is a fully-connected layer. int imageH_; int imageW_; /// Height * Width. 
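[Editor's note] The hunks above drop the square-only imgSize_ in favor of independent imageH_/imageW_: prefer the frame height/width carried on the input layer's output, and fall back to the ImageConfig otherwise. The Python side of this patch applies the same inference in the new set_img_size() helper in config_parser.py. A condensed sketch of that rule, assuming a hypothetical standalone helper (infer_img_size) in place of the real g_layer_map lookup:

def infer_img_size(size, channels, width=0, height=0):
    # Prefer an explicit rectangular width/height; otherwise assume a
    # square feature map, as set_img_size() in config_parser.py does.
    img_pixels = size // channels
    img_size = width if width > 0 else int(img_pixels ** 0.5)
    img_size_y = height if height > 0 else img_pixels // img_size
    assert img_size * img_size_y == img_pixels, \
        "incorrect image size %d * %d for %d pixels" % (img_size, img_size_y, img_pixels)
    return img_size, img_size_y

# A layer of size 3200 with 16 channels and width 10 is a 10 x 20 map:
# infer_img_size(3200, 16, width=10) -> (10, 20)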
diff --git a/paddle/gserver/layers/BilinearInterpLayer.cpp b/paddle/gserver/layers/BilinearInterpLayer.cpp index ac5f87be7..64d3046b5 100644 --- a/paddle/gserver/layers/BilinearInterpLayer.cpp +++ b/paddle/gserver/layers/BilinearInterpLayer.cpp @@ -26,15 +26,15 @@ size_t BilinearInterpLayer::getSize() { const BilinearInterpConfig& conf = config_.inputs(0).bilinear_interp_conf(); if (inImgH_ == 0) { - inImgH_ = conf.img_size_y(); + inImgH_ = conf.image_conf().img_size_y(); } if (inImgW_ == 0) { - inImgW_ = conf.img_size_x(); + inImgW_ = conf.image_conf().img_size(); } outImgH_ = conf.out_size_y(); outImgW_ = conf.out_size_x(); - numChannels_ = conf.num_channels(); + numChannels_ = conf.image_conf().channels(); CHECK(outImgH_ > 0 && outImgW_ > 0); CHECK(inImgH_ > 0 && inImgW_ > 0); diff --git a/paddle/gserver/layers/ConvBaseLayer.cpp b/paddle/gserver/layers/ConvBaseLayer.cpp index 6bc3b3b80..8f358a5e4 100644 --- a/paddle/gserver/layers/ConvBaseLayer.cpp +++ b/paddle/gserver/layers/ConvBaseLayer.cpp @@ -37,11 +37,13 @@ bool ConvBaseLayer::init(const LayerMap& layerMap, filterSizeY_.push_back(conf.filter_size_y()); filterPixels_.push_back(filterSize_.back() * filterSizeY_.back()); channels_.push_back(conf.channels()); - imgSizeH_.push_back(conf.img_size()); + imgSizeH_.push_back(conf.has_img_size_y() ? conf.img_size_y() : + conf.img_size()); imgSizeW_.push_back(conf.img_size()); groups_.push_back(conf.groups()); filterChannels_.push_back(conf.filter_channels()); - outputH_.push_back(conf.output_x()); + outputH_.push_back(conf.has_output_y() ? conf.output_y() : + conf.output_x()); outputW_.push_back(conf.output_x()); } @@ -90,11 +92,12 @@ size_t ConvBaseLayer::calOutputSize() { for (size_t i = 0; i < inputLayers_.size(); i++) { inH.push_back(inputLayers_[i]->getOutput().getFrameHeight()); inW.push_back(inputLayers_[i]->getOutput().getFrameWidth()); + const ConvConfig& conf = config_.inputs(i).conv_conf(); if (isDeconv_) { if (inH[i] == 0) - inH[i] = config_.inputs(i).conv_conf().output_x(); + inH[i] = conf.has_output_y() ? conf.output_y() : conf.output_x(); if (inW[i] == 0) - inW[i] = config_.inputs(i).conv_conf().output_x(); + inW[i] = conf.output_x(); outH.push_back( imageSize(inH[i], filterSizeY_[i], paddingY_[i], strideY_[i], caffeMode_)); @@ -103,9 +106,9 @@ size_t ConvBaseLayer::calOutputSize() { caffeMode_)); } else { if (inH[i] == 0) - inH[i] = config_.inputs(i).conv_conf().img_size(); + inH[i] = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); if (inW[i] == 0) - inW[i] = config_.inputs(i).conv_conf().img_size(); + inW[i] = conf.img_size(); outH.push_back( outputSize(inH[i], filterSizeY_[i], paddingY_[i], strideY_[i], caffeMode_)); diff --git a/paddle/gserver/layers/ConvOperator.cpp b/paddle/gserver/layers/ConvOperator.cpp index 2d9c892fe..7830efab1 100644 --- a/paddle/gserver/layers/ConvOperator.cpp +++ b/paddle/gserver/layers/ConvOperator.cpp @@ -93,9 +93,9 @@ private: bool caffeMode_; int inputOffset_, outputOffset_, weightOffset_; int numFilters_; - int padding_, stride_, filterSize_, channels_, imgSize_; + int padding_, stride_, filterSize_, channels_, imgSize_, imgSizeY_; int paddingY_, strideY_, filterSizeY_; - int imgPixels_, filterPixels_, filterChannels_, outputX_, outputs_; + int imgPixels_, filterPixels_, filterChannels_, outputX_, outputY_, outputs_; /// Following member variables are same with CudnnConvLayer. /// There is no explanation here. 
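[Editor's note] The ConvOperator hunks that follow compute each axis independently with the outputSize()/cnn_output_size() helpers: in caffe_mode the output extent is floor((2 * padding + img_size - filter_size) / stride) + 1, and ceil is used otherwise. A minimal Python rendering of that arithmetic, mirroring cnn_output_size() from config_parser.py in this patch:

import math

def cnn_output_size(img_size, filter_size, padding, stride, caffe_mode):
    # caffe_mode rounds down (the Caffe/cuDNN convention); otherwise round up.
    output = (2 * padding + img_size - filter_size) / float(stride)
    if caffe_mode:
        return 1 + int(math.floor(output))
    return 1 + int(math.ceil(output))

# The 16 x 8 input with a 2 x 2 filter, stride 2, padding 1 used by
# testConvLayer() in test_LayerGrad.cpp yields a 9 x 5 output:
# cnn_output_size(16, 2, 1, 2, True) -> 9
# cnn_output_size(8, 2, 1, 2, True)  -> 5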
@@ -144,7 +144,7 @@ void ConvOperator::allocConvWorkSpace(size_t maxWorkSpace) { void ConvOperator::reshape(int batchSize) { imageH_ = ins_[0]->getFrameHeight(); imageW_ = ins_[0]->getFrameWidth(); - if (imageH_ == 0) imageH_ = imgSize_; + if (imageH_ == 0) imageH_ = imgSizeY_; if (imageW_ == 0) imageW_ = imgSize_; outputH_ = outputSize(imageH_, filterSizeY_, paddingY_, strideY_, caffeMode_); outputW_ = outputSize(imageW_, filterSize_, padding_, stride_, caffeMode_); @@ -176,7 +176,10 @@ void ConvOperator::computeConvSizes() { hl_create_tensor_descriptor(&inputDesc_); int outputX = outputSize(imgSize_, filterSize_, padding_, stride_, caffeMode_); + int outputY = + outputSize(imgSizeY_, filterSizeY_, paddingY_, strideY_, caffeMode_); CHECK_EQ(outputX, outputX_); + CHECK_EQ(outputY, outputY_); hl_create_tensor_descriptor(&outputDesc_); hl_create_convolution_descriptor(&convDesc_, inputDesc_, filterDesc_, paddingY_, padding_, strideY_, stride_); @@ -208,10 +211,12 @@ void ConvOperator::getConvParams() { filterPixels_ = filterSize_ * filterSizeY_; channels_ = conf.channels(); imgSize_ = conf.img_size(); - imgPixels_ = imgSize_ * imgSize_; + imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); + imgPixels_ = imgSize_ * imgSizeY_; CHECK_EQ(conf.groups(), 1U); filterChannels_ = conf.filter_channels(); outputX_ = conf.output_x(); + outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x(); outputs_ = outputX_ * outputX_; } diff --git a/paddle/gserver/layers/ConvProjection.cpp b/paddle/gserver/layers/ConvProjection.cpp index d1ce53fe2..161bbad4f 100644 --- a/paddle/gserver/layers/ConvProjection.cpp +++ b/paddle/gserver/layers/ConvProjection.cpp @@ -47,7 +47,7 @@ void ConvProjection::getConvParams() { filterH_ = conf.filter_size_y(); filterW_ = conf.filter_size(); - configImgH_ = conf.img_size(); + configImgH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); configImgW_ = conf.img_size(); channels_ = conf.channels(); diff --git a/paddle/gserver/layers/DataLayer.cpp b/paddle/gserver/layers/DataLayer.cpp index 79b9181e6..b83d4f44b 100644 --- a/paddle/gserver/layers/DataLayer.cpp +++ b/paddle/gserver/layers/DataLayer.cpp @@ -48,8 +48,8 @@ void DataLayer::copyDataToOutput(Argument& output) { output.ids->copyFrom(*data_.ids); } } - output.setFrameHeight(data_.getFrameHeight()); - output.setFrameWidth(data_.getFrameWidth()); + output.setFrameHeight(config_.height()); + output.setFrameWidth(config_.width()); output.cpuSequenceDims = data_.cpuSequenceDims; output.sequenceStartPositions = data_.sequenceStartPositions; output.subSequenceStartPositions = data_.subSequenceStartPositions; diff --git a/paddle/gserver/layers/ExpandConvBaseLayer.cpp b/paddle/gserver/layers/ExpandConvBaseLayer.cpp index 0bab0ca76..953c9d784 100644 --- a/paddle/gserver/layers/ExpandConvBaseLayer.cpp +++ b/paddle/gserver/layers/ExpandConvBaseLayer.cpp @@ -30,17 +30,19 @@ bool ExpandConvBaseLayer::init(const LayerMap &layerMap, * meaning as in conv, we need to swap channels_ and numFilters here for * convTrans, and in other functions too. * */ - int channel; - int numFilters; + /* Initialize the projection */ for (auto &inputConfig : config_.inputs()) { const ConvConfig &conf = inputConfig.conv_conf(); - numFilters = isDeconv_ ? conf.channels() : numFilters_; + int numFilters = isDeconv_ ? conf.channels() : numFilters_; subM_.push_back(numFilters / conf.groups()); - subN_.push_back(conf.output_x() * conf.output_x()); - channel = isDeconv_ ? 
numFilters_ : conf.channels(); - subK_.push_back(channel * conf.filter_size() * conf.filter_size() / - conf.groups()); + subN_.push_back(conf.output_x() * + (conf.has_output_y() ? conf.output_y() : conf.output_x())); + int channel = isDeconv_ ? numFilters_ : conf.channels(); + subK_.push_back( + channel * conf.filter_size() * + (conf.has_filter_size_y() ? conf.filter_size_y() : conf.filter_size()) / + conf.groups()); /* Consistent caffe mode for multiple input */ caffeMode_ = conf.caffe_mode(); } @@ -107,9 +109,9 @@ void ExpandConvBaseLayer::expandOneFrame(MatrixPtr image, size_t startIdx, imgData, 1, imgSizeH_[inIdx] * imgSizeW_[inIdx] * channel, false, useGpu_); expandInput_->convExpand(*imageTmp, imgSizeH_[inIdx], imgSizeW_[inIdx], - channel, filterSize_[inIdx], - filterSize_[inIdx], stride_[inIdx], stride_[inIdx], - padding_[inIdx], padding_[inIdx], + channel, filterSizeY_[inIdx], + filterSize_[inIdx], strideY_[inIdx], stride_[inIdx], + paddingY_[inIdx], padding_[inIdx], outputH_[inIdx], outputW_[inIdx]); imageTmp->clear(); } @@ -188,10 +190,10 @@ void ExpandConvBaseLayer::bpropActs(MatrixPtr out, MatrixPtr image, imgSizeH_[inpIdx] * imgSizeW_[inpIdx] * channel, false, useGpu_); vTmp->convShrink(*oneGradTmp, imgSizeH_[inpIdx], imgSizeW_[inpIdx], - channel, filterSize_[inpIdx], - filterSize_[inpIdx], stride_[inpIdx], stride_[inpIdx], - padding_[inpIdx], padding_[inpIdx], - outputH_[inpIdx], outputW_[inpIdx], 1.0f, 1.0f); + channel, filterSizeY_[inpIdx], + filterSize_[inpIdx], strideY_[inpIdx], stride_[inpIdx], + paddingY_[inpIdx], padding_[inpIdx], outputH_[inpIdx], + outputW_[inpIdx], 1.0f, 1.0f); vTmp->clear(); oneGradTmp->clear(); diff --git a/paddle/gserver/layers/MaxOutLayer.cpp b/paddle/gserver/layers/MaxOutLayer.cpp index a3de069bf..b7f1b9804 100644 --- a/paddle/gserver/layers/MaxOutLayer.cpp +++ b/paddle/gserver/layers/MaxOutLayer.cpp @@ -25,10 +25,10 @@ size_t MaxOutLayer::getSize() { imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight(); imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth(); if (imgSizeH_ == 0) { - imgSizeH_ = maxoutConf.img_size_y(); + imgSizeH_ = maxoutConf.image_conf().img_size_y(); } if (imgSizeW_ == 0) { - imgSizeW_ = maxoutConf.img_size_x(); + imgSizeW_ = maxoutConf.image_conf().img_size(); } featLen_ = imgSizeH_ * imgSizeW_; @@ -50,7 +50,7 @@ bool MaxOutLayer::init(const LayerMap& layerMap, const MaxOutConfig& conf = config_.inputs(0).maxout_conf(); groups_ = conf.groups(); - channels_ = conf.channels(); + channels_ = conf.image_conf().channels(); CHECK_EQ(channels_ % groups_, 0UL); outputChannels_ = channels_ / groups_; diff --git a/paddle/gserver/layers/NormLayer.cpp b/paddle/gserver/layers/NormLayer.cpp index ad8b92d2f..b02a542a5 100644 --- a/paddle/gserver/layers/NormLayer.cpp +++ b/paddle/gserver/layers/NormLayer.cpp @@ -49,6 +49,9 @@ bool ResponseNormLayer::init(const LayerMap& layerMap, outputX_ = conf.output_x(); imgSize_ = conf.img_size(); denoms_ = NULL; + + outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x(); + imgSizeY_ = conf.has_img_size_y() ? 
conf.img_size_y() : conf.img_size(); return true; } diff --git a/paddle/gserver/layers/NormLayer.h b/paddle/gserver/layers/NormLayer.h index 2b05be6fc..9e4acffd1 100644 --- a/paddle/gserver/layers/NormLayer.h +++ b/paddle/gserver/layers/NormLayer.h @@ -50,7 +50,7 @@ public: */ class ResponseNormLayer : public NormLayer { protected: - size_t channels_, size_, outputX_, imgSize_; + size_t channels_, size_, outputX_, imgSize_, outputY_, imgSizeY_; float scale_, pow_; MatrixPtr denoms_; diff --git a/paddle/gserver/layers/NormProjectionLayer.cpp b/paddle/gserver/layers/NormProjectionLayer.cpp index eab6e904e..e33b985fa 100644 --- a/paddle/gserver/layers/NormProjectionLayer.cpp +++ b/paddle/gserver/layers/NormProjectionLayer.cpp @@ -24,7 +24,7 @@ size_t CMRProjectionNormLayer::getSize() { imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight(); imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth(); if (imgSizeH_ == 0) { - imgSizeH_ = imgSize_; + imgSizeH_ = imgSizeY_; } if (imgSizeW_ == 0) { imgSizeW_ = imgSize_; diff --git a/paddle/gserver/layers/SpatialPyramidPoolLayer.cpp b/paddle/gserver/layers/SpatialPyramidPoolLayer.cpp index 2fcfc8e1a..2675f9540 100644 --- a/paddle/gserver/layers/SpatialPyramidPoolLayer.cpp +++ b/paddle/gserver/layers/SpatialPyramidPoolLayer.cpp @@ -56,14 +56,14 @@ ProjectionConfig SpatialPyramidPoolLayer::getConfig(size_t imgSizeW, size_t SpatialPyramidPoolLayer::getSize() { CHECK_EQ(inputLayers_.size(), 1UL); size_t layerSize = 0; - const SppConfig& sppConf = config_.inputs(0).spp_conf(); + const ImageConfig& conf = config_.inputs(0).spp_conf().image_conf(); imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight(); imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth(); if (imgSizeH_ == 0) { - imgSizeH_ = sppConf.has_img_size_y() ? sppConf.img_size_y() : imgSizeW_; + imgSizeH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); } if (imgSizeW_ == 0) { - imgSizeW_ = sppConf.img_size(); + imgSizeW_ = conf.img_size(); } size_t outputH = 1; @@ -82,9 +82,10 @@ bool SpatialPyramidPoolLayer::init(const LayerMap& layerMap, pyramidHeight_ = sppConf.pyramid_height(); poolType_ = sppConf.pool_type(); - channels_ = sppConf.channels(); - imgSizeW_ = sppConf.img_size(); - imgSizeH_ = sppConf.has_img_size_y() ? sppConf.img_size_y() : imgSizeW_; + const ImageConfig& imageConf = sppConf.image_conf(); + channels_ = imageConf.channels(); + imgSizeW_ = imageConf.img_size(); + imgSizeH_ = imageConf.has_img_size_y() ? 
imageConf.img_size_y() : imgSizeW_; poolProjections_.reserve(pyramidHeight_); projCol_.reserve(pyramidHeight_); projOutput_.resize(pyramidHeight_); diff --git a/paddle/gserver/tests/img_pool_a.conf b/paddle/gserver/tests/img_pool_a.conf index 5938e7611..9bd046b53 100644 --- a/paddle/gserver/tests/img_pool_a.conf +++ b/paddle/gserver/tests/img_pool_a.conf @@ -28,7 +28,6 @@ maxpool = img_pool_layer(input=conv, stride_y=2, padding=1, padding_y=2, - img_width=16, pool_type=MaxPooling(), ) avgpool = img_pool_layer(input=conv, @@ -39,7 +38,6 @@ avgpool = img_pool_layer(input=conv, stride_y=2, padding=1, padding_y=2, - img_width=16, pool_type=AvgPooling(), ) diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index a79dfe39c..e83985109 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -194,9 +194,10 @@ TEST(Layer, BilinearInterpLayer) { LayerInputConfig* input = config.layerConfig.add_inputs(); BilinearInterpConfig* bilinear = input->mutable_bilinear_interp_conf(); - bilinear->set_img_size_x(32); - bilinear->set_img_size_y(32); - bilinear->set_num_channels(4); + ImageConfig* image = bilinear->mutable_image_conf(); + image->set_img_size(32); + image->set_img_size_y(32); + image->set_channels(4); for (auto useGpu : {false, true}) { for (auto outSize : {32, 64}) { @@ -314,7 +315,7 @@ void testConvLayer(const string& type, bool trans, bool useGpu) { config.layerConfig.set_partial_sum(1); config.layerConfig.set_shared_biases(true); - config.inputDefs.push_back({INPUT_DATA, "layer_0", 768, 288}); + config.inputDefs.push_back({INPUT_DATA, "layer_0", 384, 288}); LayerInputConfig* input = config.layerConfig.add_inputs(); ConvConfig* conv = input->mutable_conv_conf(); conv->set_filter_size(2); @@ -327,10 +328,14 @@ void testConvLayer(const string& type, bool trans, bool useGpu) { conv->set_groups(1); conv->set_filter_channels(conv->channels() / conv->groups()); conv->set_img_size(16); + conv->set_img_size_y(8); conv->set_output_x(outputSize(conv->img_size(), conv->filter_size(), conv->padding(), conv->stride(), /* caffeMode */ true)); - config.layerConfig.set_size(conv->output_x() * conv->output_x() * + conv->set_output_y(outputSize(conv->img_size_y(), conv->filter_size_y(), + conv->padding_y(), conv->stride_y(), + /* caffeMode */ true)); + config.layerConfig.set_size(conv->output_x() * conv->output_y() * config.layerConfig.num_filters()); testLayerGrad(config, "conv", 100, trans, useGpu); @@ -427,10 +432,11 @@ TEST(Layer, maxoutLayer) { config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0}); LayerInputConfig* input = config.layerConfig.add_inputs(); MaxOutConfig* maxout = input->mutable_maxout_conf(); + ImageConfig* image = maxout->mutable_image_conf(); - maxout->set_img_size_x(32); - maxout->set_img_size_y(32); - maxout->set_channels(4); + image->set_img_size(32); + image->set_img_size_y(32); + image->set_channels(4); maxout->set_groups(2); for (auto useGpu : {false, true}) { @@ -902,7 +908,7 @@ void testNormLayer(const string& normType, bool trans, bool useGpu) { config.layerConfig.set_type("norm"); config.layerConfig.set_active_type("relu"); - config.inputDefs.push_back({INPUT_DATA, "layer_0", 3136, 0}); + config.inputDefs.push_back({INPUT_DATA, "layer_0", 1568, 0}); LayerInputConfig* input = config.layerConfig.add_inputs(); NormConfig* norm = input->mutable_norm_conf(); norm->set_norm_type(normType); @@ -912,7 +918,9 @@ void testNormLayer(const string& normType, bool trans, bool useGpu) { 
norm->set_pow(0.75); norm->set_blocked(0); norm->set_img_size(14); + norm->set_img_size_y(7); norm->set_output_x(norm->img_size()); + norm->set_output_y(norm->img_size_y()); if (norm->norm_type() == "cmrnorm" || norm->norm_type() == "cmrnorm-projection") { norm->set_scale(norm->scale() / norm->size()); @@ -920,7 +928,7 @@ void testNormLayer(const string& normType, bool trans, bool useGpu) { norm->set_scale(norm->scale() / (norm->size() * norm->size())); } - config.layerConfig.set_size(norm->output_x() * norm->output_x() * + config.layerConfig.set_size(norm->output_x() * norm->output_y() * norm->channels()); config.biasSize = 0; @@ -1018,11 +1026,12 @@ void testSppLayer(const string& poolType, const int pyramidHeight, bool trans, SppConfig* sppConfig = input->mutable_spp_conf(); sppConfig->set_pool_type(poolType); sppConfig->set_pyramid_height(pyramidHeight); - sppConfig->set_channels(16); - sppConfig->set_img_size(10); - sppConfig->set_img_size_y(20); + ImageConfig* imageConfig = sppConfig->mutable_image_conf(); + imageConfig->set_channels(16); + imageConfig->set_img_size(10); + imageConfig->set_img_size_y(20); int outputSize = (std::pow(4, sppConfig->pyramid_height()) - 1) / (4 - 1); - config.layerConfig.set_size(outputSize * sppConfig->channels()); + config.layerConfig.set_size(outputSize * imageConfig->channels()); testLayerGrad(config, "spp", 100, trans, useGpu); } @@ -1328,12 +1337,13 @@ void testBatchNormLayer(const string& type, bool trans, bool useGpu) { TestConfig config; const int CHANNELS = 10; const int IMG_SIZE = 16; + const int IMG_SIZE_Y = 8; + size_t size = CHANNELS * IMG_SIZE * IMG_SIZE_Y; config.layerConfig.set_type(type); - config.layerConfig.set_size(CHANNELS * IMG_SIZE * IMG_SIZE); + config.layerConfig.set_size(size); config.layerConfig.set_active_type("sigmoid"); config.biasSize = CHANNELS; - config.inputDefs.push_back({INPUT_DATA, "layer_0", - /* dim= */ IMG_SIZE * IMG_SIZE * CHANNELS, + config.inputDefs.push_back({INPUT_DATA, "layer_0", /* dim= */ size, /* paraSize= */ CHANNELS}); config.inputDefs.push_back({INPUT_DATA, "layer_1_running_mean", 1, CHANNELS}); @@ -1348,6 +1358,7 @@ void testBatchNormLayer(const string& type, bool trans, bool useGpu) { ImageConfig* img_conf = input->mutable_image_conf(); img_conf->set_channels(CHANNELS); img_conf->set_img_size(IMG_SIZE); + img_conf->set_img_size_y(IMG_SIZE_Y); testLayerGrad(config, "batch_norm", 64, /* trans= */ trans, useGpu, /* useWeight */ true); @@ -1370,6 +1381,7 @@ TEST(Operator, conv) { const int FILTER_SIZE_Y = 3; const int CHANNELS = 3; const int IMAGE_SIZE = 16; + const int IMAGE_SIZE_Y = 8; OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs(); operatorConf.set_type("conv"); ConvConfig* conv = operatorConf.mutable_conv_conf(); @@ -1384,17 +1396,18 @@ TEST(Operator, conv) { conv->set_groups(1); conv->set_filter_channels(conv->channels() / conv->groups()); conv->set_img_size(IMAGE_SIZE); - int output_x = - outputSize(conv->img_size(), conv->filter_size(), conv->padding(), - conv->stride(), /* caffeMode */ true); - conv->set_output_x(output_x); - config.layerConfig.set_size(output_x * output_x * - config.layerConfig.num_filters()); - config.layerConfig.set_size(conv->output_x() * conv->output_x() * + conv->set_img_size_y(IMAGE_SIZE_Y); + conv->set_output_x(outputSize(conv->img_size(), conv->filter_size(), + conv->padding(), conv->stride(), + /* caffeMode */ true)); + conv->set_output_y(outputSize(conv->img_size_y(), conv->filter_size_y(), + conv->padding_y(), conv->stride_y(), + /* caffeMode 
*/ true)); + config.layerConfig.set_size(conv->output_x() * conv->output_y() * NUM_FILTERS); config.inputDefs.push_back( - {INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE * CHANNELS, 0}); + {INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE_Y * CHANNELS, 0}); config.inputDefs.push_back( {INPUT_DATA, "layer_1", FILTER_SIZE * FILTER_SIZE_Y * CHANNELS * NUM_FILTERS, 0}); diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp index 42c74661d..2d5cd29ae 100644 --- a/paddle/parameter/Argument.cpp +++ b/paddle/parameter/Argument.cpp @@ -203,6 +203,8 @@ void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu, } resizeAndCopy(udp, src.udp, useGpu, stream); resizeAndCopy(strs, src.strs, useGpu, stream); + frameWidth = src.frameWidth; + frameHeight = src.frameHeight; } int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq, diff --git a/paddle/trainer/tests/test_config.conf b/paddle/trainer/tests/test_config.conf index 664e18cb9..2a4548896 100644 --- a/paddle/trainer/tests/test_config.conf +++ b/paddle/trainer/tests/test_config.conf @@ -59,7 +59,6 @@ pool = img_pool_layer(input=fc2, padding_y=2, stride=2, stride_y=3, - img_width=3, pool_type=CudnnAvgPooling()) concat = concat_layer(input=[fc3, fc4]) diff --git a/proto/ModelConfig.proto.m4 b/proto/ModelConfig.proto.m4 index aea77248c..3c35075a9 100644 --- a/proto/ModelConfig.proto.m4 +++ b/proto/ModelConfig.proto.m4 @@ -76,6 +76,12 @@ message ConvConfig { required uint32 filter_size_y = 10; required uint32 padding_y = 11; required uint32 stride_y = 12; + + // if not set, use output_x + optional uint32 output_y = 13 [default = 0]; + + // if not set, use img_size + optional uint32 img_size_y = 14 [default = 0]; } message PoolConfig { @@ -121,11 +127,9 @@ message PoolConfig { } message SppConfig { - required string pool_type = 1; - required uint32 pyramid_height = 2; - required uint32 channels = 3; - required uint32 img_size = 4; - optional uint32 img_size_y = 5; + required ImageConfig image_conf = 1; + required string pool_type = 2; + required uint32 pyramid_height = 3; } message NormConfig { @@ -155,6 +159,12 @@ message NormConfig { // fixed window: shared a fixed window for each value // sliding window: have a different window for each value optional bool blocked = 8; + + // if not set, use output_x + optional uint32 output_y = 9 [default = 0]; + + // if not set, use img_size + optional uint32 img_size_y = 10 [default = 0]; } message BlockExpandConfig { @@ -179,12 +189,8 @@ message BlockExpandConfig { } message MaxOutConfig { - required uint32 channels = 1; + required ImageConfig image_conf = 1; required uint32 groups = 2; - - // The size of input feature map. - required uint32 img_size_x = 3; - required uint32 img_size_y = 4; } message ProjectionConfig { @@ -225,12 +231,10 @@ message OperatorConfig { message BilinearInterpConfig { // The size of input feature map. - optional uint32 img_size_x = 1; - optional uint32 img_size_y = 2; + required ImageConfig image_conf = 1; // The size of output feature map. - required uint32 out_size_x = 3; - required uint32 out_size_y = 4; - required uint32 num_channels = 5; + required uint32 out_size_x = 2; + required uint32 out_size_y = 3; } message ImageConfig { @@ -240,6 +244,7 @@ message ImageConfig { // The size of input feature map. 
required uint32 img_size = 8; + required uint32 img_size_y = 9; } message LayerInputConfig { @@ -412,7 +417,10 @@ sinclude(`ModelConfigLayer.proto.m4') // string type is used for flexibility: different types can be converted // to string and reinterpreted in the user's own layer implementation. optional string user_arg = 49; - + + // to indicate rectangular image data + optional uint64 height = 50; + optional uint64 width = 51; } message EvaluatorConfig { diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index dbe2f3b29..a7ad40e48 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -138,7 +138,14 @@ def init_config_environment( g_root_submodel=None, g_submodel_map={}, g_submodel_stack=[], - g_add_submodel_suffix=False, ): + g_add_submodel_suffix=False, + + # Whether the current layer needs to pass the image height and width. # It defaults to true, but becomes false inside a recurrent_layer_group, # where the image is converted to a sequence: the image height becomes # the sequence length, and the image width becomes the feature length # of each timestep. + g_pass_height_width=True, ): for k, v in locals().iteritems(): globals()[k] = copy.deepcopy(v) @@ -592,6 +599,7 @@ class DotMulProjection(Projection): def calc_parameter_dims(self, input_size, output_size): return [1, output_size] + # ScalingProjection @config_class class ScalingProjection(Projection): @@ -685,9 +693,9 @@ class ConvProjection(Projection): parse_conv(conv_conf, input_layer_name, self.proj_conf.conv_conf, num_filters) - # TODO: support rectangle input - self.proj_conf.output_size = (self.proj_conf.conv_conf.output_x - **2) * num_filters + self.proj_conf.output_size = self.proj_conf.conv_conf.output_x * \ self.proj_conf.conv_conf.output_y * \ num_filters def calc_output_size(self, input_layer_config): return self.proj_conf.output_size @@ -762,8 +770,9 @@ class ConvOperator(Operator): parse_conv(conv_conf, MakeLayerNameInSubmodel(input_layer_names[0]), self.operator_conf.conv_conf, num_filters) - self.operator_conf.output_size = (self.operator_conf.conv_conf.output_x - **2) * num_filters + self.operator_conf.output_size = self.operator_conf.conv_conf.output_x * \ self.operator_conf.conv_conf.output_y * \ num_filters config_assert(len(input_layer_names) == 2, "Conv is binary operator") @@ -798,14 +807,12 @@ class Conv(Cfg): config_assert(output_x <= 0) -# please refer to the comments in proto/ModelConfig.proto @config_class class BilinearInterp(Cfg): def __init__(self, out_size_x=None, out_size_y=None, channels=None): self.add_keys(locals()) -# please refer to the comments in proto/ModelConfig.proto @config_class class Pool(Cfg): def __init__(self, pool_type, channels, size_x, size_y=None, - img_width=None, start=None, stride=None, stride_y=None, padding=None, padding_y=None): self.add_keys(locals()) -# please refer to the comments in proto/ModelConfig.proto @config_class class SpatialPyramidPool(Cfg): def __init__(self, pool_type, pyramid_height, channels): self.add_keys(locals()) -# please refer to the comments in proto/ModelConfig.proto @config_class class Norm(Cfg): def __init__(self, norm_type, channels, size, scale, pow, output_x, img_size, blocked): self.add_keys(locals()) -# please refer to the comments in proto/ModelConfig.proto @config_class class
Image(Cfg): def __init__(self, channels, img_size=None): @@ -1051,18 +1054,8 @@ def TestData(data_config, async_load_data=None): g_config.test_data_config.async_load_data = async_load_data -def parse_bilinear(bilinear, input_layer_name, bilinear_conf): - bilinear_conf.out_size_x = bilinear.out_size_x - bilinear_conf.out_size_y = bilinear.out_size_y - bilinear_conf.num_channels = bilinear.num_channels - - ''' caffe_mode: compute the output size using floor instead of ceil, which is consistent of caffe and CuDNN's convention. ''' - - +#caffe_mode: compute the output size using floor instead of ceil, +# which is consistent with caffe and CuDNN's convention. def cnn_output_size(img_size, filter_size, padding, stride, caffe_mode): output = (2 * padding + img_size - filter_size) / float(stride) if caffe_mode: @@ -1071,20 +1064,34 @@ return 1 + int(math.ceil(output)) -''' -calcualte image_size based on output_size for convolution. -It is the reverse function of cnn_output_size -''' - - +#calculate image_size based on output_size for convolution. +#It is the inverse function of cnn_output_size def cnn_image_size(output_size, filter_size, padding, stride, caffe_mode): - if caffe_mode: - img_size = (output_size - 1) * stride + filter_size - 2 * padding - else: - img_size = (output_size - 2) * stride + filter_size - 2 * padding + 1 + img_size = (output_size - 1) * stride + filter_size - 2 * padding + if not caffe_mode: + img_size = img_size + 1 return img_size +def set_img_size(input_layer_name, channels): + input = g_layer_map[input_layer_name] + img_pixels = input.size / channels + img_size = input.width if input.width > 0 else int(img_pixels**0.5) + img_size_y = input.height if input.height > 0 else int(img_pixels / + img_size) + config_assert( + img_size * img_size_y == img_pixels, + "Input layer %s: Incorrect input image size %d * %d for input image pixels %d" + % (input_layer_name, img_size, img_size_y, img_pixels)) + return img_size, img_size_y + + +def parse_bilinear(bilinear, input_layer_name, bilinear_conf): + parse_image(bilinear, input_layer_name, bilinear_conf.image_conf) + bilinear_conf.out_size_x = bilinear.out_size_x + bilinear_conf.out_size_y = bilinear.out_size_y + + def parse_pool(pool, input_layer_name, pool_conf): pool_conf.pool_type = pool.pool_type config_assert(pool.pool_type in [ @@ -1100,14 +1107,8 @@ def parse_pool(pool, input_layer_name, pool_conf): pool_conf.size_y = default(pool.size_y, pool_conf.size_x) pool_conf.stride_y = default(pool.stride_y, pool_conf.stride) - img_pixels = g_layer_map[input_layer_name].size / pool.channels - # the img_width may be removed, - # and it can be calculated automatically later. 
- pool_conf.img_size = default(pool.img_width, int(img_pixels**0.5)) - pool_conf.img_size_y = img_pixels / pool_conf.img_size - config_assert(pool_conf.img_size * pool_conf.img_size_y == img_pixels, - "Incorrect input image size %d for input image pixels %d" % - (pool_conf.img_size, img_pixels)) + pool_conf.img_size, pool_conf.img_size_y = \ set_img_size(input_layer_name, pool.channels) config_assert(not pool.start, "start is deprecated in pooling.") @@ -1123,29 +1124,18 @@ def parse_spp(spp, input_layer_name, spp_conf): + parse_image(spp, input_layer_name, spp_conf.image_conf) spp_conf.pool_type = spp.pool_type config_assert(spp.pool_type in ['max-projection', 'avg-projection'], "pool-type %s is not in " "['max-projection', 'avg-projection']" % spp.pool_type) spp_conf.pyramid_height = spp.pyramid_height - spp_conf.channels = spp.channels - - img_pixels = g_layer_map[input_layer_name].size / spp_conf.channels - - spp_conf.img_size = default(spp.img_width, int(img_pixels**0.5)) - spp_conf.img_size_y = img_pixels / spp_conf.img_size - config_assert(spp_conf.img_size * spp_conf.img_size_y == img_pixels, - "Incorrect input image size %d for input image pixels %d" % - (spp_conf.img_size, img_pixels)) def parse_image(image, input_layer_name, image_conf): image_conf.channels = image.channels - image_pixels = g_layer_map[input_layer_name].size / image_conf.channels - image_conf.img_size = int(image_pixels**0.5) - config_assert((image_conf.img_size**2) == image_pixels, - "Incorrect input image size %d for input image pixels %d" % - (image_conf.img_size, image_pixels)) + image_conf.img_size, image_conf.img_size_y = \ set_img_size(input_layer_name, image_conf.channels) def parse_norm(norm, input_layer_name, norm_conf): @@ -1159,24 +1149,18 @@ def parse_norm(norm, input_layer_name, norm_conf): norm_conf.pow = norm.pow norm_conf.blocked = norm.blocked - img_pixels = g_layer_map[input_layer_name].size / norm.channels - norm_conf.img_size = int(img_pixels**0.5) - config_assert((norm_conf.img_size**2) == img_pixels, - "Incorrect input image size %d for input image pixels %d" % - (norm_conf.img_size, img_pixels)) + norm_conf.img_size, norm_conf.img_size_y = \ set_img_size(input_layer_name, norm.channels) norm_conf.output_x = norm_conf.img_size + norm_conf.output_y = norm_conf.img_size_y if norm.norm_type in ['cmrnorm-projection']: norm_conf.scale /= norm.size else: norm_conf.scale /= norm.size**2 -''' -caffe_mode: compute the output size using floor instead of ceil, - which is consistent of caffe and CuDNN's convention. -''' - - +#caffe_mode: compute the output size using floor instead of ceil, +# which is consistent with caffe and CuDNN's convention. 
def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False): conv_conf.filter_size = conv.filter_size conv_conf.filter_size_y = conv.filter_size_y @@ -1190,33 +1174,24 @@ def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False): if not trans: conv_conf.filter_channels = conv.channels / conv.groups - - img_pixels = g_layer_map[input_layer_name].size / conv.channels - print('channels=%d size=%d' % (conv.channels, - g_layer_map[input_layer_name].size)) - conv_conf.img_size = int(img_pixels**0.5) - config_assert((conv_conf.img_size**2) == img_pixels, ( - "Input layer %s: Incorrect input image size %d for input " + - "image pixels %d") % - (input_layer_name, conv_conf.img_size, img_pixels)) - + conv_conf.img_size, conv_conf.img_size_y = \ set_img_size(input_layer_name, conv.channels) conv_conf.output_x = cnn_output_size( conv_conf.img_size, conv_conf.filter_size, conv_conf.padding, conv_conf.stride, conv_conf.caffe_mode) + conv_conf.output_y = cnn_output_size( + conv_conf.img_size_y, conv_conf.filter_size_y, conv_conf.padding_y, + conv_conf.stride_y, conv_conf.caffe_mode) else: conv_conf.filter_channels = num_filters / conv.groups - - outputSize = g_layer_map[input_layer_name].size / conv.channels - print('channels=%d size=%d' % (conv.channels, - g_layer_map[input_layer_name].size)) - conv_conf.output_x = int(outputSize**0.5) - config_assert((conv_conf.output_x**2) == outputSize, ( - "Input layer %s: Incorrect input image size %d for input " + - "image pixels %d") % - (input_layer_name, conv_conf.output_x, outputSize)) + conv_conf.output_x, conv_conf.output_y = \ set_img_size(input_layer_name, conv.channels) conv_conf.img_size = cnn_image_size( conv_conf.output_x, conv_conf.filter_size, conv_conf.padding, conv_conf.stride, conv_conf.caffe_mode) + conv_conf.img_size_y = cnn_image_size( + conv_conf.output_y, conv_conf.filter_size_y, conv_conf.padding_y, + conv_conf.stride_y, conv_conf.caffe_mode) def parse_block_expand(block_expand, input_layer_name, block_expand_conf): @@ -1245,10 +1220,8 @@ def parse_block_expand(block_expand, input_layer_name, block_expand_conf): def parse_maxout(maxout, input_layer_name, maxout_conf): - maxout_conf.channels = maxout.channels + parse_image(maxout, input_layer_name, maxout_conf.image_conf) maxout_conf.groups = maxout.groups - maxout_conf.img_size_x = maxout.img_size_x - maxout_conf.img_size_y = maxout.img_size_y # Define an evaluator @@ -1375,6 +1348,12 @@ class LayerBase(object): g_current_submodel.layer_names.append(self.config.name) + if self.config.type != 'data' and g_pass_height_width: + height = self.get_input_layer(0).height + width = self.get_input_layer(0).width + if height and width: + self.set_layer_height_width(height, width) def get_input_layer(self, input_index): return g_layer_map[self.config.inputs[input_index].input_layer_name] @@ -1492,6 +1471,23 @@ 'Different inputs result in' + 'different layer size at layer %s' % self.config.name) + def set_layer_height_width(self, height, width): + self.config.height = height + self.config.width = width + + def set_cnn_layer(self, + input_layer_name, + height, + width, + channels, + is_print=True): + size = height * width * channels + self.set_layer_size(size) + self.set_layer_height_width(height, width) + if is_print: + print("output for %s: c = %d, h = %d, w = %d, size = %d" % + (input_layer_name, channels, height, width, size)) @config_layer('multi_class_cross_entropy_with_selfnorm') class 
MultiClassCrossEntropySelfNormCostLayer(LayerBase): @@ -1581,9 +1577,11 @@ class PrintLayer(LayerBase): @config_layer('data') class DataLayer(LayerBase): - def __init__(self, name, size, device=None): + def __init__(self, name, size, height=None, width=None, device=None): super(DataLayer, self).__init__( name, 'data', size, inputs=[], device=device) + if height and width: + self.set_layer_height_width(height, width) ''' @@ -1682,14 +1680,13 @@ class ConvLayerBase(LayerBase): for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) - parse_conv(self.inputs[input_index].conv, input_layer.name, - self.config.inputs[input_index].conv_conf, num_filters) conv_conf = self.config.inputs[input_index].conv_conf + parse_conv(self.inputs[input_index].conv, input_layer.name, + conv_conf, num_filters) psize = self.calc_parameter_size(conv_conf) - print("output size for %s is %d " % (name, conv_conf.output_x)) self.create_input_parameter(input_index, psize) - self.set_layer_size( - (conv_conf.output_x**2) * self.config.num_filters) + self.set_cnn_layer(name, conv_conf.output_y, conv_conf.output_x, + self.config.num_filters) psize = self.config.size if shared_biases: @@ -1776,10 +1773,11 @@ class NormLayer(LayerBase): name, 'norm', 0, inputs=inputs, device=device) for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) - parse_norm(self.inputs[input_index].norm, input_layer.name, - self.config.inputs[input_index].norm_conf) norm_conf = self.config.inputs[input_index].norm_conf - self.set_layer_size((norm_conf.output_x**2) * norm_conf.channels) + parse_norm(self.inputs[input_index].norm, input_layer.name, + norm_conf) + self.set_cnn_layer(name, norm_conf.output_y, norm_conf.output_x, + norm_conf.channels, False) @config_layer('pool') @@ -1789,13 +1787,11 @@ class PoolLayer(LayerBase): name, 'pool', 0, inputs=inputs, device=device) for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) - parse_pool(self.inputs[input_index].pool, input_layer.name, - self.config.inputs[input_index].pool_conf) pool_conf = self.config.inputs[input_index].pool_conf - print("output size for %s is %d*%d " % (name, pool_conf.output_y, - pool_conf.output_x)) - self.set_layer_size( - (pool_conf.output_x * pool_conf.output_y) * pool_conf.channels) + parse_pool(self.inputs[input_index].pool, input_layer.name, + pool_conf) + self.set_cnn_layer(name, pool_conf.output_y, pool_conf.output_x, + pool_conf.channels) @config_layer('spp') @@ -1805,12 +1801,10 @@ class SpatialPyramidPoolLayer(LayerBase): name, 'spp', 0, inputs=inputs, device=device) for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) - parse_spp(self.inputs[input_index].spp, input_layer.name, - self.config.inputs[input_index].spp_conf) spp_conf = self.config.inputs[input_index].spp_conf - output_size = (pow(4, spp_conf.pyramid_height) - 1) / (4 - 1) - print("output size for %s is %d " % (name, output_size)) - self.set_layer_size(output_size * spp_conf.channels) + parse_spp(self.inputs[input_index].spp, input_layer.name, spp_conf) + output_x = (pow(4, spp_conf.pyramid_height) - 1) / (4 - 1) + self.set_cnn_layer(name, 1, output_x, spp_conf.image_conf.channels) @config_layer('batch_norm') @@ -1872,10 +1866,10 @@ class BatchNormLayer(LayerBase): self.config.moving_average_fraction = moving_average_fraction input_layer = self.get_input_layer(0) - parse_image(self.inputs[0].image, input_layer.name, - self.config.inputs[0].image_conf) 
image_conf = self.config.inputs[0].image_conf - self.set_layer_size((image_conf.img_size**2) * image_conf.channels) + parse_image(self.inputs[0].image, input_layer.name, image_conf) + self.set_cnn_layer(name, image_conf.img_size_y, image_conf.img_size, + image_conf.channels) psize = self.calc_parameter_size(image_conf) dims = [1, psize] @@ -1933,11 +1927,12 @@ class MaxOutLayer(LayerBase): super(MaxOutLayer, self).__init__( name, 'maxout', 0, inputs=inputs, **xargs) input_layer = self.get_input_layer(0) - parse_maxout(self.inputs[0].maxout, input_layer.name, - self.config.inputs[0].maxout_conf) maxout_conf = self.config.inputs[0].maxout_conf + parse_maxout(self.inputs[0].maxout, input_layer.name, maxout_conf) self.set_layer_size(g_layer_map[input_layer.name].size / maxout_conf.groups) + self.set_layer_height_width(g_layer_map[input_layer.name].height, + g_layer_map[input_layer.name].width) # key: cost type @@ -2517,11 +2512,10 @@ class BilinearInterpLayer(LayerBase): super(BilinearInterpLayer, self).__init__( name, 'bilinear_interp', 0, inputs=inputs, **xargs) input_layer = self.get_input_layer(0) - parse_bilinear(self.inputs[0].bilinear_interp, input_layer.name, - self.config.inputs[0].bilinear_interp_conf) - conf = self.inputs[0].bilinear_interp - self.set_layer_size(conf.out_size_x * conf.out_size_y * - conf.num_channels) + conf = self.config.inputs[0].bilinear_interp_conf + parse_bilinear(self.inputs[0].bilinear_interp, input_layer.name, conf) + self.set_cnn_layer(name, conf.out_size_y, conf.out_size_x, + conf.image_conf.channels) @config_layer('sum_to_one_norm') @@ -2994,6 +2988,8 @@ class CTCLayer(LayerBase): @config_layer('recurrent_layer_group') class RecurrentLayerGroup(LayerBase): def __init__(self, name, device=None): + global g_pass_height_width + g_pass_height_width = False super(RecurrentLayerGroup, self).__init__( name, 'recurrent_layer_group', 0, inputs=[], device=device) diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index d984e8432..fbb28e6ca 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -763,7 +763,7 @@ def mixed_layer(size=0, @layer_support() -def data_layer(name, size, layer_attr=None): +def data_layer(name, size, height=None, width=None, layer_attr=None): """ Define DataLayer For NeuralNetwork. @@ -778,6 +778,10 @@ :type name: basestring :param size: Size of this data layer. :type size: int + :param height: Height of this data layer, used for image :type height: int|None + :param width: Width of this data layer, used for image :type width: int|None :param layer_attr: Extra Layer Attribute. :type layer_attr: ExtraLayerAttribute. :return: LayerOutput object. @@ -787,6 +791,8 @@ type=LayerType.DATA, name=name, size=size, + height=height, + width=width, **ExtraLayerAttribute.to_kwargs(layer_attr)) return LayerOutput(name, LayerType.DATA, size=size) @@ -1480,7 +1486,7 @@ def bilinear_interp_layer(input, bilinear_interp=BilinearInterp( out_size_x=out_size_x, out_size_y=out_size_y, - num_channels=num_channels)), + channels=num_channels)), type=LayerType.BILINEAR_INTERP_LAYER, **ExtraLayerAttribute.to_kwargs(layer_attr)) return LayerOutput( @@ -1908,8 +1914,7 @@ def img_pool_layer(input, layer_attr=None, pool_size_y=None, stride_y=None, - padding_y=None, - img_width=None): + padding_y=None): """ Image pooling Layer. 
@@ -1940,9 +1945,6 @@ def img_pool_layer(input, :type stride_y: int|None :param layer_attr: Extra Layer attribute. :type layer_attr: ExtraLayerAttribute - :param img_width: the width of input feature map. If it is None, the input feature - map should be square. - :type img_width: int|None :return: LayerOutput object. :rtype: LayerOutput """ @@ -1978,8 +1980,7 @@ def img_pool_layer(input, padding=padding, size_y=pool_size_y, stride_y=stride_y, - padding_y=padding_y, - img_width=img_width)) + padding_y=padding_y)) ], **ExtraLayerAttribute.to_kwargs(layer_attr)) return LayerOutput( @@ -1997,7 +1998,6 @@ def spp_layer(input, num_channels=None, pool_type=None, pyramid_height=None, - img_width=None, layer_attr=None): """ Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition. @@ -2014,9 +2014,6 @@ def spp_layer(input, :type scale: BasePoolingType :param pyramid_height: pyramid height. :type pyramid_height: int - :param img_width: the width of input feature map. If it is None, the input feature - map should be square. - :type img_width: int|None :param layer_attr: Extra Layer Attribute. :type layer_attr: ExtraLayerAttribute :return: LayerOutput object. @@ -2043,8 +2040,7 @@ def spp_layer(input, spp=SpatialPyramidPool( pool_type=type_name, channels=num_channels, - pyramid_height=pyramid_height, - img_width=img_width)), + pyramid_height=pyramid_height)), **ExtraLayerAttribute.to_kwargs(layer_attr)) return LayerOutput( name, diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr index 1f262af21..1a577b8d9 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr @@ -26,11 +26,15 @@ layers { filter_size_y: 32 padding_y: 1 stride_y: 1 + output_y: 227 + img_size_y: 256 } } bias_parameter_name: "___conv_0__.wbias" num_filters: 64 shared_biases: true + height: 227 + width: 227 } layers { name: "__batch_norm_0__" @@ -43,6 +47,7 @@ layers { image_conf { channels: 64 img_size: 227 + img_size_y: 227 } } inputs { @@ -55,6 +60,8 @@ layers { } bias_parameter_name: "___batch_norm_0__.wbias" moving_average_fraction: 0.9 + height: 227 + width: 227 } layers { name: "__crmnorm_0__" @@ -72,8 +79,12 @@ layers { output_x: 227 img_size: 227 blocked: false + output_y: 227 + img_size_y: 227 } } + height: 227 + width: 227 } layers { name: "__pool_0__" @@ -97,6 +108,8 @@ layers { padding_y: 0 } } + height: 196 + width: 196 } parameters { name: "___conv_0__.w0" diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr index 383463540..ac1e2adff 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr @@ -26,6 +26,8 @@ layers { filter_size_y: 32 padding_y: 1 stride_y: 1 + output_y: 227 + img_size_y: 198 } } bias_parameter_name: "___conv_0__.wbias" @@ -43,6 +45,7 @@ layers { image_conf { channels: 64 img_size: 256 + img_size_y: 256 } } inputs { @@ -55,6 +58,8 @@ layers { } bias_parameter_name: "___batch_norm_0__.wbias" moving_average_fraction: 0.9 + height: 256 + width: 256 } layers { name: "__crmnorm_0__" @@ -72,8 +77,12 @@ layers { output_x: 256 img_size: 256 blocked: false + output_y: 256 + img_size_y: 256 
} } + height: 256 + width: 256 } layers { name: "__pool_0__" @@ -97,6 +106,8 @@ layers { padding_y: 0 } } + height: 225 + width: 225 } parameters { name: "___conv_0__.w0" diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr index 2b3951c24..2943ab130 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr @@ -177,6 +177,8 @@ layers { filter_size_y: 3 padding_y: 0 stride_y: 1 + output_y: 30 + img_size_y: 32 } num_filters: 64 } diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr index 13d0d477e..9fae596f2 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr @@ -26,11 +26,15 @@ layers { filter_size_y: 3 padding_y: 1 stride_y: 1 + output_y: 48 + img_size_y: 48 } } bias_parameter_name: "___conv_0__.wbias" num_filters: 16 shared_biases: true + height: 48 + width: 48 } layers { name: "__bilinear_interp_layer_0__" @@ -40,11 +44,17 @@ layers { inputs { input_layer_name: "__conv_0__" bilinear_interp_conf { + image_conf { + channels: 16 + img_size: 48 + img_size_y: 48 + } out_size_x: 64 out_size_y: 64 - num_channels: 16 } } + height: 64 + width: 64 } layers { name: "__pool_0__" @@ -55,19 +65,21 @@ layers { input_layer_name: "__bilinear_interp_layer_0__" pool_conf { pool_type: "max-projection" - channels: 4 + channels: 16 size_x: 2 stride: 2 - output_x: 64 - img_size: 128 + output_x: 32 + img_size: 64 padding: 0 size_y: 2 stride_y: 2 - output_y: 64 - img_size_y: 128 + output_y: 32 + img_size_y: 64 padding_y: 0 } } + height: 32 + width: 32 } layers { name: "__fc_layer_0__" @@ -78,6 +90,8 @@ layers { input_layer_name: "__pool_0__" input_parameter_name: "___fc_layer_0__.w0" } + height: 32 + width: 32 } parameters { name: "___conv_0__.w0" diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr index 1be2a7cee..c763a95f9 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr @@ -4,6 +4,8 @@ layers { type: "data" size: 2304 active_type: "" + height: 48 + width: 48 } layers { name: "__conv_0__" @@ -26,11 +28,15 @@ layers { filter_size_y: 3 padding_y: 1 stride_y: 1 + output_y: 48 + img_size_y: 48 } } bias_parameter_name: "___conv_0__.wbias" num_filters: 16 shared_biases: true + height: 48 + width: 48 } layers { name: "__maxout_layer_0__" @@ -40,12 +46,16 @@ layers { inputs { input_layer_name: "__conv_0__" maxout_conf { - channels: 16 + image_conf { + channels: 16 + img_size: 48 + img_size_y: 48 + } groups: 2 - img_size_x: 0 - img_size_y: 0 } } + height: 48 + width: 48 } layers { name: "__pool_0__" @@ -69,48 +79,58 @@ layers { padding_y: 0 } } + height: 24 + width: 24 } layers { name: "__conv_1__" type: "exconv" - size: 18432 + size: 73728 active_type: "" inputs { input_layer_name: "__pool_0__" input_parameter_name: "___conv_1__.w0" conv_conf { filter_size: 3 - channels: 32 + channels: 8 stride: 1 padding: 1 groups: 1 - filter_channels: 32 
- output_x: 12 - img_size: 12 + filter_channels: 8 + output_x: 24 + img_size: 24 caffe_mode: true filter_size_y: 3 padding_y: 1 stride_y: 1 + output_y: 24 + img_size_y: 24 } } bias_parameter_name: "___conv_1__.wbias" num_filters: 128 shared_biases: true + height: 24 + width: 24 } layers { name: "__maxout_layer_1__" type: "maxout" - size: 9216 + size: 18432 active_type: "" inputs { - input_layer_name: "__conv_0__" + input_layer_name: "__conv_1__" maxout_conf { - channels: 128 + image_conf { + channels: 128 + img_size: 24 + img_size_y: 24 + } groups: 4 - img_size_x: 0 - img_size_y: 0 } } + height: 24 + width: 24 } layers { name: "__block_expand_layer_0__" @@ -118,7 +138,7 @@ layers { size: 192 active_type: "" inputs { - input_layer_name: "__maxout_layer_0__" + input_layer_name: "__maxout_layer_1__" block_expand_conf { channels: 32 stride_x: 1 @@ -133,6 +153,8 @@ layers { img_size_y: 0 } } + height: 24 + width: 24 } layers { name: "__fc_layer_0__" @@ -143,6 +165,8 @@ layers { input_layer_name: "__block_expand_layer_0__" input_parameter_name: "___fc_layer_0__.w0" } + height: 24 + width: 24 } parameters { name: "___conv_0__.w0" @@ -164,9 +188,9 @@ parameters { } parameters { name: "___conv_1__.w0" - size: 36864 + size: 9216 initial_mean: 0.0 - initial_std: 0.0833333333333 + initial_std: 0.166666666667 initial_strategy: 0 initial_smart: false } diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_spp_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_spp_layer.protostr index 8b0a8f214..ca1b2d8cf 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_spp_layer.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_spp_layer.protostr @@ -4,6 +4,8 @@ layers { type: "data" size: 3200 active_type: "" + height: 20 + width: 10 } layers { name: "__spp_0__" @@ -13,13 +15,17 @@ layers { inputs { input_layer_name: "data" spp_conf { + image_conf { + channels: 16 + img_size: 10 + img_size_y: 20 + } pool_type: "max-projection" pyramid_height: 2 - channels: 16 - img_size: 10 - img_size_y: 20 } } + height: 1 + width: 5 } input_layer_names: "data" output_layer_names: "__spp_0__" diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py b/python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py index e15a55b41..be83f4f83 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py @@ -17,7 +17,7 @@ bilinear = bilinear_interp_layer(input=conv, out_size_x=64, out_size_y=64) pool = img_pool_layer( input=bilinear, - num_channels=4, + num_channels=16, pool_size=2, stride=2, pool_type=MaxPooling()) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py b/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py index 081430d71..eb14270ba 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py @@ -2,7 +2,7 @@ from paddle.trainer_config_helpers import * settings(batch_size=1000, learning_rate=1e-5) -data = data_layer(name='data', size=2304) +data = data_layer(name='data', size=2304, height=48, width=48) conv = img_conv_layer( input=data, @@ -21,16 +21,21 @@ pool = img_pool_layer( conv2 = img_conv_layer( input=pool, filter_size=3, - num_channels=32, + num_channels=8, num_filters=128, padding=1, act=LinearActivation(), bias_attr=True) -maxout2 
= maxout_layer(input=conv, num_channels=128, groups=4) +maxout2 = maxout_layer(input=conv2, num_channels=128, groups=4) block = block_expand_layer( - input=maxout, num_channels=32, stride_x=1, stride_y=1, block_x=1, block_y=6) + input=maxout2, + num_channels=32, + stride_x=1, + stride_y=1, + block_x=1, + block_y=6) fc = fc_layer(input=block, size=384, bias_attr=False) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py index e20ffb584..e0b0d0d3b 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py @@ -2,13 +2,9 @@ from paddle.trainer_config_helpers import * settings(batch_size=100, learning_rate=1e-5) -data = data_layer(name='data', size=3200) +data = data_layer(name='data', size=3200, height=20, width=10) spp = spp_layer( - input=data, - pyramid_height=2, - num_channels=16, - pool_type=MaxPooling(), - img_width=10) + input=data, pyramid_height=2, num_channels=16, pool_type=MaxPooling()) outputs(spp) -- GitLab
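[Editor's note] Putting the pieces together, the updated test configs show the intended usage: the rectangular shape is declared once on data_layer() and propagated to downstream layers, so img_pool_layer and spp_layer no longer take an img_width argument. A minimal config in the style of test_spp_layer.py above:

from paddle.trainer_config_helpers import *

settings(batch_size=100, learning_rate=1e-5)

# 16 channels * height 20 * width 10 = 3200; height/width are the new
# data_layer arguments introduced by this patch.
data = data_layer(name='data', size=3200, height=20, width=10)

# The 20 x 10 shape is read from the input, so no img_width is passed.
spp = spp_layer(
    input=data, pyramid_height=2, num_channels=16, pool_type=MaxPooling())

outputs(spp)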