diff --git a/paddle/gserver/layers/BatchNormBaseLayer.cpp b/paddle/gserver/layers/BatchNormBaseLayer.cpp
index 6381f20a63c6b4ca24245cd6f30e4defda279de6..fd534b2ac406d4c9a112c1098be84484f980f651 100644
--- a/paddle/gserver/layers/BatchNormBaseLayer.cpp
+++ b/paddle/gserver/layers/BatchNormBaseLayer.cpp
@@ -60,14 +60,12 @@ bool BatchNormBaseLayer::init(const LayerMap& layerMap,
 void BatchNormBaseLayer::calFeatureMapSize() {
   const ImageConfig& conf = config_.inputs(0).image_conf();
-  if (inputLayers_[0]->getOutput().getFrameHeight() == 0 &&
-      inputLayers_[0]->getOutput().getFrameWidth() == 0) {
-    imgSize_ = conf.img_size();
-    imageH_ = imgSize_;
-    imageW_ = imgSize_;
+  imageH_ = inputLayers_[0]->getOutput().getFrameHeight();
+  imageW_ = inputLayers_[0]->getOutput().getFrameWidth();
+  if (imageH_ == 0 && imageW_ == 0) {
+    imageH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
+    imageW_ = conf.img_size();
   } else {
-    imageH_ = inputLayers_[0]->getOutput().getFrameHeight();
-    imageW_ = inputLayers_[0]->getOutput().getFrameWidth();
     getOutput().setFrameHeight(imageH_);
     getOutput().setFrameWidth(imageW_);
   }
diff --git a/paddle/gserver/layers/BatchNormBaseLayer.h b/paddle/gserver/layers/BatchNormBaseLayer.h
index d65882d39df2bb93920dad37ebc78342e31aef85..f956646a6dca7a5b053e5d034866b659d90539d0 100644
--- a/paddle/gserver/layers/BatchNormBaseLayer.h
+++ b/paddle/gserver/layers/BatchNormBaseLayer.h
@@ -77,9 +77,8 @@ protected:
   MatrixPtr savedMean_;
   MatrixPtr savedInvVar_;
-  /// Height or width of input image feature, now height is equal to width.
-  /// imgSize is 1 if the input is fully-connected layer.
-  int imgSize_;
+  /// Height and width of the input image feature.
+  /// Both are 1 if the input is a fully-connected layer.
   int imageH_;
   int imageW_;
   /// Height * Width.
diff --git a/paddle/gserver/layers/BilinearInterpLayer.cpp b/paddle/gserver/layers/BilinearInterpLayer.cpp
index c30e26dc031378ce792534c5eec6c24fc0d20ef9..11028290dcd1015c1bc51d4c34655f527f55346d 100644
--- a/paddle/gserver/layers/BilinearInterpLayer.cpp
+++ b/paddle/gserver/layers/BilinearInterpLayer.cpp
@@ -26,15 +26,15 @@ size_t BilinearInterpLayer::getSize() {
   const BilinearInterpConfig& conf = config_.inputs(0).bilinear_interp_conf();
   if (inImgH_ == 0) {
-    inImgH_ = conf.img_size_y();
+    inImgH_ = conf.image_conf().img_size_y();
   }
   if (inImgW_ == 0) {
-    inImgW_ = conf.img_size_x();
+    inImgW_ = conf.image_conf().img_size();
   }
   outImgH_ = conf.out_size_y();
   outImgW_ = conf.out_size_x();
-  numChannels_ = conf.num_channels();
+  numChannels_ = conf.image_conf().channels();
   CHECK(outImgH_ > 0 && outImgW_ > 0);
   CHECK(inImgH_ > 0 && inImgW_ > 0);
diff --git a/paddle/gserver/layers/ConvBaseLayer.cpp b/paddle/gserver/layers/ConvBaseLayer.cpp
index 7637e245a38959220f0d1d52e1f705d86a7c7303..b5a2f8b8e10e6f81d06e9722c09c5d43b1620ad1 100644
--- a/paddle/gserver/layers/ConvBaseLayer.cpp
+++ b/paddle/gserver/layers/ConvBaseLayer.cpp
@@ -38,11 +38,12 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
     filterSizeY_.push_back(conf.filter_size_y());
     filterPixels_.push_back(filterSize_.back() * filterSizeY_.back());
     channels_.push_back(conf.channels());
-    imgSizeH_.push_back(conf.img_size());
+    imgSizeH_.push_back(conf.has_img_size_y() ? conf.img_size_y()
+                                              : conf.img_size());
     imgSizeW_.push_back(conf.img_size());
     groups_.push_back(conf.groups());
     filterChannels_.push_back(conf.filter_channels());
-    outputH_.push_back(conf.output_x());
+    outputH_.push_back(conf.has_output_y() ?
conf.output_y() : conf.output_x()); outputW_.push_back(conf.output_x()); } @@ -91,16 +92,19 @@ size_t ConvBaseLayer::calOutputSize() { for (size_t i = 0; i < inputLayers_.size(); i++) { inH.push_back(inputLayers_[i]->getOutput().getFrameHeight()); inW.push_back(inputLayers_[i]->getOutput().getFrameWidth()); + const ConvConfig& conf = config_.inputs(i).conv_conf(); if (isDeconv_) { - if (inH[i] == 0) inH[i] = config_.inputs(i).conv_conf().output_x(); - if (inW[i] == 0) inW[i] = config_.inputs(i).conv_conf().output_x(); + if (inH[i] == 0) + inH[i] = conf.has_output_y() ? conf.output_y() : conf.output_x(); + if (inW[i] == 0) inW[i] = conf.output_x(); outH.push_back(imageSize( inH[i], filterSizeY_[i], paddingY_[i], strideY_[i], caffeMode_)); outW.push_back(imageSize( inW[i], filterSize_[i], padding_[i], stride_[i], caffeMode_)); } else { - if (inH[i] == 0) inH[i] = config_.inputs(i).conv_conf().img_size(); - if (inW[i] == 0) inW[i] = config_.inputs(i).conv_conf().img_size(); + if (inH[i] == 0) + inH[i] = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); + if (inW[i] == 0) inW[i] = conf.img_size(); outH.push_back(outputSize( inH[i], filterSizeY_[i], paddingY_[i], strideY_[i], caffeMode_)); outW.push_back(outputSize( diff --git a/paddle/gserver/layers/ConvOperator.cpp b/paddle/gserver/layers/ConvOperator.cpp index 9b8e18b1ba2a4502bcdcecade94ec3e29730595c..dc06c89dab2524d9b640bfd88f3b3f3ce0117711 100644 --- a/paddle/gserver/layers/ConvOperator.cpp +++ b/paddle/gserver/layers/ConvOperator.cpp @@ -93,9 +93,9 @@ private: bool caffeMode_; int inputOffset_, outputOffset_, weightOffset_; int numFilters_; - int padding_, stride_, filterSize_, channels_, imgSize_; + int padding_, stride_, filterSize_, channels_, imgSize_, imgSizeY_; int paddingY_, strideY_, filterSizeY_; - int imgPixels_, filterPixels_, filterChannels_, outputX_, outputs_; + int imgPixels_, filterPixels_, filterChannels_, outputX_, outputY_, outputs_; /// Following member variables are same with CudnnConvLayer. /// There is no explanation here. @@ -144,7 +144,7 @@ void ConvOperator::allocConvWorkSpace(size_t maxWorkSpace) { void ConvOperator::reshape(int batchSize) { imageH_ = ins_[0]->getFrameHeight(); imageW_ = ins_[0]->getFrameWidth(); - if (imageH_ == 0) imageH_ = imgSize_; + if (imageH_ == 0) imageH_ = imgSizeY_; if (imageW_ == 0) imageW_ = imgSize_; outputH_ = outputSize(imageH_, filterSizeY_, paddingY_, strideY_, caffeMode_); outputW_ = outputSize(imageW_, filterSize_, padding_, stride_, caffeMode_); @@ -182,7 +182,10 @@ void ConvOperator::computeConvSizes() { hl_create_tensor_descriptor(&inputDesc_); int outputX = outputSize(imgSize_, filterSize_, padding_, stride_, caffeMode_); + int outputY = + outputSize(imgSizeY_, filterSizeY_, paddingY_, strideY_, caffeMode_); CHECK_EQ(outputX, outputX_); + CHECK_EQ(outputY, outputY_); hl_create_tensor_descriptor(&outputDesc_); hl_create_convolution_descriptor(&convDesc_, inputDesc_, @@ -236,10 +239,12 @@ void ConvOperator::getConvParams() { filterPixels_ = filterSize_ * filterSizeY_; channels_ = conf.channels(); imgSize_ = conf.img_size(); - imgPixels_ = imgSize_ * imgSize_; + imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); + imgPixels_ = imgSize_ * imgSizeY_; CHECK_EQ(conf.groups(), 1U); filterChannels_ = conf.filter_channels(); outputX_ = conf.output_x(); + outputY_ = conf.has_output_y() ? 
conf.output_y() : conf.output_x());
-  outputs_ = outputX_ * outputX_;
+  outputs_ = outputX_ * outputY_;
 }
diff --git a/paddle/gserver/layers/ConvProjection.cpp b/paddle/gserver/layers/ConvProjection.cpp
index 946d249dc5d9a53cc970a54e6189786b3ae358c1..5a68fb08da3d742fe6067a8be00b831230e6b0af 100644
--- a/paddle/gserver/layers/ConvProjection.cpp
+++ b/paddle/gserver/layers/ConvProjection.cpp
@@ -46,7 +46,7 @@ void ConvProjection::getConvParams() {
   filterH_ = conf.filter_size_y();
   filterW_ = conf.filter_size();
-  configImgH_ = conf.img_size();
+  configImgH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
   configImgW_ = conf.img_size();
   channels_ = conf.channels();
@@ -58,9 +58,11 @@
 }
 void ConvProjection::initCudnn() {
-  hl_create_filter_descriptor(
-      &filterDesc_, channels_ / groups_, numFilters_ / groups_,
-      filterH_, filterW_);
+  hl_create_filter_descriptor(&filterDesc_,
+                              channels_ / groups_,
+                              numFilters_ / groups_,
+                              filterH_,
+                              filterW_);
   hl_create_tensor_descriptor(&inputDesc_);
   hl_create_tensor_descriptor(&outputDesc_);
   hl_create_convolution_descriptor(&convDesc_,
diff --git a/paddle/gserver/layers/DataLayer.cpp b/paddle/gserver/layers/DataLayer.cpp
index 9a4b2e9d3e256119f3ff24cfcb80d68c81f67c65..67c49230367d8597860e3c32df434a16944f5daa 100644
--- a/paddle/gserver/layers/DataLayer.cpp
+++ b/paddle/gserver/layers/DataLayer.cpp
@@ -49,8 +49,13 @@ void DataLayer::copyDataToOutput(Argument& output) {
       output.ids->copyFrom(*data_.ids);
     }
   }
-  output.setFrameHeight(data_.getFrameHeight());
-  output.setFrameWidth(data_.getFrameWidth());
+  if (config_.height() && config_.width()) {
+    output.setFrameHeight(config_.height());
+    output.setFrameWidth(config_.width());
+  } else {
+    output.setFrameHeight(data_.getFrameHeight());
+    output.setFrameWidth(data_.getFrameWidth());
+  }
   output.cpuSequenceDims = data_.cpuSequenceDims;
   output.sequenceStartPositions = data_.sequenceStartPositions;
   output.subSequenceStartPositions = data_.subSequenceStartPositions;
diff --git a/paddle/gserver/layers/ExpandConvBaseLayer.cpp b/paddle/gserver/layers/ExpandConvBaseLayer.cpp
index a9b5b916a1f0d22ff46dc6795053f44e3e3af09e..3724609720c97b66d7d1779a0c892628c5d13a44 100644
--- a/paddle/gserver/layers/ExpandConvBaseLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvBaseLayer.cpp
@@ -29,17 +29,19 @@ bool ExpandConvBaseLayer::init(const LayerMap &layerMap,
    * meaning as in conv, we need to swap channels_ and numFilters here for
    * convTrans, and in other functions too.
    * */
-  int channel;
-  int numFilters;
+  /* Initialize the projection */
   for (auto &inputConfig : config_.inputs()) {
     const ConvConfig &conf = inputConfig.conv_conf();
-    numFilters = isDeconv_ ? conf.channels() : numFilters_;
+    int numFilters = isDeconv_ ? conf.channels() : numFilters_;
     subM_.push_back(numFilters / conf.groups());
-    subN_.push_back(conf.output_x() * conf.output_x());
-    channel = isDeconv_ ? numFilters_ : conf.channels();
-    subK_.push_back(channel * conf.filter_size() * conf.filter_size() /
-                    conf.groups());
+    subN_.push_back(conf.output_x() *
+                    (conf.has_output_y() ? conf.output_y() : conf.output_x()));
+    int channel = isDeconv_ ? numFilters_ : conf.channels();
+    subK_.push_back(
+        channel * conf.filter_size() *
+        (conf.has_filter_size_y() ?
conf.filter_size_y() : conf.filter_size()) / + conf.groups()); /* Consistent caffe mode for multiple input */ caffeMode_ = conf.caffe_mode(); } @@ -116,11 +118,11 @@ void ExpandConvBaseLayer::expandOneFrame(MatrixPtr image, imgSizeH_[inIdx], imgSizeW_[inIdx], channel, + filterSizeY_[inIdx], filterSize_[inIdx], - filterSize_[inIdx], + strideY_[inIdx], stride_[inIdx], - stride_[inIdx], - padding_[inIdx], + paddingY_[inIdx], padding_[inIdx], outputH_[inIdx], outputW_[inIdx]); @@ -208,11 +210,11 @@ void ExpandConvBaseLayer::bpropActs(MatrixPtr out, imgSizeH_[inpIdx], imgSizeW_[inpIdx], channel, + filterSizeY_[inpIdx], filterSize_[inpIdx], - filterSize_[inpIdx], - stride_[inpIdx], + strideY_[inpIdx], stride_[inpIdx], - padding_[inpIdx], + paddingY_[inpIdx], padding_[inpIdx], outputH_[inpIdx], outputW_[inpIdx], diff --git a/paddle/gserver/layers/MaxOutLayer.cpp b/paddle/gserver/layers/MaxOutLayer.cpp index a3de069bf7a6c9217e4adfeb2e65409955cc569c..b7f1b98041355624edbc1b480868079887264467 100644 --- a/paddle/gserver/layers/MaxOutLayer.cpp +++ b/paddle/gserver/layers/MaxOutLayer.cpp @@ -25,10 +25,10 @@ size_t MaxOutLayer::getSize() { imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight(); imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth(); if (imgSizeH_ == 0) { - imgSizeH_ = maxoutConf.img_size_y(); + imgSizeH_ = maxoutConf.image_conf().img_size_y(); } if (imgSizeW_ == 0) { - imgSizeW_ = maxoutConf.img_size_x(); + imgSizeW_ = maxoutConf.image_conf().img_size(); } featLen_ = imgSizeH_ * imgSizeW_; @@ -50,7 +50,7 @@ bool MaxOutLayer::init(const LayerMap& layerMap, const MaxOutConfig& conf = config_.inputs(0).maxout_conf(); groups_ = conf.groups(); - channels_ = conf.channels(); + channels_ = conf.image_conf().channels(); CHECK_EQ(channels_ % groups_, 0UL); outputChannels_ = channels_ / groups_; diff --git a/paddle/gserver/layers/NormLayer.cpp b/paddle/gserver/layers/NormLayer.cpp index 7f6ffe229842113869b4f2d61d59cdc0f4e1ddf8..445a1a0c52ed65a6321a265b158388f2d59e4722 100644 --- a/paddle/gserver/layers/NormLayer.cpp +++ b/paddle/gserver/layers/NormLayer.cpp @@ -48,6 +48,9 @@ bool ResponseNormLayer::init(const LayerMap& layerMap, outputX_ = conf.output_x(); imgSize_ = conf.img_size(); denoms_ = NULL; + + outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x(); + imgSizeY_ = conf.has_img_size_y() ? 
conf.img_size_y() : conf.img_size(); return true; } diff --git a/paddle/gserver/layers/NormLayer.h b/paddle/gserver/layers/NormLayer.h index 9e848e5268d6b4b69f24802b66c5fed7cc1bf9e4..fcc57849d6b86df8f175184451a0fd459ce9ec28 100644 --- a/paddle/gserver/layers/NormLayer.h +++ b/paddle/gserver/layers/NormLayer.h @@ -49,7 +49,7 @@ public: */ class ResponseNormLayer : public NormLayer { protected: - size_t channels_, size_, outputX_, imgSize_; + size_t channels_, size_, outputX_, imgSize_, outputY_, imgSizeY_; float scale_, pow_; MatrixPtr denoms_; diff --git a/paddle/gserver/layers/NormProjectionLayer.cpp b/paddle/gserver/layers/NormProjectionLayer.cpp index 6ac468e6fc7c2962beaf8c28192890634340b296..da36cc2c9913796b65c451a5c4928143168a1104 100644 --- a/paddle/gserver/layers/NormProjectionLayer.cpp +++ b/paddle/gserver/layers/NormProjectionLayer.cpp @@ -23,7 +23,7 @@ size_t CMRProjectionNormLayer::getSize() { imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight(); imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth(); if (imgSizeH_ == 0) { - imgSizeH_ = imgSize_; + imgSizeH_ = imgSizeY_; } if (imgSizeW_ == 0) { imgSizeW_ = imgSize_; diff --git a/paddle/gserver/layers/SpatialPyramidPoolLayer.cpp b/paddle/gserver/layers/SpatialPyramidPoolLayer.cpp index 9609919695853552ed54d8d55e8a669002fa3147..dce660a5bca792e99a16e187aaa4aa10187830ac 100644 --- a/paddle/gserver/layers/SpatialPyramidPoolLayer.cpp +++ b/paddle/gserver/layers/SpatialPyramidPoolLayer.cpp @@ -56,14 +56,14 @@ ProjectionConfig SpatialPyramidPoolLayer::getConfig(size_t imgSizeW, size_t SpatialPyramidPoolLayer::getSize() { CHECK_EQ(inputLayers_.size(), 1UL); size_t layerSize = 0; - const SppConfig& sppConf = config_.inputs(0).spp_conf(); + const ImageConfig& conf = config_.inputs(0).spp_conf().image_conf(); imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight(); imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth(); if (imgSizeH_ == 0) { - imgSizeH_ = sppConf.has_img_size_y() ? sppConf.img_size_y() : imgSizeW_; + imgSizeH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); } if (imgSizeW_ == 0) { - imgSizeW_ = sppConf.img_size(); + imgSizeW_ = conf.img_size(); } size_t outputH = 1; @@ -82,9 +82,10 @@ bool SpatialPyramidPoolLayer::init(const LayerMap& layerMap, pyramidHeight_ = sppConf.pyramid_height(); poolType_ = sppConf.pool_type(); - channels_ = sppConf.channels(); - imgSizeW_ = sppConf.img_size(); - imgSizeH_ = sppConf.has_img_size_y() ? sppConf.img_size_y() : imgSizeW_; + const ImageConfig& imageConf = sppConf.image_conf(); + channels_ = imageConf.channels(); + imgSizeW_ = imageConf.img_size(); + imgSizeH_ = imageConf.has_img_size_y() ? 
imageConf.img_size_y() : imgSizeW_; poolProjections_.reserve(pyramidHeight_); projCol_.reserve(pyramidHeight_); projOutput_.resize(pyramidHeight_); diff --git a/paddle/gserver/tests/img_pool_a.conf b/paddle/gserver/tests/img_pool_a.conf index 5938e7611201c9a4e3b44ca8aae2f39a80b1ff3b..9bd046b533de8200e6c945d1752ce240508b6338 100644 --- a/paddle/gserver/tests/img_pool_a.conf +++ b/paddle/gserver/tests/img_pool_a.conf @@ -28,7 +28,6 @@ maxpool = img_pool_layer(input=conv, stride_y=2, padding=1, padding_y=2, - img_width=16, pool_type=MaxPooling(), ) avgpool = img_pool_layer(input=conv, @@ -39,7 +38,6 @@ avgpool = img_pool_layer(input=conv, stride_y=2, padding=1, padding_y=2, - img_width=16, pool_type=AvgPooling(), ) diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 55446ea824e30c08fa1fb2beb7e557be21565aea..099e96aa6c7439adc76248fcbb88cf24e7496ed4 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -202,16 +202,15 @@ void testProjectionConv(size_t groups) { conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS); conf.set_output_size(output_x * output_y * NUM_FILTERS); - testProjectionGrad( - conf, - INPUT_DATA, - /* parameterSize */ NUM_FILTERS * CHANNELS * FILTER_SIZE * FILTER_SIZE_Y - / groups, - /* batchSize */ 100, - true, - false, - NUM_FILTERS, - true); + testProjectionGrad(conf, + INPUT_DATA, + /* parameterSize */ NUM_FILTERS * CHANNELS * FILTER_SIZE * + FILTER_SIZE_Y / groups, + /* batchSize */ 100, + true, + false, + NUM_FILTERS, + true); } #ifndef PADDLE_ONLY_CPU @@ -229,9 +228,10 @@ TEST(Layer, BilinearInterpLayer) { LayerInputConfig* input = config.layerConfig.add_inputs(); BilinearInterpConfig* bilinear = input->mutable_bilinear_interp_conf(); - bilinear->set_img_size_x(32); - bilinear->set_img_size_y(32); - bilinear->set_num_channels(4); + ImageConfig* image = bilinear->mutable_image_conf(); + image->set_img_size(32); + image->set_img_size_y(32); + image->set_channels(4); for (auto useGpu : {false, true}) { for (auto outSize : {32, 64}) { @@ -354,7 +354,7 @@ void testConvLayer(const string& type, bool trans, bool useGpu) { config.layerConfig.set_partial_sum(1); config.layerConfig.set_shared_biases(true); - config.inputDefs.push_back({INPUT_DATA, "layer_0", 768, 288}); + config.inputDefs.push_back({INPUT_DATA, "layer_0", 384, 288}); LayerInputConfig* input = config.layerConfig.add_inputs(); ConvConfig* conv = input->mutable_conv_conf(); conv->set_filter_size(2); @@ -367,12 +367,18 @@ void testConvLayer(const string& type, bool trans, bool useGpu) { conv->set_groups(1); conv->set_filter_channels(conv->channels() / conv->groups()); conv->set_img_size(16); + conv->set_img_size_y(8); conv->set_output_x(outputSize(conv->img_size(), conv->filter_size(), conv->padding(), conv->stride(), /* caffeMode */ true)); - config.layerConfig.set_size(conv->output_x() * conv->output_x() * + conv->set_output_y(outputSize(conv->img_size_y(), + conv->filter_size_y(), + conv->padding_y(), + conv->stride_y(), + /* caffeMode */ true)); + config.layerConfig.set_size(conv->output_x() * conv->output_y() * config.layerConfig.num_filters()); testLayerGrad(config, "conv", 100, trans, useGpu); @@ -472,10 +478,11 @@ TEST(Layer, maxoutLayer) { config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0}); LayerInputConfig* input = config.layerConfig.add_inputs(); MaxOutConfig* maxout = input->mutable_maxout_conf(); + ImageConfig* image = maxout->mutable_image_conf(); - maxout->set_img_size_x(32); - 
maxout->set_img_size_y(32); - maxout->set_channels(4); + image->set_img_size(32); + image->set_img_size_y(32); + image->set_channels(4); maxout->set_groups(2); for (auto useGpu : {false, true}) { @@ -987,7 +994,7 @@ void testNormLayer(const string& normType, bool trans, bool useGpu) { config.layerConfig.set_type("norm"); config.layerConfig.set_active_type("relu"); - config.inputDefs.push_back({INPUT_DATA, "layer_0", 3136, 0}); + config.inputDefs.push_back({INPUT_DATA, "layer_0", 1568, 0}); LayerInputConfig* input = config.layerConfig.add_inputs(); NormConfig* norm = input->mutable_norm_conf(); norm->set_norm_type(normType); @@ -997,7 +1004,9 @@ void testNormLayer(const string& normType, bool trans, bool useGpu) { norm->set_pow(0.75); norm->set_blocked(0); norm->set_img_size(14); + norm->set_img_size_y(7); norm->set_output_x(norm->img_size()); + norm->set_output_y(norm->img_size_y()); if (norm->norm_type() == "cmrnorm" || norm->norm_type() == "cmrnorm-projection") { norm->set_scale(norm->scale() / norm->size()); @@ -1005,7 +1014,7 @@ void testNormLayer(const string& normType, bool trans, bool useGpu) { norm->set_scale(norm->scale() / (norm->size() * norm->size())); } - config.layerConfig.set_size(norm->output_x() * norm->output_x() * + config.layerConfig.set_size(norm->output_x() * norm->output_y() * norm->channels()); config.biasSize = 0; @@ -1106,11 +1115,12 @@ void testSppLayer(const string& poolType, SppConfig* sppConfig = input->mutable_spp_conf(); sppConfig->set_pool_type(poolType); sppConfig->set_pyramid_height(pyramidHeight); - sppConfig->set_channels(16); - sppConfig->set_img_size(10); - sppConfig->set_img_size_y(20); + ImageConfig* imageConfig = sppConfig->mutable_image_conf(); + imageConfig->set_channels(16); + imageConfig->set_img_size(10); + imageConfig->set_img_size_y(20); int outputSize = (std::pow(4, sppConfig->pyramid_height()) - 1) / (4 - 1); - config.layerConfig.set_size(outputSize * sppConfig->channels()); + config.layerConfig.set_size(outputSize * imageConfig->channels()); testLayerGrad(config, "spp", 100, trans, useGpu); } @@ -1420,13 +1430,15 @@ void testBatchNormLayer(const string& type, bool trans, bool useGpu) { TestConfig config; const int CHANNELS = 10; const int IMG_SIZE = 16; + const int IMG_SIZE_Y = 8; + size_t size = CHANNELS * IMG_SIZE * IMG_SIZE_Y; config.layerConfig.set_type(type); - config.layerConfig.set_size(CHANNELS * IMG_SIZE * IMG_SIZE); + config.layerConfig.set_size(size); config.layerConfig.set_active_type("sigmoid"); config.biasSize = CHANNELS; config.inputDefs.push_back({INPUT_DATA, "layer_0", - /* dim= */ IMG_SIZE * IMG_SIZE * CHANNELS, + /* dim= */ size, /* paraSize= */ CHANNELS}); config.inputDefs.push_back({INPUT_DATA, "layer_1_running_mean", 1, CHANNELS}); @@ -1441,6 +1453,7 @@ void testBatchNormLayer(const string& type, bool trans, bool useGpu) { ImageConfig* img_conf = input->mutable_image_conf(); img_conf->set_channels(CHANNELS); img_conf->set_img_size(IMG_SIZE); + img_conf->set_img_size_y(IMG_SIZE_Y); testLayerGrad(config, "batch_norm", @@ -1467,6 +1480,7 @@ TEST(Operator, conv) { const int FILTER_SIZE_Y = 3; const int CHANNELS = 3; const int IMAGE_SIZE = 16; + const int IMAGE_SIZE_Y = 8; OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs(); operatorConf.set_type("conv"); ConvConfig* conv = operatorConf.mutable_conv_conf(); @@ -1481,19 +1495,22 @@ TEST(Operator, conv) { conv->set_groups(1); conv->set_filter_channels(conv->channels() / conv->groups()); conv->set_img_size(IMAGE_SIZE); - int output_x = 
outputSize(conv->img_size(), - conv->filter_size(), - conv->padding(), - conv->stride(), - /* caffeMode */ true); - conv->set_output_x(output_x); - config.layerConfig.set_size(output_x * output_x * - config.layerConfig.num_filters()); - config.layerConfig.set_size(conv->output_x() * conv->output_x() * + conv->set_img_size_y(IMAGE_SIZE_Y); + conv->set_output_x(outputSize(conv->img_size(), + conv->filter_size(), + conv->padding(), + conv->stride(), + /* caffeMode */ true)); + conv->set_output_y(outputSize(conv->img_size_y(), + conv->filter_size_y(), + conv->padding_y(), + conv->stride_y(), + /* caffeMode */ true)); + config.layerConfig.set_size(conv->output_x() * conv->output_y() * NUM_FILTERS); config.inputDefs.push_back( - {INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE * CHANNELS, 0}); + {INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE_Y * CHANNELS, 0}); config.inputDefs.push_back( {INPUT_DATA, "layer_1", diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp index 81d53f065b84b2699141fc599b9efba794bbd25a..0f414b4463b6993ca7bf0bc1eafebbbf9f1a8e00 100644 --- a/paddle/parameter/Argument.cpp +++ b/paddle/parameter/Argument.cpp @@ -225,6 +225,8 @@ void Argument::resizeAndCopyFrom(const Argument& src, } resizeAndCopy(udp, src.udp, useGpu, stream); resizeAndCopy(strs, src.strs, useGpu, stream); + frameWidth = src.frameWidth; + frameHeight = src.frameHeight; } int32_t Argument::resizeAndCopyFrom(const Argument& src, diff --git a/paddle/trainer/tests/test_config.conf b/paddle/trainer/tests/test_config.conf index 664e18cb986811ffca2a4865c5f50045ace122e1..2a4548896ffe0770f48b6c375c41eaf452b19366 100644 --- a/paddle/trainer/tests/test_config.conf +++ b/paddle/trainer/tests/test_config.conf @@ -59,7 +59,6 @@ pool = img_pool_layer(input=fc2, padding_y=2, stride=2, stride_y=3, - img_width=3, pool_type=CudnnAvgPooling()) concat = concat_layer(input=[fc3, fc4]) diff --git a/proto/ModelConfig.proto.m4 b/proto/ModelConfig.proto.m4 index 68a5eb9dd2231b47cc8f83696ab18fdb907c44c0..4772f6b8d662bebf22cb781c9999af8bebbc7abe 100644 --- a/proto/ModelConfig.proto.m4 +++ b/proto/ModelConfig.proto.m4 @@ -77,6 +77,12 @@ message ConvConfig { required uint32 filter_size_y = 10; required uint32 padding_y = 11; required uint32 stride_y = 12; + + // if not set, use output_x + optional uint32 output_y = 13; + + // if not set, use img_size + optional uint32 img_size_y = 14; } message PoolConfig { @@ -122,11 +128,9 @@ message PoolConfig { } message SppConfig { - required string pool_type = 1; - required uint32 pyramid_height = 2; - required uint32 channels = 3; - required uint32 img_size = 4; - optional uint32 img_size_y = 5; + required ImageConfig image_conf = 1; + required string pool_type = 2; + required uint32 pyramid_height = 3; } message NormConfig { @@ -156,6 +160,12 @@ message NormConfig { // fixed window: shared a fixed window for each value // sliding window: have a different window for each value optional bool blocked = 8; + + // if not set, use output_x + optional uint32 output_y = 9; + + // if not set, use img_size + optional uint32 img_size_y = 10; } message BlockExpandConfig { @@ -180,12 +190,8 @@ message BlockExpandConfig { } message MaxOutConfig { - required uint32 channels = 1; + required ImageConfig image_conf = 1; required uint32 groups = 2; - - // The size of input feature map. - required uint32 img_size_x = 3; - required uint32 img_size_y = 4; } message ProjectionConfig { @@ -226,12 +232,10 @@ message OperatorConfig { message BilinearInterpConfig { // The size of input feature map. 
-  optional uint32 img_size_x = 1;
-  optional uint32 img_size_y = 2;
+  required ImageConfig image_conf = 1;
   // The size of output feature map.
-  required uint32 out_size_x = 3;
-  required uint32 out_size_y = 4;
-  required uint32 num_channels = 5;
+  required uint32 out_size_x = 2;
+  required uint32 out_size_y = 3;
 }
 message ImageConfig {
@@ -241,6 +245,7 @@
   // The size of input feature map.
   required uint32 img_size = 8;
+  required uint32 img_size_y = 9;
 }
 message LayerInputConfig {
@@ -413,7 +418,10 @@ sinclude(`ModelConfigLayer.proto.m4')
   // string type is used for flexibility: different types can be converted
   // to string and reinterpreted in the user's own layer implementation.
   optional string user_arg = 49;
-
+
+  // to indicate rectangular image data
+  optional uint64 height = 50;
+  optional uint64 width = 51;
 }
 message EvaluatorConfig {
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 10467e331d0915781ef832feab622196fcea44ab..a19a9a7cd658ce50d17c40c030bdfb8585e1460c 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -138,7 +138,14 @@ def init_config_environment(
         g_root_submodel=None,
         g_submodel_map={},
         g_submodel_stack=[],
-        g_add_submodel_suffix=False, ):
+        g_add_submodel_suffix=False,
+
+        # Whether the current layer needs to pass the image height and width.
+        # It defaults to true, but is set to false inside a
+        # recurrent_layer_group, where the image is converted to a sequence:
+        # the image height becomes the sequence length and the image width
+        # becomes the feature length of each timestep.
+        g_pass_height_width=True, ):
     for k, v in locals().iteritems():
         globals()[k] = copy.deepcopy(v)
@@ -686,9 +693,9 @@ class ConvProjection(Projection):
         parse_conv(conv_conf, input_layer_name, self.proj_conf.conv_conf,
                    num_filters)
-        # TODO: support rectangle input
-        self.proj_conf.output_size = (self.proj_conf.conv_conf.output_x
-                                      **2) * num_filters
+        self.proj_conf.output_size = self.proj_conf.conv_conf.output_x * \
+                                     self.proj_conf.conv_conf.output_y * \
+                                     num_filters
     def calc_output_size(self, input_layer_config):
         return self.proj_conf.output_size
@@ -764,8 +771,9 @@ class ConvOperator(Operator):
         parse_conv(conv_conf,
                    MakeLayerNameInSubmodel(input_layer_names[0]),
                    self.operator_conf.conv_conf, num_filters)
-        self.operator_conf.output_size = (self.operator_conf.conv_conf.output_x
-                                          **2) * num_filters
+        self.operator_conf.output_size = self.operator_conf.conv_conf.output_x * \
+                                         self.operator_conf.conv_conf.output_y * \
+                                         num_filters
         config_assert(len(input_layer_names) == 2, "Conv is binary operator")
@@ -800,14 +808,12 @@ class Conv(Cfg):
             config_assert(output_x <= 0)

-# please refer to the comments in proto/ModelConfig.proto
 @config_class
 class BilinearInterp(Cfg):
-    def __init__(self, out_size_x=None, out_size_y=None, num_channels=None):
+    def __init__(self, out_size_x=None, out_size_y=None, channels=None):
         self.add_keys(locals())

-# please refer to the comments in proto/ModelConfig.proto
 @config_class
 class Pool(Cfg):
     def __init__(
@@ -825,14 +831,12 @@
         self.add_keys(locals())

-# please refer to the comments in proto/ModelConfig.proto
 @config_class
 class SpatialPyramidPool(Cfg):
-    def __init__(self, pool_type, pyramid_height, channels, img_width=None):
+    def __init__(self, pool_type, pyramid_height, channels):
         self.add_keys(locals())

-# please refer to the comments in proto/ModelConfig.proto
 @config_class
 class Norm(Cfg):
     def __init__(self,
@@ -847,7
+851,6 @@ class Norm(Cfg):
         self.add_keys(locals())

-# please refer to the comments in proto/ModelConfig.proto
 @config_class
 class Image(Cfg):
     def __init__(self, channels, img_size=None):
@@ -1054,18 +1057,8 @@ def TestData(data_config, async_load_data=None):
     g_config.test_data_config.async_load_data = async_load_data

-def parse_bilinear(bilinear, input_layer_name, bilinear_conf):
-    bilinear_conf.out_size_x = bilinear.out_size_x
-    bilinear_conf.out_size_y = bilinear.out_size_y
-    bilinear_conf.num_channels = bilinear.num_channels
-
-
-'''
-caffe_mode: compute the output size using floor instead of ceil,
- which is consistent of caffe and CuDNN's convention.
-'''
-
-
+#caffe_mode: compute the output size using floor instead of ceil,
+# which is consistent with caffe's and CuDNN's convention.
 def cnn_output_size(img_size, filter_size, padding, stride, caffe_mode):
     output = (2 * padding + img_size - filter_size) / float(stride)
     if caffe_mode:
@@ -1074,20 +1067,34 @@
         return 1 + int(math.ceil(output))

-'''
-calculate image_size based on output_size for convolution.
-It is the reverse function of cnn_output_size
-'''
-
-
+#calculate image_size based on output_size for de-convolution (ConvTransLayer).
+#It is the inverse of cnn_output_size
 def cnn_image_size(output_size, filter_size, padding, stride, caffe_mode):
-    if caffe_mode:
-        img_size = (output_size - 1) * stride + filter_size - 2 * padding
-    else:
-        img_size = (output_size - 2) * stride + filter_size - 2 * padding + 1
+    img_size = (output_size - 1) * stride + filter_size - 2 * padding
+    if not caffe_mode:
+        img_size = img_size + 1
     return img_size

+def get_img_size(input_layer_name, channels):
+    input = g_layer_map[input_layer_name]
+    img_pixels = input.size / channels
+    img_size = input.width if input.width > 0 else int(img_pixels**0.5)
+    img_size_y = input.height if input.height > 0 else int(img_pixels /
+                                                           img_size)
+    config_assert(
+        img_size * img_size_y == img_pixels,
+        "Input layer %s: Incorrect input image size %d * %d for input image pixels %d"
+        % (input_layer_name, img_size, img_size_y, img_pixels))
+    return img_size, img_size_y
+
+
+def parse_bilinear(bilinear, input_layer_name, bilinear_conf):
+    parse_image(bilinear, input_layer_name, bilinear_conf.image_conf)
+    bilinear_conf.out_size_x = bilinear.out_size_x
+    bilinear_conf.out_size_y = bilinear.out_size_y
+
+
 def parse_pool(pool, input_layer_name, pool_conf):
     pool_conf.pool_type = pool.pool_type
     config_assert(pool.pool_type in [
@@ -1103,14 +1110,8 @@
     pool_conf.size_y = default(pool.size_y, pool_conf.size_x)
     pool_conf.stride_y = default(pool.stride_y, pool_conf.stride)
-    img_pixels = g_layer_map[input_layer_name].size / pool.channels
-    # the img_width may be removed,
-    # and it can be calculated automatically later.
-    pool_conf.img_size = default(pool.img_width, int(img_pixels**0.5))
-    pool_conf.img_size_y = img_pixels / pool_conf.img_size
-    config_assert(pool_conf.img_size * pool_conf.img_size_y == img_pixels,
-                  "Incorrect input image size %d for input image pixels %d" %
-                  (pool_conf.img_size, img_pixels))
+    pool_conf.img_size, pool_conf.img_size_y = \
+        get_img_size(input_layer_name, pool.channels)

     config_assert(not pool.start, "start is deprecated in pooling.")
@@ -1126,29 +1127,18 @@

 def parse_spp(spp, input_layer_name, spp_conf):
+    parse_image(spp, input_layer_name, spp_conf.image_conf)
     spp_conf.pool_type = spp.pool_type
     config_assert(spp.pool_type in ['max-projection', 'avg-projection'],
                   "pool-type %s is not in "
                   "['max-projection', 'avg-projection']" % spp.pool_type)
     spp_conf.pyramid_height = spp.pyramid_height
-    spp_conf.channels = spp.channels
-
-    img_pixels = g_layer_map[input_layer_name].size / spp_conf.channels
-
-    spp_conf.img_size = default(spp.img_width, int(img_pixels**0.5))
-    spp_conf.img_size_y = img_pixels / spp_conf.img_size
-    config_assert(spp_conf.img_size * spp_conf.img_size_y == img_pixels,
-                  "Incorrect input image size %d for input image pixels %d" %
-                  (spp_conf.img_size, img_pixels))

 def parse_image(image, input_layer_name, image_conf):
     image_conf.channels = image.channels
-    image_pixels = g_layer_map[input_layer_name].size / image_conf.channels
-    image_conf.img_size = int(image_pixels**0.5)
-    config_assert((image_conf.img_size**2) == image_pixels,
-                  "Incorrect input image size %d for input image pixels %d" %
-                  (image_conf.img_size, image_pixels))
+    image_conf.img_size, image_conf.img_size_y = \
+        get_img_size(input_layer_name, image_conf.channels)

 def parse_norm(norm, input_layer_name, norm_conf):
@@ -1162,24 +1152,18 @@
     norm_conf.pow = norm.pow
     norm_conf.blocked = norm.blocked
-    img_pixels = g_layer_map[input_layer_name].size / norm.channels
-    norm_conf.img_size = int(img_pixels**0.5)
-    config_assert((norm_conf.img_size**2) == img_pixels,
-                  "Incorrect input image size %d for input image pixels %d" %
-                  (norm_conf.img_size, img_pixels))
+    norm_conf.img_size, norm_conf.img_size_y = \
+        get_img_size(input_layer_name, norm.channels)
     norm_conf.output_x = norm_conf.img_size
+    norm_conf.output_y = norm_conf.img_size_y
     if norm.norm_type in ['cmrnorm-projection']:
         norm_conf.scale /= norm.size
     else:
         norm_conf.scale /= norm.size**2

-'''
-caffe_mode: compute the output size using floor instead of ceil,
- which is consistent of caffe and CuDNN's convention.
-'''
-
-
+#caffe_mode: compute the output size using floor instead of ceil,
+# which is consistent with caffe's and CuDNN's convention.
def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False): conv_conf.filter_size = conv.filter_size conv_conf.filter_size_y = conv.filter_size_y @@ -1193,33 +1177,24 @@ def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False): if not trans: conv_conf.filter_channels = conv.channels / conv.groups - - img_pixels = g_layer_map[input_layer_name].size / conv.channels - print('channels=%d size=%d' % (conv.channels, - g_layer_map[input_layer_name].size)) - conv_conf.img_size = int(img_pixels**0.5) - config_assert((conv_conf.img_size**2) == img_pixels, ( - "Input layer %s: Incorrect input image size %d for input " + - "image pixels %d") % - (input_layer_name, conv_conf.img_size, img_pixels)) - + conv_conf.img_size, conv_conf.img_size_y = \ + get_img_size(input_layer_name, conv.channels) conv_conf.output_x = cnn_output_size( conv_conf.img_size, conv_conf.filter_size, conv_conf.padding, conv_conf.stride, conv_conf.caffe_mode) + conv_conf.output_y = cnn_output_size( + conv_conf.img_size_y, conv_conf.filter_size_y, conv_conf.padding_y, + conv_conf.stride_y, conv_conf.caffe_mode) else: conv_conf.filter_channels = num_filters / conv.groups - - outputSize = g_layer_map[input_layer_name].size / conv.channels - print('channels=%d size=%d' % (conv.channels, - g_layer_map[input_layer_name].size)) - conv_conf.output_x = int(outputSize**0.5) - config_assert((conv_conf.output_x**2) == outputSize, ( - "Input layer %s: Incorrect input image size %d for input " + - "image pixels %d") % - (input_layer_name, conv_conf.output_x, outputSize)) + conv_conf.output_x, conv_conf.output_y = \ + get_img_size(input_layer_name, conv.channels) conv_conf.img_size = cnn_image_size( conv_conf.output_x, conv_conf.filter_size, conv_conf.padding, conv_conf.stride, conv_conf.caffe_mode) + conv_conf.img_size_y = cnn_image_size( + conv_conf.output_y, conv_conf.filter_size_y, conv_conf.padding_y, + conv_conf.stride_y, conv_conf.caffe_mode) def parse_block_expand(block_expand, input_layer_name, block_expand_conf): @@ -1248,10 +1223,8 @@ def parse_block_expand(block_expand, input_layer_name, block_expand_conf): def parse_maxout(maxout, input_layer_name, maxout_conf): - maxout_conf.channels = maxout.channels + parse_image(maxout, input_layer_name, maxout_conf.image_conf) maxout_conf.groups = maxout.groups - maxout_conf.img_size_x = maxout.img_size_x - maxout_conf.img_size_y = maxout.img_size_y # Define an evaluator @@ -1378,6 +1351,12 @@ class LayerBase(object): g_current_submodel.layer_names.append(self.config.name) + if self.config.type != 'data' and g_pass_height_width: + height = self.get_input_layer(0).height + width = self.get_input_layer(0).width + if height and width: + self.set_layer_height_width(height, width) + def get_input_layer(self, input_index): return g_layer_map[self.config.inputs[input_index].input_layer_name] @@ -1495,6 +1474,23 @@ class LayerBase(object): 'Different inputs result in' + 'different layer size at layer %s' % self.config.name) + def set_layer_height_width(self, height, width): + self.config.height = height + self.config.width = width + + def set_cnn_layer(self, + input_layer_name, + height, + width, + channels, + is_print=True): + size = height * width * channels + self.set_layer_size(size) + self.set_layer_height_width(height, width) + if is_print: + print("output for %s: c = %d, h = %d, w = %d, size = %d" % + (input_layer_name, channels, height, width, size)) + @config_layer('multi_class_cross_entropy_with_selfnorm') class 
MultiClassCrossEntropySelfNormCostLayer(LayerBase): @@ -1584,9 +1580,11 @@ class PrintLayer(LayerBase): @config_layer('data') class DataLayer(LayerBase): - def __init__(self, name, size, device=None): + def __init__(self, name, size, height=None, width=None, device=None): super(DataLayer, self).__init__( name, 'data', size, inputs=[], device=device) + if height and width: + self.set_layer_height_width(height, width) ''' @@ -1685,14 +1683,13 @@ class ConvLayerBase(LayerBase): for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) - parse_conv(self.inputs[input_index].conv, input_layer.name, - self.config.inputs[input_index].conv_conf, num_filters) conv_conf = self.config.inputs[input_index].conv_conf + parse_conv(self.inputs[input_index].conv, input_layer.name, + conv_conf, num_filters) psize = self.calc_parameter_size(conv_conf) - print("output size for %s is %d " % (name, conv_conf.output_x)) self.create_input_parameter(input_index, psize) - self.set_layer_size( - (conv_conf.output_x**2) * self.config.num_filters) + self.set_cnn_layer(name, conv_conf.output_y, conv_conf.output_x, + self.config.num_filters) psize = self.config.size if shared_biases: @@ -1779,10 +1776,11 @@ class NormLayer(LayerBase): name, 'norm', 0, inputs=inputs, device=device) for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) - parse_norm(self.inputs[input_index].norm, input_layer.name, - self.config.inputs[input_index].norm_conf) norm_conf = self.config.inputs[input_index].norm_conf - self.set_layer_size((norm_conf.output_x**2) * norm_conf.channels) + parse_norm(self.inputs[input_index].norm, input_layer.name, + norm_conf) + self.set_cnn_layer(name, norm_conf.output_y, norm_conf.output_x, + norm_conf.channels, False) @config_layer('pool') @@ -1792,13 +1790,11 @@ class PoolLayer(LayerBase): name, 'pool', 0, inputs=inputs, device=device) for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) - parse_pool(self.inputs[input_index].pool, input_layer.name, - self.config.inputs[input_index].pool_conf) pool_conf = self.config.inputs[input_index].pool_conf - print("output size for %s is %d*%d " % (name, pool_conf.output_y, - pool_conf.output_x)) - self.set_layer_size( - (pool_conf.output_x * pool_conf.output_y) * pool_conf.channels) + parse_pool(self.inputs[input_index].pool, input_layer.name, + pool_conf) + self.set_cnn_layer(name, pool_conf.output_y, pool_conf.output_x, + pool_conf.channels) @config_layer('spp') @@ -1808,12 +1804,10 @@ class SpatialPyramidPoolLayer(LayerBase): name, 'spp', 0, inputs=inputs, device=device) for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) - parse_spp(self.inputs[input_index].spp, input_layer.name, - self.config.inputs[input_index].spp_conf) spp_conf = self.config.inputs[input_index].spp_conf - output_size = (pow(4, spp_conf.pyramid_height) - 1) / (4 - 1) - print("output size for %s is %d " % (name, output_size)) - self.set_layer_size(output_size * spp_conf.channels) + parse_spp(self.inputs[input_index].spp, input_layer.name, spp_conf) + output_x = (pow(4, spp_conf.pyramid_height) - 1) / (4 - 1) + self.set_cnn_layer(name, 1, output_x, spp_conf.image_conf.channels) @config_layer('batch_norm') @@ -1875,10 +1869,10 @@ class BatchNormLayer(LayerBase): self.config.moving_average_fraction = moving_average_fraction input_layer = self.get_input_layer(0) - parse_image(self.inputs[0].image, input_layer.name, - self.config.inputs[0].image_conf) 
image_conf = self.config.inputs[0].image_conf
-        self.set_layer_size((image_conf.img_size**2) * image_conf.channels)
+        parse_image(self.inputs[0].image, input_layer.name, image_conf)
+        self.set_cnn_layer(name, image_conf.img_size_y, image_conf.img_size,
+                           image_conf.channels)

         psize = self.calc_parameter_size(image_conf)
         dims = [1, psize]
@@ -1936,11 +1930,11 @@ class MaxOutLayer(LayerBase):
         super(MaxOutLayer, self).__init__(
             name, 'maxout', 0, inputs=inputs, **xargs)
         input_layer = self.get_input_layer(0)
-        parse_maxout(self.inputs[0].maxout, input_layer.name,
-                     self.config.inputs[0].maxout_conf)
         maxout_conf = self.config.inputs[0].maxout_conf
-        self.set_layer_size(g_layer_map[input_layer.name].size /
-                            maxout_conf.groups)
+        parse_maxout(self.inputs[0].maxout, input_layer.name, maxout_conf)
+        out_channels = maxout_conf.image_conf.channels / maxout_conf.groups
+        self.set_cnn_layer(name, g_layer_map[input_layer.name].height,
+                           g_layer_map[input_layer.name].width, out_channels)

 # key: cost type
@@ -2520,11 +2514,10 @@ class BilinearInterpLayer(LayerBase):
         super(BilinearInterpLayer, self).__init__(
             name, 'bilinear_interp', 0, inputs=inputs, **xargs)
         input_layer = self.get_input_layer(0)
-        parse_bilinear(self.inputs[0].bilinear_interp, input_layer.name,
-                       self.config.inputs[0].bilinear_interp_conf)
-        conf = self.inputs[0].bilinear_interp
-        self.set_layer_size(conf.out_size_x * conf.out_size_y *
-                            conf.num_channels)
+        conf = self.config.inputs[0].bilinear_interp_conf
+        parse_bilinear(self.inputs[0].bilinear_interp, input_layer.name, conf)
+        self.set_cnn_layer(name, conf.out_size_y, conf.out_size_x,
+                           conf.image_conf.channels)

 @config_layer('sum_to_one_norm')
@@ -2997,6 +2990,8 @@ class CTCLayer(LayerBase):
 @config_layer('recurrent_layer_group')
 class RecurrentLayerGroup(LayerBase):
     def __init__(self, name, device=None):
+        global g_pass_height_width
+        g_pass_height_width = False
         super(RecurrentLayerGroup, self).__init__(
             name, 'recurrent_layer_group', 0, inputs=[], device=device)
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 1d56a2aaf62e847aadd5f48b10ab67e9856f9bb9..7724599b00c7d93e0c0383280375b788e9687076 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -766,7 +766,7 @@ def mixed_layer(size=0,
 @layer_support()
-def data_layer(name, size, layer_attr=None):
+def data_layer(name, size, height=None, width=None, layer_attr=None):
     """
     Define DataLayer For NeuralNetwork.
@@ -781,6 +781,10 @@
     :type name: basestring
     :param size: Size of this data layer.
     :type size: int
+    :param height: Height of this data layer, used for image data.
+    :type height: int|None
+    :param width: Width of this data layer, used for image data.
+    :type width: int|None
     :param layer_attr: Extra Layer Attribute.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
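To see how the new height/width plumbing fits together, here is a minimal config sketch (the layer names and shapes are illustrative, not part of this patch): data_layer() records the rectangle on its LayerConfig, and downstream image layers recover it through the get_img_size() helper added above instead of assuming a square input.

# A sketch assuming a 3-channel 48x32 input; 'image' and 'conv' are
# hypothetical names used only for illustration.
data = data_layer(name='image', size=3 * 48 * 32, height=48, width=32)
conv = img_conv_layer(input=data,
                      filter_size=3,
                      num_channels=3,
                      num_filters=16,
                      padding=1,
                      act=LinearActivation())
# get_img_size('image', 3) now yields (32, 48) from width/height; the old
# parser would try int((48 * 32) ** 0.5) = 39 and fail its square-image
# config_assert, since 39 * 39 != 1536.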
@@ -790,6 +794,8 @@ def data_layer(name, size, layer_attr=None): type=LayerType.DATA, name=name, size=size, + height=height, + width=width, **ExtraLayerAttribute.to_kwargs(layer_attr)) return LayerOutput(name, LayerType.DATA, size=size) @@ -1483,7 +1489,7 @@ def bilinear_interp_layer(input, bilinear_interp=BilinearInterp( out_size_x=out_size_x, out_size_y=out_size_y, - num_channels=num_channels)), + channels=num_channels)), type=LayerType.BILINEAR_INTERP_LAYER, **ExtraLayerAttribute.to_kwargs(layer_attr)) return LayerOutput( @@ -1872,7 +1878,7 @@ def img_conv_layer(input, param_attr.attr["initial_std"] = init_w param_attr.attr["initial_strategy"] = 0 param_attr.attr["initial_smart"] = False - + if layer_type: if trans: assert layer_type in ["exconvt"] @@ -1923,8 +1929,7 @@ def img_pool_layer(input, layer_attr=None, pool_size_y=None, stride_y=None, - padding_y=None, - img_width=None): + padding_y=None): """ Image pooling Layer. @@ -1955,9 +1960,6 @@ def img_pool_layer(input, :type stride_y: int|None :param layer_attr: Extra Layer attribute. :type layer_attr: ExtraLayerAttribute - :param img_width: the width of input feature map. If it is None, the input feature - map should be square. - :type img_width: int|None :return: LayerOutput object. :rtype: LayerOutput """ @@ -1993,8 +1995,7 @@ def img_pool_layer(input, padding=padding, size_y=pool_size_y, stride_y=stride_y, - padding_y=padding_y, - img_width=img_width)) + padding_y=padding_y)) ], **ExtraLayerAttribute.to_kwargs(layer_attr)) return LayerOutput( @@ -2012,7 +2013,6 @@ def spp_layer(input, num_channels=None, pool_type=None, pyramid_height=None, - img_width=None, layer_attr=None): """ Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition. @@ -2029,9 +2029,6 @@ def spp_layer(input, :type scale: BasePoolingType :param pyramid_height: pyramid height. :type pyramid_height: int - :param img_width: the width of input feature map. If it is None, the input feature - map should be square. - :type img_width: int|None :param layer_attr: Extra Layer Attribute. :type layer_attr: ExtraLayerAttribute :return: LayerOutput object. 
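For reference, this is what call sites look like once the img_width hint is gone; a sketch mirroring the updated test configs later in this patch, where pooling and SPP now take the input rectangle from the preceding layer's height/width:

# 'conv' stands for any image layer whose height/width are already set.
pool = img_pool_layer(input=conv,
                      num_channels=16,
                      pool_size=2,
                      stride=2,
                      pool_type=MaxPooling())
spp = spp_layer(input=conv,
                num_channels=16,
                pyramid_height=2,
                pool_type=MaxPooling())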
@@ -2058,8 +2055,7 @@ def spp_layer(input, spp=SpatialPyramidPool( pool_type=type_name, channels=num_channels, - pyramid_height=pyramid_height, - img_width=img_width)), + pyramid_height=pyramid_height)), **ExtraLayerAttribute.to_kwargs(layer_attr)) return LayerOutput( name, diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr index 1f262af21126c17eb133b92c84a1ae3bb280a1d6..1a577b8d9b1e1915236ba6afcfa97040d70c707a 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr @@ -26,11 +26,15 @@ layers { filter_size_y: 32 padding_y: 1 stride_y: 1 + output_y: 227 + img_size_y: 256 } } bias_parameter_name: "___conv_0__.wbias" num_filters: 64 shared_biases: true + height: 227 + width: 227 } layers { name: "__batch_norm_0__" @@ -43,6 +47,7 @@ layers { image_conf { channels: 64 img_size: 227 + img_size_y: 227 } } inputs { @@ -55,6 +60,8 @@ layers { } bias_parameter_name: "___batch_norm_0__.wbias" moving_average_fraction: 0.9 + height: 227 + width: 227 } layers { name: "__crmnorm_0__" @@ -72,8 +79,12 @@ layers { output_x: 227 img_size: 227 blocked: false + output_y: 227 + img_size_y: 227 } } + height: 227 + width: 227 } layers { name: "__pool_0__" @@ -97,6 +108,8 @@ layers { padding_y: 0 } } + height: 196 + width: 196 } parameters { name: "___conv_0__.w0" diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr index 38346354080b02bebd937fd998fd3c63c8030346..cd310bd13b39aca57d7a1f38ac2a8966c706b60a 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr @@ -26,6 +26,8 @@ layers { filter_size_y: 32 padding_y: 1 stride_y: 1 + output_y: 227 + img_size_y: 256 } } bias_parameter_name: "___conv_0__.wbias" @@ -43,6 +45,7 @@ layers { image_conf { channels: 64 img_size: 256 + img_size_y: 256 } } inputs { @@ -55,6 +58,8 @@ layers { } bias_parameter_name: "___batch_norm_0__.wbias" moving_average_fraction: 0.9 + height: 256 + width: 256 } layers { name: "__crmnorm_0__" @@ -72,8 +77,12 @@ layers { output_x: 256 img_size: 256 blocked: false + output_y: 256 + img_size_y: 256 } } + height: 256 + width: 256 } layers { name: "__pool_0__" @@ -97,6 +106,8 @@ layers { padding_y: 0 } } + height: 225 + width: 225 } parameters { name: "___conv_0__.w0" diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr index 2b3951c242411e0c0990a52bcb2ae6b1723a9367..2943ab130bd7d6f3b78ea611f1c35850ccaf5e92 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr @@ -177,6 +177,8 @@ layers { filter_size_y: 3 padding_y: 0 stride_y: 1 + output_y: 30 + img_size_y: 32 } num_filters: 64 } diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr index 13d0d477eb58f6da887d0ad9c683caef37e00010..9fae596f281d44dc24c45cb3c750233266e95948 100644 --- 
a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr @@ -26,11 +26,15 @@ layers { filter_size_y: 3 padding_y: 1 stride_y: 1 + output_y: 48 + img_size_y: 48 } } bias_parameter_name: "___conv_0__.wbias" num_filters: 16 shared_biases: true + height: 48 + width: 48 } layers { name: "__bilinear_interp_layer_0__" @@ -40,11 +44,17 @@ layers { inputs { input_layer_name: "__conv_0__" bilinear_interp_conf { + image_conf { + channels: 16 + img_size: 48 + img_size_y: 48 + } out_size_x: 64 out_size_y: 64 - num_channels: 16 } } + height: 64 + width: 64 } layers { name: "__pool_0__" @@ -55,19 +65,21 @@ layers { input_layer_name: "__bilinear_interp_layer_0__" pool_conf { pool_type: "max-projection" - channels: 4 + channels: 16 size_x: 2 stride: 2 - output_x: 64 - img_size: 128 + output_x: 32 + img_size: 64 padding: 0 size_y: 2 stride_y: 2 - output_y: 64 - img_size_y: 128 + output_y: 32 + img_size_y: 64 padding_y: 0 } } + height: 32 + width: 32 } layers { name: "__fc_layer_0__" @@ -78,6 +90,8 @@ layers { input_layer_name: "__pool_0__" input_parameter_name: "___fc_layer_0__.w0" } + height: 32 + width: 32 } parameters { name: "___conv_0__.w0" diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr index 1be2a7ceebfb74d677ac056dcc3a9f72fd31ccd6..c763a95f9d1aefa022f38e0beef6d1c86ebb360d 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr @@ -4,6 +4,8 @@ layers { type: "data" size: 2304 active_type: "" + height: 48 + width: 48 } layers { name: "__conv_0__" @@ -26,11 +28,15 @@ layers { filter_size_y: 3 padding_y: 1 stride_y: 1 + output_y: 48 + img_size_y: 48 } } bias_parameter_name: "___conv_0__.wbias" num_filters: 16 shared_biases: true + height: 48 + width: 48 } layers { name: "__maxout_layer_0__" @@ -40,12 +46,16 @@ layers { inputs { input_layer_name: "__conv_0__" maxout_conf { - channels: 16 + image_conf { + channels: 16 + img_size: 48 + img_size_y: 48 + } groups: 2 - img_size_x: 0 - img_size_y: 0 } } + height: 48 + width: 48 } layers { name: "__pool_0__" @@ -69,48 +79,58 @@ layers { padding_y: 0 } } + height: 24 + width: 24 } layers { name: "__conv_1__" type: "exconv" - size: 18432 + size: 73728 active_type: "" inputs { input_layer_name: "__pool_0__" input_parameter_name: "___conv_1__.w0" conv_conf { filter_size: 3 - channels: 32 + channels: 8 stride: 1 padding: 1 groups: 1 - filter_channels: 32 - output_x: 12 - img_size: 12 + filter_channels: 8 + output_x: 24 + img_size: 24 caffe_mode: true filter_size_y: 3 padding_y: 1 stride_y: 1 + output_y: 24 + img_size_y: 24 } } bias_parameter_name: "___conv_1__.wbias" num_filters: 128 shared_biases: true + height: 24 + width: 24 } layers { name: "__maxout_layer_1__" type: "maxout" - size: 9216 + size: 18432 active_type: "" inputs { - input_layer_name: "__conv_0__" + input_layer_name: "__conv_1__" maxout_conf { - channels: 128 + image_conf { + channels: 128 + img_size: 24 + img_size_y: 24 + } groups: 4 - img_size_x: 0 - img_size_y: 0 } } + height: 24 + width: 24 } layers { name: "__block_expand_layer_0__" @@ -118,7 +138,7 @@ layers { size: 192 active_type: "" inputs { - input_layer_name: "__maxout_layer_0__" + input_layer_name: "__maxout_layer_1__" block_expand_conf { channels: 32 stride_x: 
1 @@ -133,6 +153,8 @@ layers { img_size_y: 0 } } + height: 24 + width: 24 } layers { name: "__fc_layer_0__" @@ -143,6 +165,8 @@ layers { input_layer_name: "__block_expand_layer_0__" input_parameter_name: "___fc_layer_0__.w0" } + height: 24 + width: 24 } parameters { name: "___conv_0__.w0" @@ -164,9 +188,9 @@ parameters { } parameters { name: "___conv_1__.w0" - size: 36864 + size: 9216 initial_mean: 0.0 - initial_std: 0.0833333333333 + initial_std: 0.166666666667 initial_strategy: 0 initial_smart: false } diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_spp_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_spp_layer.protostr index 8b0a8f2146b709ee67981049da8061597e1716be..ca1b2d8cffd6b472dfe40feeeb762e169bc853c7 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_spp_layer.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_spp_layer.protostr @@ -4,6 +4,8 @@ layers { type: "data" size: 3200 active_type: "" + height: 20 + width: 10 } layers { name: "__spp_0__" @@ -13,13 +15,17 @@ layers { inputs { input_layer_name: "data" spp_conf { + image_conf { + channels: 16 + img_size: 10 + img_size_y: 20 + } pool_type: "max-projection" pyramid_height: 2 - channels: 16 - img_size: 10 - img_size_y: 20 } } + height: 1 + width: 5 } input_layer_names: "data" output_layer_names: "__spp_0__" diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py b/python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py index e15a55b412f9459ecd89a0f654256097099c1398..be83f4f83c5d05ea2ffd9e3df0c09fb1a37a3e57 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py @@ -17,7 +17,7 @@ bilinear = bilinear_interp_layer(input=conv, out_size_x=64, out_size_y=64) pool = img_pool_layer( input=bilinear, - num_channels=4, + num_channels=16, pool_size=2, stride=2, pool_type=MaxPooling()) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py b/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py index 081430d716093877db6b2e44ac5417c37ede9a6e..eb14270baa0c4ca0b84d2121a80fde0b45eda54a 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py @@ -2,7 +2,7 @@ from paddle.trainer_config_helpers import * settings(batch_size=1000, learning_rate=1e-5) -data = data_layer(name='data', size=2304) +data = data_layer(name='data', size=2304, height=48, width=48) conv = img_conv_layer( input=data, @@ -21,16 +21,21 @@ pool = img_pool_layer( conv2 = img_conv_layer( input=pool, filter_size=3, - num_channels=32, + num_channels=8, num_filters=128, padding=1, act=LinearActivation(), bias_attr=True) -maxout2 = maxout_layer(input=conv, num_channels=128, groups=4) +maxout2 = maxout_layer(input=conv2, num_channels=128, groups=4) block = block_expand_layer( - input=maxout, num_channels=32, stride_x=1, stride_y=1, block_x=1, block_y=6) + input=maxout2, + num_channels=32, + stride_x=1, + stride_y=1, + block_x=1, + block_y=6) fc = fc_layer(input=block, size=384, bias_attr=False) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py index e20ffb584e8bdd86100455d4e55fe633b878e034..e0b0d0d3be252700d99f7097f0353df885efcf07 100644 --- 
a/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py @@ -2,13 +2,9 @@ from paddle.trainer_config_helpers import * settings(batch_size=100, learning_rate=1e-5) -data = data_layer(name='data', size=3200) +data = data_layer(name='data', size=3200, height=20, width=10) spp = spp_layer( - input=data, - pyramid_height=2, - num_channels=16, - pool_type=MaxPooling(), - img_width=10) + input=data, pyramid_height=2, num_channels=16, pool_type=MaxPooling()) outputs(spp)
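As a sanity check on the geometry used throughout this patch, the two size helpers from config_parser.py are restated below, with numbers taken from the expected protostr outputs above (the 48x48 convolution in test_maxout and the 2x2 pooling in test_bilinear_interp); the round trip also shows why cnn_image_size is the inverse of cnn_output_size on the de-convolution path.

import math

# Same formulas as cnn_output_size/cnn_image_size in config_parser.py.
def cnn_output_size(img_size, filter_size, padding, stride, caffe_mode):
    output = (2 * padding + img_size - filter_size) / float(stride)
    if caffe_mode:
        return 1 + int(math.floor(output))
    else:
        return 1 + int(math.ceil(output))

def cnn_image_size(output_size, filter_size, padding, stride, caffe_mode):
    # Inverse of cnn_output_size, used for the de-convolution path.
    img_size = (output_size - 1) * stride + filter_size - 2 * padding
    if not caffe_mode:
        img_size += 1
    return img_size

assert cnn_output_size(48, 3, 1, 1, True) == 48  # __conv_0__ in test_maxout
assert cnn_output_size(64, 2, 0, 2, True) == 32  # __pool_0__ in test_bilinear_interp
assert cnn_image_size(48, 3, 1, 1, True) == 48   # round trip for the deconv path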