Support rectangle input for CNN

496d64eb · Luo Tao · 65612425 · 496d64eb · 496d64eb · 496d64eb
29 changed files
--- a/paddle/gserver/layers/BatchNormBaseLayer.cpp
+++ b/paddle/gserver/layers/BatchNormBaseLayer.cpp
@@ -61,15 +61,10 @@ bool BatchNormBaseLayer::init(const LayerMap& layerMap,

 void BatchNormBaseLayer::calFeatureMapSize() {
  const ImageConfig& conf = config_.inputs(0).image_conf();
-  if (inputLayers_[0]->getOutput().getFrameHeight() == 0 &&
-      inputLayers_[0]->getOutput().getFrameWidth() == 0) {
-    imgSize_ = conf.img_size();
-    imageH_ = imgSize_;
-    imageW_ = imgSize_;
-  } else {
-    imageH_ = inputLayers_[0]->getOutput().getFrameHeight();
-    imageW_ = inputLayers_[0]->getOutput().getFrameWidth();
-  }
+  imageH_ = inputLayers_[0]->getOutput().getFrameHeight();
+  imageW_ = inputLayers_[0]->getOutput().getFrameWidth();
+  if (imageH_ == 0) imageH_ = conf.img_size_y();
+  if (imageW_ == 0) imageW_ = conf.img_size();
  imgPixels_ = imageH_ * imageW_;
  getOutput().setFrameHeight(imageH_);
  getOutput().setFrameWidth(imageW_);

--- a/paddle/gserver/layers/BatchNormBaseLayer.h
+++ b/paddle/gserver/layers/BatchNormBaseLayer.h
@@ -78,9 +78,8 @@ protected:
  MatrixPtr savedMean_;
  MatrixPtr savedInvVar_;

-  /// Height or width of input image feature, now height is equal to width.
-  /// imgSize is 1 if the input is fully-connected layer.
-  int imgSize_;
+  /// Height and width of the input image feature.
+  /// Both of them are 1 if the input is a fully-connected layer.
  int imageH_;
  int imageW_;
  /// Height * Width.

--- a/paddle/gserver/layers/BilinearInterpLayer.cpp
+++ b/paddle/gserver/layers/BilinearInterpLayer.cpp
@@ -26,15 +26,15 @@ size_t BilinearInterpLayer::getSize() {

  const BilinearInterpConfig& conf = config_.inputs(0).bilinear_interp_conf();
  if (inImgH_ == 0) {
-    inImgH_ = conf.img_size_y();
+    inImgH_ = conf.image_conf().img_size_y();
  }
  if (inImgW_ == 0) {
-    inImgW_ = conf.img_size_x();
+    inImgW_ = conf.image_conf().img_size();
  }

  outImgH_ = conf.out_size_y();
  outImgW_ = conf.out_size_x();
-  numChannels_ = conf.num_channels();
+  numChannels_ = conf.image_conf().channels();

  CHECK(outImgH_ > 0 && outImgW_ > 0);
  CHECK(inImgH_ > 0 && inImgW_ > 0);

--- a/paddle/gserver/layers/ConvBaseLayer.cpp
+++ b/paddle/gserver/layers/ConvBaseLayer.cpp
@@ -37,11 +37,13 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
    filterSizeY_.push_back(conf.filter_size_y());
    filterPixels_.push_back(filterSize_.back() * filterSizeY_.back());
    channels_.push_back(conf.channels());
-    imgSizeH_.push_back(conf.img_size());
+    imgSizeH_.push_back(conf.has_img_size_y() ? conf.img_size_y() :
+                        conf.img_size());
    imgSizeW_.push_back(conf.img_size());
    groups_.push_back(conf.groups());
    filterChannels_.push_back(conf.filter_channels());
-    outputH_.push_back(conf.output_x());
+    outputH_.push_back(conf.has_output_y() ? conf.output_y() :
+                       conf.output_x());
    outputW_.push_back(conf.output_x());
  }

@@ -90,11 +92,12 @@ size_t ConvBaseLayer::calOutputSize() {
    for (size_t i = 0; i < inputLayers_.size(); i++) {
       inH.push_back(inputLayers_[i]->getOutput().getFrameHeight());
       inW.push_back(inputLayers_[i]->getOutput().getFrameWidth());
+       const ConvConfig& conf = config_.inputs(i).conv_conf();
       if (isDeconv_) {
         if (inH[i] == 0)
-           inH[i] = config_.inputs(i).conv_conf().output_x();
+           inH[i] = conf.has_output_y() ? conf.output_y() : conf.output_x();
         if (inW[i] == 0)
-           inW[i] = config_.inputs(i).conv_conf().output_x();
+           inW[i] = conf.output_x();
         outH.push_back(
             imageSize(inH[i], filterSizeY_[i], paddingY_[i], strideY_[i],
                       caffeMode_));
@@ -103,9 +106,9 @@ size_t ConvBaseLayer::calOutputSize() {
                       caffeMode_));
       } else {
         if (inH[i] == 0)
-           inH[i] = config_.inputs(i).conv_conf().img_size();
+           inH[i] = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
         if (inW[i] == 0)
-           inW[i] = config_.inputs(i).conv_conf().img_size();
+           inW[i] = conf.img_size();
         outH.push_back(
             outputSize(inH[i], filterSizeY_[i], paddingY_[i], strideY_[i],
                        caffeMode_));

--- a/paddle/gserver/layers/ConvOperator.cpp
+++ b/paddle/gserver/layers/ConvOperator.cpp
@@ -93,9 +93,9 @@ private:
  bool caffeMode_;
  int inputOffset_, outputOffset_, weightOffset_;
  int numFilters_;
-  int padding_, stride_, filterSize_, channels_, imgSize_;
+  int padding_, stride_, filterSize_, channels_, imgSize_, imgSizeY_;
  int paddingY_, strideY_, filterSizeY_;
-  int imgPixels_, filterPixels_, filterChannels_, outputX_, outputs_;
+  int imgPixels_, filterPixels_, filterChannels_, outputX_, outputY_, outputs_;

  /// Following member variables are same with CudnnConvLayer.
  /// There is no explanation here.
@@ -144,7 +144,7 @@ void ConvOperator::allocConvWorkSpace(size_t maxWorkSpace) {
 void ConvOperator::reshape(int batchSize) {
  imageH_ = ins_[0]->getFrameHeight();
  imageW_ = ins_[0]->getFrameWidth();
-  if (imageH_ == 0) imageH_ = imgSize_;
+  if (imageH_ == 0) imageH_ = imgSizeY_;
  if (imageW_ == 0) imageW_ = imgSize_;
  outputH_ = outputSize(imageH_, filterSizeY_, paddingY_, strideY_, caffeMode_);
  outputW_ = outputSize(imageW_, filterSize_, padding_, stride_, caffeMode_);
@@ -176,7 +176,10 @@ void ConvOperator::computeConvSizes() {
  hl_create_tensor_descriptor(&inputDesc_);
  int outputX =
      outputSize(imgSize_, filterSize_, padding_, stride_, caffeMode_);
+  int outputY =
+      outputSize(imgSizeY_, filterSizeY_, paddingY_, strideY_, caffeMode_);
  CHECK_EQ(outputX, outputX_);
+  CHECK_EQ(outputY, outputY_);
  hl_create_tensor_descriptor(&outputDesc_);
  hl_create_convolution_descriptor(&convDesc_, inputDesc_, filterDesc_,
                                   paddingY_, padding_, strideY_, stride_);
@@ -208,10 +211,13 @@ void ConvOperator::getConvParams() {
  filterPixels_ = filterSize_ * filterSizeY_;
  channels_ = conf.channels();
  imgSize_ = conf.img_size();
-  imgPixels_ = imgSize_ * imgSize_;
+  imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
+  imgPixels_ = imgSize_ * imgSizeY_;
  CHECK_EQ(conf.groups(), 1U);
  filterChannels_ = conf.filter_channels();
  outputX_ = conf.output_x();
-  outputs_ = outputX_ * outputX_;
+  outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
+  // Output area is width * height; the output may be rectangular.
+  outputs_ = outputX_ * outputY_;
 }


--- a/paddle/gserver/layers/ConvProjection.cpp
+++ b/paddle/gserver/layers/ConvProjection.cpp
@@ -47,7 +47,7 @@ void ConvProjection::getConvParams() {
  filterH_ = conf.filter_size_y();
  filterW_ = conf.filter_size();

-  configImgH_ = conf.img_size();
+  configImgH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
  configImgW_ = conf.img_size();

  channels_ = conf.channels();

--- a/paddle/gserver/layers/DataLayer.cpp
+++ b/paddle/gserver/layers/DataLayer.cpp
@@ -48,8 +48,8 @@ void DataLayer::copyDataToOutput(Argument& output) {
      output.ids->copyFrom(*data_.ids);
    }
  }
-  output.setFrameHeight(data_.getFrameHeight());
-  output.setFrameWidth(data_.getFrameWidth());
+  output.setFrameHeight(config_.height());
+  output.setFrameWidth(config_.width());
  output.cpuSequenceDims = data_.cpuSequenceDims;
  output.sequenceStartPositions = data_.sequenceStartPositions;
  output.subSequenceStartPositions = data_.subSequenceStartPositions;

--- a/paddle/gserver/layers/ExpandConvBaseLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvBaseLayer.cpp
@@ -30,17 +30,19 @@ bool ExpandConvBaseLayer::init(const LayerMap &layerMap,
   * meaning as in conv, we need to swap channels_ and numFilters here for
   * convTrans, and in other functions too.
   * */
-  int channel;
-  int numFilters;
+
  /* Initialize the projection */
  for (auto &inputConfig : config_.inputs()) {
    const ConvConfig &conf = inputConfig.conv_conf();
-    numFilters = isDeconv_ ? conf.channels() : numFilters_;
+    int numFilters = isDeconv_ ? conf.channels() : numFilters_;
    subM_.push_back(numFilters / conf.groups());
-    subN_.push_back(conf.output_x() * conf.output_x());
-    channel = isDeconv_ ? numFilters_ : conf.channels();
-    subK_.push_back(channel * conf.filter_size() * conf.filter_size() /
-                    conf.groups());
+    subN_.push_back(conf.output_x() *
+                    (conf.has_output_y() ? conf.output_y() : conf.output_x()));
+    int channel = isDeconv_ ? numFilters_ : conf.channels();
+    subK_.push_back(
+        channel * conf.filter_size() *
+        (conf.has_filter_size_y() ? conf.filter_size_y() : conf.filter_size()) /
+        conf.groups());
    /* Consistent caffe mode for multiple input */
    caffeMode_ = conf.caffe_mode();
  }
@@ -107,9 +109,9 @@ void ExpandConvBaseLayer::expandOneFrame(MatrixPtr image, size_t startIdx,
      imgData, 1, imgSizeH_[inIdx] * imgSizeW_[inIdx] * channel, false,
      useGpu_);
  expandInput_->convExpand(*imageTmp, imgSizeH_[inIdx], imgSizeW_[inIdx],
-                           channel, filterSize_[inIdx],
-                           filterSize_[inIdx], stride_[inIdx], stride_[inIdx],
-                           padding_[inIdx], padding_[inIdx],
+                           channel, filterSizeY_[inIdx],
+                           filterSize_[inIdx], strideY_[inIdx], stride_[inIdx],
+                           paddingY_[inIdx], padding_[inIdx],
                           outputH_[inIdx], outputW_[inIdx]);
  imageTmp->clear();
 }
@@ -188,10 +190,10 @@ void ExpandConvBaseLayer::bpropActs(MatrixPtr out, MatrixPtr image,
        imgSizeH_[inpIdx] * imgSizeW_[inpIdx] * channel, false,
        useGpu_);
    vTmp->convShrink(*oneGradTmp, imgSizeH_[inpIdx], imgSizeW_[inpIdx],
-                     channel, filterSize_[inpIdx],
-                     filterSize_[inpIdx], stride_[inpIdx], stride_[inpIdx],
-                     padding_[inpIdx], padding_[inpIdx],
-                     outputH_[inpIdx], outputW_[inpIdx], 1.0f, 1.0f);
+                     channel, filterSizeY_[inpIdx],
+                     filterSize_[inpIdx], strideY_[inpIdx], stride_[inpIdx],
+                     paddingY_[inpIdx], padding_[inpIdx], outputH_[inpIdx],
+                     outputW_[inpIdx], 1.0f, 1.0f);
    vTmp->clear();
    oneGradTmp->clear();


--- a/paddle/gserver/layers/MaxOutLayer.cpp
+++ b/paddle/gserver/layers/MaxOutLayer.cpp
@@ -25,10 +25,10 @@ size_t MaxOutLayer::getSize() {
  imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
  imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
  if (imgSizeH_ == 0) {
-    imgSizeH_ = maxoutConf.img_size_y();
+    imgSizeH_ = maxoutConf.image_conf().img_size_y();
  }
  if (imgSizeW_ == 0) {
-    imgSizeW_ = maxoutConf.img_size_x();
+    imgSizeW_ = maxoutConf.image_conf().img_size();
  }

  featLen_ = imgSizeH_ * imgSizeW_;
@@ -50,7 +50,7 @@ bool MaxOutLayer::init(const LayerMap& layerMap,

  const MaxOutConfig& conf = config_.inputs(0).maxout_conf();
  groups_ = conf.groups();
-  channels_ = conf.channels();
+  channels_ = conf.image_conf().channels();
  CHECK_EQ(channels_ % groups_, 0UL);
  outputChannels_ = channels_ / groups_;


--- a/paddle/gserver/layers/NormLayer.cpp
+++ b/paddle/gserver/layers/NormLayer.cpp
@@ -49,6 +49,9 @@ bool ResponseNormLayer::init(const LayerMap& layerMap,
  outputX_ = conf.output_x();
  imgSize_ = conf.img_size();
  denoms_ = NULL;
+
+  outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
+  imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
  return true;
 }


--- a/paddle/gserver/layers/NormLayer.h
+++ b/paddle/gserver/layers/NormLayer.h
@@ -50,7 +50,7 @@ public:
 */
 class ResponseNormLayer : public NormLayer {
 protected:
-  size_t channels_, size_, outputX_, imgSize_;
+  size_t channels_, size_, outputX_, imgSize_, outputY_, imgSizeY_;
  float scale_, pow_;
  MatrixPtr denoms_;


--- a/paddle/gserver/layers/NormProjectionLayer.cpp
+++ b/paddle/gserver/layers/NormProjectionLayer.cpp
@@ -24,7 +24,7 @@ size_t CMRProjectionNormLayer::getSize() {
  imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
  imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
  if (imgSizeH_ == 0) {
-    imgSizeH_ = imgSize_;
+    imgSizeH_ = imgSizeY_;
  }
  if (imgSizeW_ == 0) {
    imgSizeW_ = imgSize_;

--- a/paddle/gserver/layers/SpatialPyramidPoolLayer.cpp
+++ b/paddle/gserver/layers/SpatialPyramidPoolLayer.cpp
@@ -56,14 +56,14 @@ ProjectionConfig SpatialPyramidPoolLayer::getConfig(size_t imgSizeW,
 size_t SpatialPyramidPoolLayer::getSize() {
  CHECK_EQ(inputLayers_.size(), 1UL);
  size_t layerSize = 0;
-  const SppConfig& sppConf = config_.inputs(0).spp_conf();
+  const ImageConfig& conf = config_.inputs(0).spp_conf().image_conf();
  imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
  imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
  if (imgSizeH_ == 0) {
-    imgSizeH_ = sppConf.has_img_size_y() ? sppConf.img_size_y() : imgSizeW_;
+    imgSizeH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
  }
  if (imgSizeW_ == 0) {
-    imgSizeW_ = sppConf.img_size();
+    imgSizeW_ = conf.img_size();
  }

  size_t outputH = 1;
@@ -82,9 +82,10 @@ bool SpatialPyramidPoolLayer::init(const LayerMap& layerMap,
  pyramidHeight_ = sppConf.pyramid_height();
  poolType_ = sppConf.pool_type();

-  channels_ = sppConf.channels();
-  imgSizeW_ = sppConf.img_size();
-  imgSizeH_ = sppConf.has_img_size_y() ? sppConf.img_size_y() : imgSizeW_;
+  const ImageConfig& imageConf = sppConf.image_conf();
+  channels_ = imageConf.channels();
+  imgSizeW_ = imageConf.img_size();
+  imgSizeH_ = imageConf.has_img_size_y() ? imageConf.img_size_y() : imgSizeW_;
  poolProjections_.reserve(pyramidHeight_);
  projCol_.reserve(pyramidHeight_);
  projOutput_.resize(pyramidHeight_);

--- a/paddle/gserver/tests/img_pool_a.conf
+++ b/paddle/gserver/tests/img_pool_a.conf
@@ -28,7 +28,6 @@ maxpool = img_pool_layer(input=conv,
                         stride_y=2,
                         padding=1,
                         padding_y=2,
-                         img_width=16,
                         pool_type=MaxPooling(),
 )
 avgpool = img_pool_layer(input=conv,
@@ -39,7 +38,6 @@ avgpool = img_pool_layer(input=conv,
                         stride_y=2,
                         padding=1,
                         padding_y=2,
-                         img_width=16,
                         pool_type=AvgPooling(),
 )


--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -194,9 +194,10 @@ TEST(Layer, BilinearInterpLayer) {

  LayerInputConfig* input = config.layerConfig.add_inputs();
  BilinearInterpConfig* bilinear = input->mutable_bilinear_interp_conf();
-  bilinear->set_img_size_x(32);
-  bilinear->set_img_size_y(32);
-  bilinear->set_num_channels(4);
+  ImageConfig* image = bilinear->mutable_image_conf();
+  image->set_img_size(32);
+  image->set_img_size_y(32);
+  image->set_channels(4);

  for (auto useGpu : {false, true}) {
    for (auto outSize : {32, 64}) {
@@ -314,7 +315,7 @@ void testConvLayer(const string& type, bool trans, bool useGpu) {
  config.layerConfig.set_partial_sum(1);
  config.layerConfig.set_shared_biases(true);

-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 768, 288});
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 384, 288});
  LayerInputConfig* input = config.layerConfig.add_inputs();
  ConvConfig* conv = input->mutable_conv_conf();
  conv->set_filter_size(2);
@@ -327,10 +328,14 @@ void testConvLayer(const string& type, bool trans, bool useGpu) {
  conv->set_groups(1);
  conv->set_filter_channels(conv->channels() / conv->groups());
  conv->set_img_size(16);
+  conv->set_img_size_y(8);
  conv->set_output_x(outputSize(conv->img_size(), conv->filter_size(),
                                conv->padding(), conv->stride(),
                                /* caffeMode */ true));
-  config.layerConfig.set_size(conv->output_x() * conv->output_x() *
+  conv->set_output_y(outputSize(conv->img_size_y(), conv->filter_size_y(),
+                                conv->padding_y(), conv->stride_y(),
+                                /* caffeMode */ true));
+  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
                              config.layerConfig.num_filters());

  testLayerGrad(config, "conv", 100, trans, useGpu);
@@ -427,10 +432,11 @@ TEST(Layer, maxoutLayer) {
  config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0});
  LayerInputConfig* input = config.layerConfig.add_inputs();
  MaxOutConfig* maxout = input->mutable_maxout_conf();
+  ImageConfig* image = maxout->mutable_image_conf();

-  maxout->set_img_size_x(32);
-  maxout->set_img_size_y(32);
-  maxout->set_channels(4);
+  image->set_img_size(32);
+  image->set_img_size_y(32);
+  image->set_channels(4);
  maxout->set_groups(2);

  for (auto useGpu : {false, true}) {
@@ -902,7 +908,7 @@ void testNormLayer(const string& normType, bool trans, bool useGpu) {
  config.layerConfig.set_type("norm");
  config.layerConfig.set_active_type("relu");

-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 3136, 0});
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1568, 0});
  LayerInputConfig* input = config.layerConfig.add_inputs();
  NormConfig* norm = input->mutable_norm_conf();
  norm->set_norm_type(normType);
@@ -912,7 +918,9 @@ void testNormLayer(const string& normType, bool trans, bool useGpu) {
  norm->set_pow(0.75);
  norm->set_blocked(0);
  norm->set_img_size(14);
+  norm->set_img_size_y(7);
  norm->set_output_x(norm->img_size());
+  norm->set_output_y(norm->img_size_y());
  if (norm->norm_type() == "cmrnorm" ||
      norm->norm_type() == "cmrnorm-projection") {
    norm->set_scale(norm->scale() / norm->size());
@@ -920,7 +928,7 @@ void testNormLayer(const string& normType, bool trans, bool useGpu) {
    norm->set_scale(norm->scale() / (norm->size() * norm->size()));
  }

-  config.layerConfig.set_size(norm->output_x() * norm->output_x() *
+  config.layerConfig.set_size(norm->output_x() * norm->output_y() *
                              norm->channels());
  config.biasSize = 0;

@@ -1018,11 +1026,12 @@ void testSppLayer(const string& poolType, const int pyramidHeight, bool trans,
  SppConfig* sppConfig = input->mutable_spp_conf();
  sppConfig->set_pool_type(poolType);
  sppConfig->set_pyramid_height(pyramidHeight);
-  sppConfig->set_channels(16);
-  sppConfig->set_img_size(10);
-  sppConfig->set_img_size_y(20);
+  ImageConfig* imageConfig = sppConfig->mutable_image_conf();
+  imageConfig->set_channels(16);
+  imageConfig->set_img_size(10);
+  imageConfig->set_img_size_y(20);
  int outputSize = (std::pow(4, sppConfig->pyramid_height()) - 1) / (4 - 1);
-  config.layerConfig.set_size(outputSize * sppConfig->channels());
+  config.layerConfig.set_size(outputSize * imageConfig->channels());
  testLayerGrad(config, "spp", 100, trans, useGpu);
 }

@@ -1328,12 +1337,13 @@ void testBatchNormLayer(const string& type, bool trans, bool useGpu) {
  TestConfig config;
  const int CHANNELS = 10;
  const int IMG_SIZE = 16;
+  const int IMG_SIZE_Y = 8;
+  size_t size = CHANNELS * IMG_SIZE * IMG_SIZE_Y;
  config.layerConfig.set_type(type);
-  config.layerConfig.set_size(CHANNELS * IMG_SIZE * IMG_SIZE);
+  config.layerConfig.set_size(size);
  config.layerConfig.set_active_type("sigmoid");
  config.biasSize = CHANNELS;
-  config.inputDefs.push_back({INPUT_DATA, "layer_0",
-                              /* dim= */ IMG_SIZE * IMG_SIZE * CHANNELS,
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", /* dim= */ size,
                              /* paraSize= */ CHANNELS});

  config.inputDefs.push_back({INPUT_DATA, "layer_1_running_mean", 1, CHANNELS});
@@ -1348,6 +1358,7 @@ void testBatchNormLayer(const string& type, bool trans, bool useGpu) {
  ImageConfig* img_conf = input->mutable_image_conf();
  img_conf->set_channels(CHANNELS);
  img_conf->set_img_size(IMG_SIZE);
+  img_conf->set_img_size_y(IMG_SIZE_Y);

  testLayerGrad(config, "batch_norm", 64, /* trans= */ trans, useGpu,
                /* useWeight */ true);
@@ -1370,6 +1381,7 @@ TEST(Operator, conv) {
  const int FILTER_SIZE_Y = 3;
  const int CHANNELS = 3;
  const int IMAGE_SIZE = 16;
+  const int IMAGE_SIZE_Y = 8;
  OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs();
  operatorConf.set_type("conv");
  ConvConfig* conv = operatorConf.mutable_conv_conf();
@@ -1384,17 +1396,18 @@ TEST(Operator, conv) {
  conv->set_groups(1);
  conv->set_filter_channels(conv->channels() / conv->groups());
  conv->set_img_size(IMAGE_SIZE);
-  int output_x =
-      outputSize(conv->img_size(), conv->filter_size(), conv->padding(),
-                 conv->stride(), /* caffeMode */ true);
-  conv->set_output_x(output_x);
-  config.layerConfig.set_size(output_x * output_x *
-                              config.layerConfig.num_filters());
-  config.layerConfig.set_size(conv->output_x() * conv->output_x() *
+  conv->set_img_size_y(IMAGE_SIZE_Y);
+  conv->set_output_x(outputSize(conv->img_size(), conv->filter_size(),
+                                conv->padding(), conv->stride(),
+                                /*  caffeMode */ true));
+  conv->set_output_y(outputSize(conv->img_size_y(), conv->filter_size_y(),
+                                conv->padding_y(), conv->stride_y(),
+                                /*  caffeMode */ true));
+  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
                              NUM_FILTERS);

  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE * CHANNELS, 0});
+      {INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE_Y * CHANNELS, 0});
  config.inputDefs.push_back(
      {INPUT_DATA, "layer_1",
       FILTER_SIZE * FILTER_SIZE_Y * CHANNELS * NUM_FILTERS, 0});

--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -203,6 +203,8 @@ void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu,
  }
  resizeAndCopy(udp, src.udp, useGpu, stream);
  resizeAndCopy(strs, src.strs, useGpu, stream);
+  frameWidth = src.frameWidth;
+  frameHeight = src.frameHeight;
 }

 int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq,

--- a/paddle/trainer/tests/test_config.conf
+++ b/paddle/trainer/tests/test_config.conf
@@ -59,7 +59,6 @@ pool = img_pool_layer(input=fc2,
                      padding_y=2,
                      stride=2,
                      stride_y=3,
-                      img_width=3,
                      pool_type=CudnnAvgPooling())

 concat = concat_layer(input=[fc3, fc4])

--- a/proto/ModelConfig.proto.m4
+++ b/proto/ModelConfig.proto.m4
@@ -76,6 +76,12 @@ message ConvConfig {
  required uint32 filter_size_y = 10;
  required uint32 padding_y = 11;
  required uint32 stride_y = 12;
+
+  // if not set, use output_x
+  optional uint32 output_y = 13 [default = 0];
+
+  // if not set, use img_size
+  optional uint32 img_size_y = 14 [default = 0];
 }

 message PoolConfig {
@@ -121,11 +127,9 @@ message PoolConfig {
 }

 message SppConfig {
-  required string pool_type = 1;
-  required uint32 pyramid_height = 2;
-  required uint32 channels = 3;
-  required uint32 img_size = 4;
-  optional uint32 img_size_y = 5;
+  required ImageConfig image_conf = 1;
+  required string pool_type = 2;
+  required uint32 pyramid_height = 3;
 }

 message NormConfig {
@@ -155,6 +159,12 @@ message NormConfig {
  // fixed window: shared a fixed window for each value
  // sliding window: have a different window for each value
  optional bool blocked = 8;
+
+  // if not set, use output_x
+  optional uint32 output_y = 9 [default = 0];
+
+  // if not set, use img_size
+  optional uint32 img_size_y = 10 [default = 0];
 }

 message BlockExpandConfig {
@@ -179,12 +189,8 @@ message BlockExpandConfig {
 }

 message MaxOutConfig {
-  required uint32 channels = 1;
+  required ImageConfig image_conf = 1;
  required uint32 groups = 2;
-
-  // The size of input feature map.
-  required uint32 img_size_x = 3;
-  required uint32 img_size_y = 4;
 }

 message ProjectionConfig {
@@ -225,12 +231,10 @@ message OperatorConfig {

 message BilinearInterpConfig {
  // The size of input feature map.
-  optional uint32 img_size_x = 1;
-  optional uint32 img_size_y = 2;
+  required ImageConfig image_conf = 1;
  // The size of output feature map.
-  required uint32 out_size_x = 3;
-  required uint32 out_size_y = 4;
-  required uint32 num_channels = 5;
+  required uint32 out_size_x = 2;
+  required uint32 out_size_y = 3;
 }

 message ImageConfig {
@@ -240,6 +244,7 @@ message ImageConfig {

  // The size of input feature map.
  required uint32 img_size = 8;
+  required uint32 img_size_y = 9;
 }

 message LayerInputConfig {
@@ -412,7 +417,10 @@ sinclude(`ModelConfigLayer.proto.m4')
  // string type is used for flexibility: different types can be converted
  // to string and reinterpreted in the user's own layer implementation.  
  optional string user_arg = 49;
-
+  
+  // Height and width, used to indicate rectangular image data.
+  optional uint64 height = 50;
+  optional uint64 width = 51;
 }

 message EvaluatorConfig {

--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -763,7 +763,7 @@ def mixed_layer(size=0,


 @layer_support()
-def data_layer(name, size, layer_attr=None):
+def data_layer(name, size, height=None, width=None, layer_attr=None):
    """
    Define DataLayer For NeuralNetwork.

@@ -778,6 +778,10 @@ def data_layer(name, size, layer_attr=None):
    :type name: basestring
    :param size: Size of this data layer.
    :type size: int
+    :param height: Height of this data layer, used for image
+    :type height: int|None
+    :param width: Width of this data layer, used for image
+    :type width: int|None
    :param layer_attr: Extra Layer Attribute.
    :type layer_attr: ExtraLayerAttribute.
    :return: LayerOutput object.
@@ -787,6 +791,8 @@ def data_layer(name, size, layer_attr=None):
        type=LayerType.DATA,
        name=name,
        size=size,
+        height=height,
+        width=width,
        **ExtraLayerAttribute.to_kwargs(layer_attr))

    return LayerOutput(name, LayerType.DATA, size=size)
@@ -1480,7 +1486,7 @@ def bilinear_interp_layer(input,
            bilinear_interp=BilinearInterp(
                out_size_x=out_size_x,
                out_size_y=out_size_y,
-                num_channels=num_channels)),
+                channels=num_channels)),
        type=LayerType.BILINEAR_INTERP_LAYER,
        **ExtraLayerAttribute.to_kwargs(layer_attr))
    return LayerOutput(
@@ -1908,8 +1914,7 @@ def img_pool_layer(input,
                   layer_attr=None,
                   pool_size_y=None,
                   stride_y=None,
-                   padding_y=None,
-                   img_width=None):
+                   padding_y=None):
    """
    Image pooling Layer.

@@ -1940,9 +1945,6 @@ def img_pool_layer(input,
    :type stride_y: int|None
    :param layer_attr: Extra Layer attribute.
    :type layer_attr: ExtraLayerAttribute
-    :param img_width: the width of input feature map. If it is None, the input feature
-                      map should be square.
-    :type img_width: int|None
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
@@ -1978,8 +1980,7 @@ def img_pool_layer(input,
                    padding=padding,
                    size_y=pool_size_y,
                    stride_y=stride_y,
-                    padding_y=padding_y,
-                    img_width=img_width))
+                    padding_y=padding_y))
        ],
        **ExtraLayerAttribute.to_kwargs(layer_attr))
    return LayerOutput(
@@ -1997,7 +1998,6 @@ def spp_layer(input,
              num_channels=None,
              pool_type=None,
              pyramid_height=None,
-              img_width=None,
              layer_attr=None):
    """
    Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition.
@@ -2014,9 +2014,6 @@ def spp_layer(input,
    :type scale: BasePoolingType
    :param pyramid_height: pyramid height.
    :type pyramid_height: int
-    :param img_width: the width of input feature map. If it is None, the input feature
-                      map should be square.
-    :type img_width: int|None
    :param layer_attr: Extra Layer Attribute.
    :type layer_attr: ExtraLayerAttribute
    :return: LayerOutput object.
@@ -2043,8 +2040,7 @@ def spp_layer(input,
            spp=SpatialPyramidPool(
                pool_type=type_name,
                channels=num_channels,
-                pyramid_height=pyramid_height,
-                img_width=img_width)),
+                pyramid_height=pyramid_height)),
        **ExtraLayerAttribute.to_kwargs(layer_attr))
    return LayerOutput(
        name,

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
@@ -26,11 +26,15 @@ layers {
      filter_size_y: 32
      padding_y: 1
      stride_y: 1
+      output_y: 227
+      img_size_y: 256
    }
  }
  bias_parameter_name: "___conv_0__.wbias"
  num_filters: 64
  shared_biases: true
+  height: 227
+  width: 227
 }
 layers {
  name: "__batch_norm_0__"
@@ -43,6 +47,7 @@ layers {
    image_conf {
      channels: 64
      img_size: 227
+      img_size_y: 227
    }
  }
  inputs {
@@ -55,6 +60,8 @@ layers {
  }
  bias_parameter_name: "___batch_norm_0__.wbias"
  moving_average_fraction: 0.9
+  height: 227
+  width: 227
 }
 layers {
  name: "__crmnorm_0__"
@@ -72,8 +79,12 @@ layers {
      output_x: 227
      img_size: 227
      blocked: false
+      output_y: 227
+      img_size_y: 227
    }
  }
+  height: 227
+  width: 227
 }
 layers {
  name: "__pool_0__"
@@ -97,6 +108,8 @@ layers {
      padding_y: 0
    }
  }
+  height: 196
+  width: 196
 }
 parameters {
  name: "___conv_0__.w0"

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
@@ -26,6 +26,8 @@ layers {
      filter_size_y: 32
      padding_y: 1
      stride_y: 1
+      output_y: 227
+      img_size_y: 198
    }
  }
  bias_parameter_name: "___conv_0__.wbias"
@@ -43,6 +45,7 @@ layers {
    image_conf {
      channels: 64
      img_size: 256
+      img_size_y: 256
    }
  }
  inputs {
@@ -55,6 +58,8 @@ layers {
  }
  bias_parameter_name: "___batch_norm_0__.wbias"
  moving_average_fraction: 0.9
+  height: 256
+  width: 256
 }
 layers {
  name: "__crmnorm_0__"
@@ -72,8 +77,12 @@ layers {
      output_x: 256
      img_size: 256
      blocked: false
+      output_y: 256
+      img_size_y: 256
    }
  }
+  height: 256
+  width: 256
 }
 layers {
  name: "__pool_0__"
@@ -97,6 +106,8 @@ layers {
      padding_y: 0
    }
  }
+  height: 225
+  width: 225
 }
 parameters {
  name: "___conv_0__.w0"

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr
@@ -177,6 +177,8 @@ layers {
      filter_size_y: 3
      padding_y: 0
      stride_y: 1
+      output_y: 30
+      img_size_y: 32
    }
    num_filters: 64
  }

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr
@@ -26,11 +26,15 @@ layers {
      filter_size_y: 3
      padding_y: 1
      stride_y: 1
+      output_y: 48
+      img_size_y: 48
    }
  }
  bias_parameter_name: "___conv_0__.wbias"
  num_filters: 16
  shared_biases: true
+  height: 48
+  width: 48
 }
 layers {
  name: "__bilinear_interp_layer_0__"
@@ -40,11 +44,17 @@ layers {
  inputs {
    input_layer_name: "__conv_0__"
    bilinear_interp_conf {
+      image_conf {
+        channels: 16
+        img_size: 48
+        img_size_y: 48
+      }
      out_size_x: 64
      out_size_y: 64
-      num_channels: 16
    }
  }
+  height: 64
+  width: 64
 }
 layers {
  name: "__pool_0__"
@@ -55,19 +65,21 @@ layers {
    input_layer_name: "__bilinear_interp_layer_0__"
    pool_conf {
      pool_type: "max-projection"
-      channels: 4
+      channels: 16
      size_x: 2
      stride: 2
-      output_x: 64
-      img_size: 128
+      output_x: 32
+      img_size: 64
      padding: 0
      size_y: 2
      stride_y: 2
-      output_y: 64
-      img_size_y: 128
+      output_y: 32
+      img_size_y: 64
      padding_y: 0
    }
  }
+  height: 32
+  width: 32
 }
 layers {
  name: "__fc_layer_0__"
@@ -78,6 +90,8 @@ layers {
    input_layer_name: "__pool_0__"
    input_parameter_name: "___fc_layer_0__.w0"
  }
+  height: 32
+  width: 32
 }
 parameters {
  name: "___conv_0__.w0"

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr
@@ -4,6 +4,8 @@ layers {
  type: "data"
  size: 2304
  active_type: ""
+  height: 48
+  width: 48
 }
 layers {
  name: "__conv_0__"
@@ -26,11 +28,15 @@ layers {
      filter_size_y: 3
      padding_y: 1
      stride_y: 1
+      output_y: 48
+      img_size_y: 48
    }
  }
  bias_parameter_name: "___conv_0__.wbias"
  num_filters: 16
  shared_biases: true
+  height: 48
+  width: 48
 }
 layers {
  name: "__maxout_layer_0__"
@@ -40,12 +46,16 @@ layers {
  inputs {
    input_layer_name: "__conv_0__"
    maxout_conf {
-      channels: 16
+      image_conf {
+        channels: 16
+        img_size: 48
+        img_size_y: 48
+      }
      groups: 2
-      img_size_x: 0
-      img_size_y: 0
    }
  }
+  height: 48
+  width: 48
 }
 layers {
  name: "__pool_0__"
@@ -69,48 +79,58 @@ layers {
      padding_y: 0
    }
  }
+  height: 24
+  width: 24
 }
 layers {
  name: "__conv_1__"
  type: "exconv"
-  size: 18432
+  size: 73728
  active_type: ""
  inputs {
    input_layer_name: "__pool_0__"
    input_parameter_name: "___conv_1__.w0"
    conv_conf {
      filter_size: 3
-      channels: 32
+      channels: 8
      stride: 1
      padding: 1
      groups: 1
-      filter_channels: 32
-      output_x: 12
-      img_size: 12
+      filter_channels: 8
+      output_x: 24
+      img_size: 24
      caffe_mode: true
      filter_size_y: 3
      padding_y: 1
      stride_y: 1
+      output_y: 24
+      img_size_y: 24
    }
  }
  bias_parameter_name: "___conv_1__.wbias"
  num_filters: 128
  shared_biases: true
+  height: 24
+  width: 24
 }
 layers {
  name: "__maxout_layer_1__"
  type: "maxout"
-  size: 9216
+  size: 18432
  active_type: ""
  inputs {
-    input_layer_name: "__conv_0__"
+    input_layer_name: "__conv_1__"
    maxout_conf {
-      channels: 128
+      image_conf {
+        channels: 128
+        img_size: 24
+        img_size_y: 24
+      }
      groups: 4
-      img_size_x: 0
-      img_size_y: 0
    }
  }
+  height: 24
+  width: 24
 }
 layers {
  name: "__block_expand_layer_0__"
@@ -118,7 +138,7 @@ layers {
  size: 192
  active_type: ""
  inputs {
-    input_layer_name: "__maxout_layer_0__"
+    input_layer_name: "__maxout_layer_1__"
    block_expand_conf {
      channels: 32
      stride_x: 1
@@ -133,6 +153,8 @@ layers {
      img_size_y: 0
    }
  }
+  height: 24
+  width: 24
 }
 layers {
  name: "__fc_layer_0__"
@@ -143,6 +165,8 @@ layers {
    input_layer_name: "__block_expand_layer_0__"
    input_parameter_name: "___fc_layer_0__.w0"
  }
+  height: 24
+  width: 24
 }
 parameters {
  name: "___conv_0__.w0"
@@ -164,9 +188,9 @@ parameters {
 }
 parameters {
  name: "___conv_1__.w0"
-  size: 36864
+  size: 9216
  initial_mean: 0.0
-  initial_std: 0.0833333333333
+  initial_std: 0.166666666667
  initial_strategy: 0
  initial_smart: false
 }

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_spp_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_spp_layer.protostr
@@ -4,6 +4,8 @@ layers {
  type: "data"
  size: 3200
  active_type: ""
+  height: 20
+  width: 10
 }
 layers {
  name: "__spp_0__"
@@ -13,13 +15,17 @@ layers {
  inputs {
    input_layer_name: "data"
    spp_conf {
+      image_conf {
+        channels: 16
+        img_size: 10
+        img_size_y: 20
+      }
      pool_type: "max-projection"
      pyramid_height: 2
-      channels: 16
-      img_size: 10
-      img_size_y: 20
    }
  }
+  height: 1
+  width: 5
 }
 input_layer_names: "data"
 output_layer_names: "__spp_0__"

--- a/python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py
@@ -17,7 +17,7 @@ bilinear = bilinear_interp_layer(input=conv, out_size_x=64, out_size_y=64)

 pool = img_pool_layer(
    input=bilinear,
-    num_channels=4,
+    num_channels=16,
    pool_size=2,
    stride=2,
    pool_type=MaxPooling())

--- a/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py
@@ -2,7 +2,7 @@ from paddle.trainer_config_helpers import *

 settings(batch_size=1000, learning_rate=1e-5)

-data = data_layer(name='data', size=2304)
+data = data_layer(name='data', size=2304, height=48, width=48)

 conv = img_conv_layer(
    input=data,
@@ -21,16 +21,21 @@ pool = img_pool_layer(
 conv2 = img_conv_layer(
    input=pool,
    filter_size=3,
-    num_channels=32,
+    num_channels=8,
    num_filters=128,
    padding=1,
    act=LinearActivation(),
    bias_attr=True)

-maxout2 = maxout_layer(input=conv, num_channels=128, groups=4)
+maxout2 = maxout_layer(input=conv2, num_channels=128, groups=4)

 block = block_expand_layer(
-    input=maxout, num_channels=32, stride_x=1, stride_y=1, block_x=1, block_y=6)
+    input=maxout2,
+    num_channels=32,
+    stride_x=1,
+    stride_y=1,
+    block_x=1,
+    block_y=6)

 fc = fc_layer(input=block, size=384, bias_attr=False)


--- a/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py
@@ -2,13 +2,9 @@ from paddle.trainer_config_helpers import *

 settings(batch_size=100, learning_rate=1e-5)

-data = data_layer(name='data', size=3200)
+data = data_layer(name='data', size=3200, height=20, width=10)

 spp = spp_layer(
-    input=data,
-    pyramid_height=2,
-    num_channels=16,
-    pool_type=MaxPooling(),
-    img_width=10)
+    input=data, pyramid_height=2, num_channels=16, pool_type=MaxPooling())

 outputs(spp)