Merge pull request #1231 from yu239/rotate_and_flip

One bug fix and two new features

Merge pull request #1231 from yu239/rotate_and_flip
One bug fix and two new features
b9dfe8e7 · Haonan · GitHub · 9763761f · 73dcf2cd · b9dfe8e7
22 changed files
--- a/paddle/cuda/include/hl_matrix.h
+++ b/paddle/cuda/include/hl_matrix.h
@@ -267,4 +267,16 @@ extern void hl_matrix_collect_shared_bias(real* B_d,
                                          const int dimN,
                                          real scale);
+/**
+ * @brief  Rotate a matrix by 90 degrees.
+ *
+ * @param[in]   mat       input matrix (M x N).
+ * @param[out]  matRot    output matrix (N x M).
+ * @param[in]   dimM      input matrix height.
+ * @param[in]   dimN      input matrix width.
+ * @param[in]   clockWise true to rotate clockwise, false to rotate
+ *                        anti-clockwise.
+ */
+extern void hl_matrix_rotate(
+    real* mat, real* matRot, int dimM, int dimN, bool clockWise);
 #endif /* HL_MATRIX_H_ */
--- a/paddle/cuda/include/stub/hl_matrix_stub.h
+++ b/paddle/cuda/include/stub/hl_matrix_stub.h
@@ -106,4 +106,8 @@ inline void hl_matrix_collect_shared_bias(real* B_d,
                                          const int dimM,
                                          const int dimN,
                                          real scale) {}
+inline void hl_matrix_rotate(
+    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {}
 #endif  // HL_MATRIX_STUB_H_
--- a/paddle/cuda/src/hl_cuda_matrix.cu
+++ b/paddle/cuda/src/hl_cuda_matrix.cu
@@ -840,3 +840,28 @@ void hl_matrix_collect_shared_bias(real* B_d,
      (B_d, A_d, channel, dimM, dimN, dim, limit, scale);
  CHECK_SYNC("hl_matrix_collect_shared_bias failed");
 }
+/**
+ * Rotates a row-major (dimM x dimN) matrix by 90 degrees into a
+ * (dimN x dimM) output. Each thread writes one output element.
+ *
+ * Uses only the x dimension of the launch grid; threads whose flat index is
+ * not below dimM * dimN do nothing.
+ *
+ * @param mat        input matrix, read as mat[row * dimN + col]
+ * @param matRot     output matrix, written as matRot[row * dimM + col]
+ * @param dimM       number of input rows
+ * @param dimN       number of input columns
+ * @param clockWise  selects which of the two index mappings below is applied
+ */
+__global__ void keMatrixRotate(real* mat, real* matRot,
+                               int dimM, int dimN, bool clockWise) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < dimM * dimN) {
+        // Split the flat index; the element written is output(row j, col i).
+        int i = idx / dimN;
+        int j = idx % dimN;
+        if (clockWise) {
+            // out(j, i) = in(dimM - 1 - i, j)
+            matRot[j * dimM + i] = mat[(dimM - i - 1) * dimN + j];
+        } else {
+            // out(j, i) = in(i, dimN - 1 - j)
+            matRot[j * dimM + i] = mat[i * dimN + (dimN - j - 1)];
+        }
+    }
+}
+/**
+ * Launches keMatrixRotate to rotate a (dimM x dimN) matrix by 90 degrees
+ * into matRot (dimN x dimM), using one thread per element on STREAM_DEFAULT.
+ *
+ * @param mat        input matrix (dimM x dimN); must not be null
+ * @param matRot     output matrix (dimN x dimM); must not be null
+ * @param dimM       input matrix height
+ * @param dimN       input matrix width
+ * @param clockWise  rotation direction, forwarded to the kernel
+ */
+void hl_matrix_rotate(real *mat, real* matRot,
+                      int dimM, int dimN, bool clockWise) {
+    CHECK_NOTNULL(mat);
+    CHECK_NOTNULL(matRot);
+    // An empty matrix would produce a zero-block grid, which CUDA rejects as
+    // an invalid launch configuration. There is nothing to rotate, so return.
+    if (dimM == 0 || dimN == 0) {
+        return;
+    }
+    const int threads = 512;
+    const int blocks = DIVUP(dimM * dimN, threads);
+    keMatrixRotate<<< blocks, threads, 0, STREAM_DEFAULT >>>
+            (mat, matRot, dimM, dimN, clockWise);
+    CHECK_SYNC("hl_matrix_rotate failed");
+}
--- a/paddle/gserver/layers/FeatureMapExpandLayer.cpp
+++ b/paddle/gserver/layers/FeatureMapExpandLayer.cpp
@@ -95,6 +95,9 @@ void FeatureMapExpandLayer::forward(PassType passType) {
 void FeatureMapExpandLayer::backward(const UpdateCallback& callback) {
  MatrixPtr inGrad = getInputGrad(0);
+  if (NULL == inGrad) {
+    return;
+  }
  MatrixPtr outGrad = getOutputGrad();
  size_t batchSize = getInput(0).getBatchSize();
  int imgSize = inGrad->getWidth();

--- a/paddle/gserver/layers/RotateLayer.cpp
+++ b/paddle/gserver/layers/RotateLayer.cpp
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "RotateLayer.h"
+namespace paddle {
+REGISTER_LAYER(rotate, RotateLayer);
+bool RotateLayer::init(const LayerMap& layerMap,
+                       const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  height_ = config_.height();
+  width_ = config_.width();
+  CHECK_GT(height_, 0);
+  CHECK_GT(width_, 0);
+  return true;
+}
+/**
+ * Forward pass: rotates every channel of every sample by 90 degrees
+ * clock-wise and writes the result into this layer's output value.
+ *
+ * Each input row is treated as channels_ consecutive (height_ x width_)
+ * maps; each output row holds the corresponding (width_ x height_) maps at
+ * the same offsets.
+ */
+void RotateLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  MatrixPtr input = getInputValue(0);
+  // One row per sample; the row width is the flattened sample size.
+  batchSize_ = input->getHeight();
+  size_ = input->getWidth();
+  CHECK_GE(size_, height_ * width_);
+  CHECK_EQ(size_ % (height_ * width_), 0)
+      << "total size_ is not dividable by (height_ * width_), i.e., "
+      << "channel number should be an integer";
+  channels_ = size_ / (height_ * width_);
+  // The output has the same shape as the input (batchSize_ x size_).
+  resizeOutput(batchSize_, size_);
+  MatrixPtr outV = getOutputValue();
+  for (int b = 0; b < batchSize_; b++) {   // for each input feat map
+    for (int c = 0; c < channels_; c++) {  // for each feat channel
+      // Matrices built on the existing buffers at this (sample, channel)
+      // offset. NOTE(review): presumably views that do not copy the data --
+      // confirm against Matrix::create.
+      MatrixPtr inputSample =
+          Matrix::create(input->getData() + b * size_ + c * height_ * width_,
+                         height_,
+                         width_,
+                         false,
+                         useGpu_);
+      MatrixPtr outputSample =
+          Matrix::create(outV->getData() + b * size_ + c * height_ * width_,
+                         width_,
+                         height_,
+                         false,
+                         useGpu_);
+      // memAlloc=false: rotate straight into the pre-sized output matrix.
+      inputSample->rotate(outputSample, false, true /* clock-wise */);
+    }
+  }
+  // zeroGrad() is only called when the input layer has a gradient buffer.
+  if (getInputGrad(0)) {
+    zeroGrad();
+  }
+}
+/**
+ * Backward pass: rotates each channel of the output gradient 90 degrees
+ * anti clock-wise (the inverse of the forward rotation) and accumulates it
+ * into the matching channel of the input gradient.
+ *
+ * Does nothing when either the output gradient or the input gradient is
+ * missing.
+ */
+void RotateLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+  MatrixPtr outputGrad = getOutputGrad();
+  if (outputGrad == NULL) {
+    return;
+  }
+  // the grad should be rotated in the reverse direction
+  MatrixPtr preGrad = getInputGrad(0);
+  // The input layer may not keep a gradient; without this guard
+  // preGrad->getData() below would dereference a null pointer.
+  if (preGrad == NULL) {
+    return;
+  }
+  // Scratch buffer for one rotated channel. The first rotate() call
+  // allocates it; later calls reuse it because every channel has the same
+  // (height_ x width_) shape.
+  MatrixPtr tmpGrad = nullptr;
+  for (int b = 0; b < batchSize_; b++) {   // for each input feat map
+    for (int c = 0; c < channels_; c++) {  // for each feat channel
+      MatrixPtr inputSampleGrad =
+          Matrix::create(preGrad->getData() + b * size_ + c * height_ * width_,
+                         height_,
+                         width_,
+                         false,
+                         useGpu_);
+      MatrixPtr outputSampleGrad = Matrix::create(
+          outputGrad->getData() + b * size_ + c * height_ * width_,
+          width_,
+          height_,
+          false,
+          useGpu_);
+      outputSampleGrad->rotate(
+          tmpGrad, tmpGrad == nullptr, false /* anti clock-wise */);
+      // Accumulate rather than overwrite the input gradient.
+      inputSampleGrad->add(*tmpGrad);
+    }
+  }
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/RotateLayer.h
+++ b/paddle/gserver/layers/RotateLayer.h
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+namespace paddle {
+/**
+ * A layer that rotates a multi-channel feature map (M x N x C) in the spatial
+ * domain.
+ * Each channel is rotated 90 degrees clockwise:
+ * \f[
+ *   y(j,i,:) = x(M-i-1,j,:)
+ * \f]
+ * where \f$x\f$ is the (M x N x C) input and \f$y\f$ is the (N x M x C) output.
+ *
+ * The config file api is rotate_layer
+ *
+*/
+class RotateLayer : public Layer {
+public:
+  explicit RotateLayer(const LayerConfig& config) : Layer(config) {}
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  void forward(PassType passType);
+  void backward(const UpdateCallback& callback = nullptr);
+private:
+  int batchSize_;
+  int size_;
+  int height_;
+  int width_;
+  int channels_;
+};
+}  // namespace paddle
--- a/paddle/gserver/layers/TransLayer.h
+++ b/paddle/gserver/layers/TransLayer.h
@@ -20,7 +20,7 @@ limitations under the License. */
 namespace paddle {
 /**
- * A layer for transposition.
+ * A layer for transposing a minibatch matrix.
 * \f[
     y = x^\mathrm{T}
 * \f]

--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1316,6 +1316,25 @@ TEST(Layer, ResizeLayer) {
  }
 }
+TEST(Layer, RotateLayer) {
+  TestConfig config;
+  config.biasSize = 0;
+  config.layerConfig.set_type("rotate");
+  const int CHANNEL = 2;
+  const int HEIGHT = 8;
+  const int WIDTH = 4;
+  const int INPUT_SIZE = HEIGHT * WIDTH * CHANNEL;
+  config.layerConfig.set_size(INPUT_SIZE);
+  config.layerConfig.set_height(HEIGHT);
+  config.layerConfig.set_width(WIDTH);
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", INPUT_SIZE, 0});
+  config.layerConfig.add_inputs();
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "rotate", 100, false, useGpu);
+  }
+}
 TEST(Layer, NCELayer) {
  TestConfig config;
  size_t numClasses = 4;

--- a/paddle/math/CpuSparseMatrix.cpp
+++ b/paddle/math/CpuSparseMatrix.cpp
@@ -372,7 +372,7 @@ MatrixPtr CpuSparseMatrix::subMatrix(size_t startRow, size_t numRows) {
 }
 /* mem MUST be alloced outside (memAlloc=false) */
-void CpuSparseMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
+void CpuSparseMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
  CHECK(!memAlloc);
  CpuSparseMatrix* mat = dynamic_cast<CpuSparseMatrix*>(matTrans.get());
  if (format_ == SPARSE_CSR) {

--- a/paddle/math/CpuSparseMatrix.h
+++ b/paddle/math/CpuSparseMatrix.h
@@ -201,7 +201,7 @@ public:
  void zeroMem();
  /// mem MUST be alloced outside (memAlloc=false)
-  void transpose(MatrixPtr matTrans, bool memAlloc);
+  void transpose(MatrixPtr& matTrans, bool memAlloc);
  void mul(const Matrix& A, const Matrix& B, real alpha, real beta);

--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -274,6 +274,18 @@ real GpuMatrix::getSum() {
  return sum;
 }
+real GpuMatrix::getMin() {
+  CHECK(isContiguous());
+  auto vec = GpuVector(height_ * width_, data_);
+  return vec.getMin();
+}
+real GpuMatrix::getMax() {
+  CHECK(isContiguous());
+  auto vec = GpuVector(height_ * width_, data_);
+  return vec.getMax();
+}
 void GpuMatrix::accumulateColSum(Matrix& src) {
  CHECK_EQ(getWidth(), src.getWidth());
  CHECK_EQ(getHeight(), (size_t)1);
@@ -371,11 +383,13 @@ MatrixPtr GpuMatrix::getTranspose() {
  }
 }
-void GpuMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
+void GpuMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
  if (memAlloc) {
    matTrans = std::make_shared<GpuMatrix>(width_, height_);
  } else {
    CHECK(matTrans != NULL);
+    CHECK_EQ(matTrans->getHeight(), width_);
+    CHECK_EQ(matTrans->getWidth(), height_);
  }
  real* dataTrans = matTrans->getData();
  real* data = getData();
@@ -385,13 +399,27 @@ void GpuMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
  hl_matrix_transpose(data, dataTrans, height_, width_, lda, ldc);
 }
+/**
+ * Rotates this (height_ x width_) matrix by 90 degrees into matRot
+ * (width_ x height_) via hl_matrix_rotate.
+ *
+ * @param matRot     destination; replaced by a new GpuMatrix when memAlloc
+ *                   is true, otherwise it must be non-null and already
+ *                   shaped (width_ x height_)
+ * @param memAlloc   true to allocate matRot here, false to use the caller's
+ * @param clockWise  rotation direction, forwarded to hl_matrix_rotate
+ */
+void GpuMatrix::rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) {
+  if (memAlloc) {
+    matRot = std::make_shared<GpuMatrix>(width_, height_);
+  } else {
+    // Caller-provided destination: verify it exists and has the rotated shape.
+    CHECK(matRot != NULL);
+    CHECK_EQ(matRot->getHeight(), width_);
+    CHECK_EQ(matRot->getWidth(), height_);
+  }
+  real* dataRot = matRot->getData();
+  real* data = getData();
+  hl_matrix_rotate(data, dataRot, height_, width_, clockWise);
+}
 MatrixPtr GpuMatrix::getInverse() {
  MatrixPtr matInv;
  inverse(matInv, true);
  return matInv;
 }
-void GpuMatrix::inverse(MatrixPtr matInv, bool memAlloc) {
+void GpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) {
  CHECK_EQ(height_, width_);
  if (memAlloc) {
@@ -1690,11 +1718,13 @@ MatrixPtr CpuMatrix::getTranspose() {
  }
 }
-void CpuMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
+void CpuMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
  if (memAlloc) {
    matTrans = std::make_shared<CpuMatrix>(width_, height_);
  } else {
    CHECK(matTrans != NULL);
+    CHECK_EQ(matTrans->getHeight(), width_);
+    CHECK_EQ(matTrans->getWidth(), height_);
  }
  real* dataTrans = matTrans->getData();
  real* data = getData();
@@ -1708,13 +1738,35 @@ void CpuMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
  }
 }
+/**
+ * Rotates this (height_ x width_) matrix by 90 degrees into matRot
+ * (width_ x height_), element by element.
+ *
+ * @param matRot     destination; replaced by a new CpuMatrix when memAlloc
+ *                   is true, otherwise it must be non-null and already
+ *                   shaped (width_ x height_)
+ * @param memAlloc   true to allocate matRot here, false to use the caller's
+ * @param clockWise  true for the clock-wise mapping, false for the
+ *                   anti clock-wise one
+ */
+void CpuMatrix::rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) {
+  if (memAlloc) {
+    matRot = std::make_shared<CpuMatrix>(width_, height_);
+  } else {
+    // Caller-provided destination: verify it exists and has the rotated shape.
+    CHECK(matRot != NULL);
+    CHECK_EQ(matRot->getHeight(), width_);
+    CHECK_EQ(matRot->getWidth(), height_);
+  }
+  real* dataRot = matRot->getData();
+  real* data = getData();
+  // The element written for each (i, j) is output(row j, col i).
+  for (size_t i = 0; i < height_; i++) {
+    for (size_t j = 0; j < width_; j++) {
+      if (clockWise) {
+        // out(j, i) = in(height_ - 1 - i, j)
+        dataRot[j * height_ + i] = data[(height_ - i - 1) * width_ + j];
+      } else {
+        // out(j, i) = in(i, width_ - 1 - j)
+        dataRot[j * height_ + i] = data[i * width_ + (width_ - j - 1)];
+      }
+    }
+  }
+}
 MatrixPtr CpuMatrix::getInverse() {
  MatrixPtr matInv;
  inverse(matInv, true);
  return matInv;
 }
-void CpuMatrix::inverse(MatrixPtr matInv, bool memAlloc) {
+void CpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) {
  CHECK_EQ(height_, width_);
  if (memAlloc) {

--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -372,7 +372,27 @@ public:
   * allocate matTrans' memory outside, then set memAlloc as false;
   * else set as true.
   */
-  virtual void transpose(MatrixPtr matTrans, bool memAlloc) {
+  virtual void transpose(MatrixPtr& matTrans, bool memAlloc) {
+    LOG(FATAL) << "Not implemented";
+  }
+  /**
+   * @brief  Rotate the matrix by 90 degrees: clock-wise if clockWise=true,
+   *         anti clock-wise otherwise.
+   * clock-wise:
+   * \f[
+   *   y(j,i) = x(M-i-1,j)
+   * \f]
+   * anti clock-wise:
+   * \f[
+   *   y(j,i) = x(i, N-1-j)
+   * \f]
+   * where \f$x\f$ is (M x N) input, and \f$y\f$ is (N x M) output.
+   *
+   * If matRot's memory is allocated by the caller, set memAlloc to false;
+   * set it to true to have rotate() allocate matRot.
+   */
+  virtual void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) {
    LOG(FATAL) << "Not implemented";
  }
@@ -387,7 +407,7 @@ public:
   * if allocate matInv's memory outside, then set memAlloc as false;
   * else set as true.
   */
-  virtual void inverse(MatrixPtr matInv, bool memAlloc) {
+  virtual void inverse(MatrixPtr& matInv, bool memAlloc) {
    LOG(FATAL) << "Not implemented";
  }
@@ -1169,11 +1189,15 @@ public:
  void accumulateColSum(Matrix& src);
  real getAbsSum();
+  real getMin();
+  real getMax();
  MatrixPtr getTranspose();
-  void transpose(MatrixPtr matTrans, bool memAlloc);
+  void transpose(MatrixPtr& matTrans, bool memAlloc);
+  void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise);
  MatrixPtr getInverse();
-  void inverse(MatrixPtr matInv, bool memAlloc);
+  void inverse(MatrixPtr& matInv, bool memAlloc);
  /// add b to each sample of this.
  void addBias(Matrix& b, real scale);
@@ -1485,10 +1509,11 @@ public:
  real getAbsSum();
  MatrixPtr getTranspose();
-  void transpose(MatrixPtr matTrans, bool memAlloc);
+  void transpose(MatrixPtr& matTrans, bool memAlloc);
+  void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise);
  MatrixPtr getInverse();
-  void inverse(MatrixPtr matInv, bool memAlloc);
+  void inverse(MatrixPtr& matInv, bool memAlloc);
  void copyFrom(const Matrix& src);

--- a/paddle/math/SparseMatrix.cpp
+++ b/paddle/math/SparseMatrix.cpp
@@ -497,7 +497,7 @@ void GpuSparseMatrix::setRow(size_t row,
 SparseValueType GpuSparseMatrix::getValueType() const { return valueType_; }
-void GpuSparseMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
+void GpuSparseMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
  CHECK_EQ(format_, SPARSE_CSC);
  int nnz = sMatrix_->nnz;
  if (memAlloc) {

--- a/paddle/math/SparseMatrix.h
+++ b/paddle/math/SparseMatrix.h
@@ -109,7 +109,7 @@ public:
  MatrixPtr getTranspose();
  /// B = A'
-  void transpose(MatrixPtr matTrans, bool memAlloc);
+  void transpose(MatrixPtr& matTrans, bool memAlloc);
  void copyFrom(const Matrix& src);
  void copyFrom(const Matrix& src, hl_stream_t stream);

--- a/paddle/math/tests/test_SparseMatrix.cpp
+++ b/paddle/math/tests/test_SparseMatrix.cpp
@@ -248,11 +248,13 @@ TEST(Matrix, SparseMatrixTranspose) {
            /*dense matrix transpose*/
            CpuMatrixPtr matC(new CpuMatrix(height, width));
            matC->copyFrom(*matA);
-            CpuMatrixPtr matD(new CpuMatrix(width, height));
+            MatrixPtr matD(new CpuMatrix(width, height));
            matC->transpose(matD, false);
            /*check result*/
            checkSMatrixEqual2Dense(
-                std::dynamic_pointer_cast<CpuSparseMatrix>(matB), matD);
+                std::dynamic_pointer_cast<CpuSparseMatrix>(matB),
+                std::dynamic_pointer_cast<CpuMatrix>(matD));
          }
        }
      }

--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -105,6 +105,21 @@ void testMatrixGetSum(int height, int width) {
  EXPECT_LE(fabs(cpuSum - gpuSum), err);
 }
+void testMatrixGetMinMax(int height, int width) {
+  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
+  cpuInput->randomizeUniform();
+  gpuInput->copyFrom(*cpuInput);
+  real cpuMin = cpuInput->getMin();
+  real gpuMin = gpuInput->getMin();
+  real cpuMax = cpuInput->getMax();
+  real gpuMax = gpuInput->getMax();
+  EXPECT_EQ(cpuMin, gpuMin);
+  EXPECT_EQ(cpuMax, gpuMax);
+}
 void testMatrixZeroAtOffset(int height, int width) {
  MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
  MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
@@ -161,11 +176,29 @@ void testMatrixTranspose(int height, int width) {
  cpu->randomizeUniform();
  gpu->copyFrom(*cpu);
  cpu->transpose(cpuT, false);
-  gpu->transpose(gpuT, false);
+  gpu->transpose(gpuT, true);
  TensorCheckEqual(*cpuT, *gpuT);
 }
+void testMatrixRotate(int height, int width) {
+  MatrixPtr cpu = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr gpu = std::make_shared<GpuMatrix>(height, width);
+  MatrixPtr cpuR = std::make_shared<CpuMatrix>(width, height);
+  MatrixPtr gpuR = std::make_shared<GpuMatrix>(width, height);
+  cpu->randomizeUniform();
+  gpu->copyFrom(*cpu);
+  cpu->rotate(cpuR, false, true);
+  gpu->rotate(gpuR, true, true);
+  TensorCheckEqual(*cpuR, *gpuR);
+  cpu->rotate(cpuR, true, false);
+  gpu->rotate(gpuR, false, false);
+  TensorCheckEqual(*cpuR, *gpuR);
+}
 void testMatrixInverse(int height) {
  MatrixPtr cpu = std::make_shared<CpuMatrix>(height, height);
  MatrixPtr gpu = std::make_shared<GpuMatrix>(height, height);
@@ -181,7 +214,7 @@ void testMatrixInverse(int height) {
  cpu->add(*outputCheck);
  gpu->copyFrom(*cpu);
-  cpu->inverse(cpuI, false);
+  cpu->inverse(cpuI, true);
  gpu->inverse(gpuI, false);
  TensorCheckErr(*cpuI, *gpuI);
@@ -200,6 +233,7 @@ TEST(Matrix, unary) {
      testMatrixZeroAtOffset(height, width);
      testMatrixGetSum(height, width);
      testMatrixTranspose(height, width);
+      testMatrixRotate(height, width);
    }
    // inverse
    testMatrixInverse(height);

--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -427,14 +427,14 @@ message LayerConfig {
  // bias size
  optional uint32 bias_size = 48 [default = 0];
-  // this parameter can be used as a user-defined parameter when necessary, 
+  // this parameter can be used as a user-defined parameter when necessary,
  // without changing the proto file.
-  // e.g., when a new layer with a user-defined parameter is implemented, 
+  // e.g., when a new layer with a user-defined parameter is implemented,
  // it can be used to pass that parameter, without modifying the proto file.
  // string type is used for flexibility: different types can be converted
-  // to string and reinterpreted in the user's own layer implementation.  
+  // to string and reinterpreted in the user's own layer implementation.
  optional string user_arg = 49;
  // to indicate rectangle image data
  optional uint64 height = 50;
  optional uint64 width = 51;

--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -830,7 +830,6 @@ class Pool(Cfg):
            channels,
            size_x,
            size_y=None,
-            img_width=None,
            start=None,
            stride=None,  # 1 by defalut in protobuf
            stride_y=None,
@@ -1927,8 +1926,8 @@ class BatchNormLayer(LayerBase):
        image_conf = self.config.inputs[0].image_conf
        parse_image(self.inputs[0].image, input_layer.name, image_conf)
-        # Only pass the width and height of input to batch_norm layer 
+        # Only pass the width and height of input to batch_norm layer
-        # when either of it is non-zero. 
+        # when either of it is non-zero.
        if input_layer.width != 0 or input_layer.height != 0:
            self.set_cnn_layer(name, image_conf.img_size_y, image_conf.img_size,
                               image_conf.channels, False)
@@ -1968,6 +1967,18 @@ class ResizeLayer(LayerBase):
            'ResizeLayer must have one and only one input')
+@config_layer('rotate')
+class RotateLayer(LayerBase):
+    def __init__(self, name, inputs, height, width, device=None):
+        super(RotateLayer, self).__init__(
+            name, 'rotate', 0, inputs=inputs, device=device)
+        config_assert(
+            len(self.inputs) == 1,
+            'RotateLayer must have one and only one input')
+        self.set_layer_height_width(height, width)
+        self.set_layer_size(self.get_input_layer(0).size)
 @config_layer('blockexpand')
 class BlockExpandLayer(LayerBase):
    def __init__(self, name, inputs, **xargs):

--- a/python/paddle/trainer/recurrent_units.py
+++ b/python/paddle/trainer/recurrent_units.py
@@ -15,10 +15,10 @@
 # recurrent_units.py
 # Version 2.0
 #
-# Some recurrent units can be used in recurrent layer group, 
+# Some recurrent units can be used in recurrent layer group,
 #   to use these units, import this module in your config_file:
-#     import trainer.recurrent_units 
+#     import trainer.recurrent_units
-# 
+#
 # The modules in this file are DEPRECATED.
 # If you would like to use lstm/gru
 # please use the functions defined in paddle.trainer_config_helpers.
@@ -29,7 +29,7 @@ from paddle.trainer.config_parser import *
 # long short term memory, can be used in recurrent machine
 # *inputs* must be a list of Projections, for example:
 #   inputs = [FullMatrixProjection("input_layer_name")],
-# *para_prefix* defines parameter names, if the *para_prefix* of 
+# *para_prefix* defines parameter names, if the *para_prefix* of
 #   two LstmRecurrentUnit is same, they share same parameters
 # *out_memory* can be defined outside if it's used outside
 def LstmRecurrentUnit(name,
@@ -197,7 +197,7 @@ def LstmRecurrentLayerGroup(name,
 # gated recurrent unit, can be used in recurrent machine
 # *inputs* should be a list of Projections, for example:
 #   inputs = [FullMatrixProjection("input_layer_name")],
-# *para_prefix* defines parameter names, if the *para_prefix* of 
+# *para_prefix* defines parameter names, if the *para_prefix* of
 #   two GatedRecurrentUnit is same, they share same parameters
 # *out_memory* can be defined outside if it's used outside

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -70,6 +70,7 @@ __all__ = [
    'interpolation_layer',
    'bilinear_interp_layer',
    'trans_layer',
+    'rotate_layer',
    'sum_to_one_norm_layer',
    'get_output_layer',
    'LayerType',
@@ -154,6 +155,7 @@ class LayerType(object):
    POWER_LAYER = 'power'
    SCALING_LAYER = 'scaling'
    TRANS_LAYER = 'trans'
+    ROTATE_LAYER = 'rotate'
    OUT_PROD_LAYER = 'out_prod'
    FEATURE_MAP_EXPAND_LAYER = 'featmap_expand'
@@ -1642,7 +1644,7 @@ def scaling_layer(input, weight, name=None, layer_attr=None):
 @layer_support()
 def trans_layer(input, name=None, layer_attr=None):
    """
-    A layer for transposition.
+    A layer for transposing a minibatch matrix.
    .. math::
       y = x^\mathrm{T}
@@ -1673,6 +1675,52 @@ def trans_layer(input, name=None, layer_attr=None):
        name, LayerType.TRANS_LAYER, parents=[input], size=input.size)
+@wrap_name_default()
+@layer_support()
+def rotate_layer(input, height, width, name=None, layer_attr=None):
+    """
+    A layer for rotating 90 degrees (clock-wise) for each feature channel,
+    usually used when the input sample is some image or feature map.
+    .. math::
+       y(j,i,:) = x(M-i-1,j,:)
+    where :math:`x` is (M x N x C) input, and :math:`y` is (N x M x C) output.
+    The example usage is:
+    .. code-block:: python
+       rot = rotate_layer(input=layer,
+                          height=100,
+                          width=100)
+    :param input: Input layer.
+    :type input: LayerOutput
+    :param height: The height of the sample matrix
+    :type height: int
+    :param width: The width of the sample matrix
+    :type width: int
+    :param name: Layer name.
+    :type name: basestring
+    :param layer_attr: extra layer attributes.
+    :type layer_attr: ExtraLayerAttribute.
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input, LayerOutput)
+    l = Layer(
+        name=name,
+        height=height,
+        width=width,
+        type=LayerType.ROTATE_LAYER,
+        inputs=[input.name],
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name=name,
+        layer_type=LayerType.ROTATE_LAYER,
+        parents=[input],
+        size=l.config.size)
 @wrap_name_default()
 @layer_support()
 def cos_sim(a, b, scale=1, size=1, name=None, layer_attr=None):
@@ -1826,14 +1874,14 @@ def img_conv_layer(input,
                   trans=False,
                   layer_type=None):
    """
-    Convolution layer for image. Paddle can support both square and non-square 
+    Convolution layer for image. Paddle can support both square and non-square
    input currently.
    The details of convolution layer, please refer UFLDL's `convolution
    <http://ufldl.stanford.edu/tutorial/supervised/
    FeatureExtractionUsingConvolution/>`_ .
-    Convolution Transpose (deconv) layer for image. Paddle can support both square 
+    Convolution Transpose (deconv) layer for image. Paddle can support both square
    and non-square input currently.
    The details of convolution transpose layer,
@@ -1892,7 +1940,7 @@ def img_conv_layer(input,
    :param trans: true if it is a convTransLayer, false if it is a convLayer
    :type trans: bool
    :param layer_type: specify the layer_type, default is None. If trans=True,
-                       layer_type has to be "exconvt", otherwise layer_type 
+                       layer_type has to be "exconvt", otherwise layer_type
                       has to be either "exconv" or "cudnn_conv"
    :type layer_type: String
    :return: LayerOutput object.
@@ -3626,9 +3674,9 @@ def pad_layer(input,
    input data and 3 zeros after the input data in channel dimension.
    pad_h means padding zeros in height dimension. pad_w means padding zeros
    in width dimension.
    For example,
    .. code-block::
      input(2,2,2,3)  = [
@@ -3637,7 +3685,7 @@ def pad_layer(input,
                          [ [[4,3,1], [1,8,7]],
                            [[3,8,9], [2,3,5]] ]
                        ]
      pad_c=[1,1], pad_h=[0,0], pad_w=[0,0]
      output(2,4,2,3) = [
                          [ [[0,0,0], [0,0,0]],
@@ -4746,6 +4794,7 @@ def cross_entropy_with_selfnorm(input,
                                layer_attr=None):
    """
    A loss layer for multi class entropy with selfnorm.
+    Input should be a vector of positive numbers, without normalization.
    .. code-block:: python

--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -957,22 +957,22 @@ def simple_gru(input,
    use one complete layer to implement rnn (including simple rnn, gru and lstm)
    with multiple time steps, such as recurrent_layer, lstmemory, grumemory. But,
    the multiplication operation :math:`W x_t` is not computed in these layers.
-    See details in their interfaces in layers.py. 
+    See details in their interfaces in layers.py.
    The other implementation is to use an recurrent group which can ensemble a
    series of layers to compute rnn step by step. This way is flexible for
    attenion mechanism or other complex connections.
    - gru_step_layer: only compute rnn by one step. It needs an memory as input
      and can be used in recurrent group.
-    - gru_unit: a wrapper of gru_step_layer with memory. 
+    - gru_unit: a wrapper of gru_step_layer with memory.
    - gru_group: a GRU cell implemented by a combination of multiple layers in
      recurrent group.
-      But :math:`W x_t` is not done in group.  
+      But :math:`W x_t` is not done in group.
    - gru_memory: a GRU cell implemented by one layer, which does same calculation
-      with gru_group and is faster than gru_group. 
+      with gru_group and is faster than gru_group.
-    - simple_gru: a complete GRU implementation inlcuding :math:`W x_t` and 
+    - simple_gru: a complete GRU implementation inlcuding :math:`W x_t` and
      gru_group. :math:`W` contains :math:`W_r`, :math:`W_z` and :math:`W`, see
-      formula in grumemory. 
+      formula in grumemory.
    The computational speed is that, grumemory is relatively better than
    gru_group, and gru_group is relatively better than simple_gru.

--- a/python/paddle/trainer_config_helpers/tests/layers_test_config.py
+++ b/python/paddle/trainer_config_helpers/tests/layers_test_config.py
@@ -39,6 +39,7 @@ z1 = mixed_layer(
 assert z1.size > 0
 y2 = fc_layer(input=y, size=15)
+z2 = rotate_layer(input=y2, height=5, width=3)
 cos1 = cos_sim(a=x1, b=y1)
 cos3 = cos_sim(a=x1, b=y2, size=3)
@@ -46,7 +47,7 @@ cos3 = cos_sim(a=x1, b=y2, size=3)
 linear_comb = linear_comb_layer(weights=x1, vectors=y2, size=3)
 out = fc_layer(
-    input=[cos1, cos3, linear_comb, z, z1],
+    input=[cos1, cos3, linear_comb, z, z1, z2],
    size=num_classes,
    act=SoftmaxActivation())