From 2558c3f15a9d4bd340d0d50492a185231bb2627a Mon Sep 17 00:00:00 2001 From: Haonan Date: Wed, 1 Feb 2017 14:37:54 -0800 Subject: [PATCH] revisions according to reviews --- paddle/cuda/include/hl_matrix.h | 12 ++++ paddle/cuda/include/stub/hl_matrix_stub.h | 4 ++ paddle/cuda/src/hl_cuda_matrix.cu | 25 ++++++++ paddle/gserver/layers/RotateLayer.cpp | 77 ++++++++++++----------- paddle/gserver/layers/RotateLayer.h | 16 ++--- paddle/gserver/tests/test_LayerGrad.cpp | 7 ++- paddle/math/Matrix.cpp | 24 +++---- paddle/math/Matrix.h | 14 ++++- paddle/math/tests/test_matrixCompare.cpp | 21 ++++++- 9 files changed, 142 insertions(+), 58 deletions(-) diff --git a/paddle/cuda/include/hl_matrix.h b/paddle/cuda/include/hl_matrix.h index abd5eb3a0c..40828dd5cc 100644 --- a/paddle/cuda/include/hl_matrix.h +++ b/paddle/cuda/include/hl_matrix.h @@ -267,4 +267,16 @@ extern void hl_matrix_collect_shared_bias(real* B_d, const int dimN, real scale); +/** + * @brief Matrix rotation in 90 degrees + * + * @param[in] mat input matrix (M x N). + * @param[out] matRot output matrix (N x M). + * @param[in] dimM input matrix height. + * @param[in] dimN input matrix width. 
+ * @param[in] clockWise rotation direction + */ +extern void hl_matrix_rotate( + real* mat, real* matRot, int dimM, int dimN, bool clockWise); + #endif /* HL_MATRIX_H_ */ diff --git a/paddle/cuda/include/stub/hl_matrix_stub.h b/paddle/cuda/include/stub/hl_matrix_stub.h index 0b669f6735..1309c5974f 100644 --- a/paddle/cuda/include/stub/hl_matrix_stub.h +++ b/paddle/cuda/include/stub/hl_matrix_stub.h @@ -106,4 +106,8 @@ inline void hl_matrix_collect_shared_bias(real* B_d, const int dimM, const int dimN, real scale) {} + +inline void hl_matrix_rotate( + real* mat, real* matRot, int dimM, int dimN, bool clockWise) {} + #endif // HL_MATRIX_STUB_H_ diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/cuda/src/hl_cuda_matrix.cu index 2b4c6f7c39..cd23bd3105 100644 --- a/paddle/cuda/src/hl_cuda_matrix.cu +++ b/paddle/cuda/src/hl_cuda_matrix.cu @@ -840,3 +840,28 @@ void hl_matrix_collect_shared_bias(real* B_d, (B_d, A_d, channel, dimM, dimN, dim, limit, scale); CHECK_SYNC("hl_matrix_collect_shared_bias failed"); } + +__global__ void keMatrixRotate(real* mat, real* matRot, + int dimM, int dimN, bool clockWise) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < dimM * dimN) { + int i = idx / dimN; + int j = idx % dimN; + if (clockWise) { + matRot[j * dimM + i] = mat[(dimM - i - 1) * dimN + j]; + } else { + matRot[j * dimM + i] = mat[i * dimN + (dimN - j - 1)]; + } + } +} + +void hl_matrix_rotate(real *mat, real* matRot, + int dimM, int dimN, bool clockWise) { + CHECK_NOTNULL(mat); + CHECK_NOTNULL(matRot); + const int threads = 512; + const int blocks = DIVUP(dimM * dimN, threads); + keMatrixRotate<<< blocks, threads, 0, STREAM_DEFAULT >>> + (mat, matRot, dimM, dimN, clockWise); + CHECK_SYNC("hl_matrix_rotate failed"); +} diff --git a/paddle/gserver/layers/RotateLayer.cpp b/paddle/gserver/layers/RotateLayer.cpp index 269ad43b31..1cdd8366d5 100644 --- a/paddle/gserver/layers/RotateLayer.cpp +++ b/paddle/gserver/layers/RotateLayer.cpp @@ -23,7 +23,8 @@ bool
RotateLayer::init(const LayerMap& layerMap, Layer::init(layerMap, parameterMap); CHECK_EQ(inputLayers_.size(), 1UL); - sampleHeight_ = config_.height(); + height_ = config_.height(); + width_ = config_.width(); return true; } @@ -32,26 +33,31 @@ void RotateLayer::forward(PassType passType) { MatrixPtr input = getInputValue(0); batchSize_ = input->getHeight(); - sampleSize_ = input->getWidth(); - sampleWidth_ = sampleSize_ / sampleHeight_; - CHECK_EQ(sampleSize_ % sampleHeight_, 0); + size_ = input->getWidth(); + CHECK_GE(size_, height_ * width_); + CHECK_EQ(size_ % (height_ * width_), 0) + << "The input's depth should be an int"; + channels_ = size_ / (height_ * width_); - resizeOutput(batchSize_, sampleSize_); + resizeOutput(batchSize_, size_); MatrixPtr outV = getOutputValue(); - - for (int b = 0; b < batchSize_; b++) { - MatrixPtr inputSample = Matrix::create(input->getData() + b * sampleSize_, - sampleHeight_, - sampleWidth_, - false, - useGpu_); - MatrixPtr outputSample = Matrix::create(outV->getData() + b * sampleSize_, - sampleWidth_, - sampleHeight_, - false, - useGpu_); - inputSample->rotate(outputSample, false, true); + for (int b = 0; b < batchSize_; b++) { // for each input feat map + for (int c = 0; c < channels_; c++) { // for each feat channel + MatrixPtr inputSample = + Matrix::create(input->getData() + b * size_ + c * height_ * width_, + height_, + width_, + false, + useGpu_); + MatrixPtr outputSample = + Matrix::create(outV->getData() + b * size_ + c * height_ * width_, + width_, + height_, + false, + useGpu_); + inputSample->rotate(outputSample, false, true /* clock-wise */); + } } if (getInputGrad(0)) { @@ -69,23 +75,24 @@ void RotateLayer::backward(const UpdateCallback& callback) { // the grad should be rotated in the reverse direction MatrixPtr preGrad = getInputGrad(0); - for (int b = 0; b < batchSize_; b++) { - MatrixPtr inputSampleGrad = - Matrix::create(preGrad->getData() + b * sampleSize_, - sampleHeight_, - sampleWidth_, - false, - 
useGpu_); - MatrixPtr outputSampleGrad = - Matrix::create(outputGrad->getData() + b * sampleSize_, - sampleWidth_, - sampleHeight_, - false, - useGpu_); - MatrixPtr tmpGrad = - Matrix::create(sampleHeight_, sampleWidth_, false, useGpu_); - outputSampleGrad->rotate(tmpGrad, false, false); - inputSampleGrad->add(*tmpGrad); + for (int b = 0; b < batchSize_; b++) { // for each input feat map + for (int c = 0; c < channels_; c++) { // for each feat channel + MatrixPtr inputSampleGrad = + Matrix::create(preGrad->getData() + b * size_ + c * height_ * width_, + height_, + width_, + false, + useGpu_); + MatrixPtr outputSampleGrad = Matrix::create( + outputGrad->getData() + b * size_ + c * height_ * width_, + width_, + height_, + false, + useGpu_); + MatrixPtr tmpGrad = nullptr; + outputSampleGrad->rotate(tmpGrad, true, false /* anti clock-wise */); + inputSampleGrad->add(*tmpGrad); + } } } diff --git a/paddle/gserver/layers/RotateLayer.h b/paddle/gserver/layers/RotateLayer.h index c8eca7c506..a6f565a7e5 100644 --- a/paddle/gserver/layers/RotateLayer.h +++ b/paddle/gserver/layers/RotateLayer.h @@ -19,12 +19,13 @@ limitations under the License. */ namespace paddle { /** - * A layer for rotating an input sample (assume it's a matrix) - * The rotation is in clock-wise + * A layer for rotating a multi-channel feature map (M x N x C) in the spatial + * domain + * The rotation is 90 degrees in clock-wise * \f[ - * y(j,i) = x(M-i-1,j) + * y(j,i,:) = x(M-i-1,j,:) * \f] - * where \f$x\f$ is (M x N) input, and \f$y\f$ is (N x M) output. + * where \f$x\f$ is (M x N x C) input, and \f$y\f$ is (N x M x C) output. 
* * The config file api is rotate_layer * @@ -41,9 +42,10 @@ public: private: int batchSize_; - int sampleSize_; - int sampleHeight_; - int sampleWidth_; + int size_; + int height_; + int width_; + int channels_; }; } // namespace paddle diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index f7a6c672de..685d2ca542 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -1320,9 +1320,12 @@ TEST(Layer, RotateLayer) { TestConfig config; config.biasSize = 0; config.layerConfig.set_type("rotate"); - const int INPUT_SIZE = 64; // height * width + const int INPUT_SIZE = 64; // height * width * depth + const int HEIGHT = 8; + const int WIDTH = 4; config.layerConfig.set_size(INPUT_SIZE); - config.layerConfig.set_height(32); + config.layerConfig.set_height(HEIGHT); + config.layerConfig.set_width(WIDTH); config.inputDefs.push_back({INPUT_DATA, "layer_0", INPUT_SIZE, 0}); config.layerConfig.add_inputs(); diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 3c09f010bb..a8b53e2105 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -388,6 +388,8 @@ void GpuMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) { matTrans = std::make_shared(width_, height_); } else { CHECK(matTrans != NULL); + CHECK_EQ(matTrans->getHeight(), width_); + CHECK_EQ(matTrans->getWidth(), height_); } real* dataTrans = matTrans->getData(); real* data = getData(); @@ -402,15 +404,13 @@ void GpuMatrix::rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) { matRot = std::make_shared(width_, height_); } else { CHECK(matRot != NULL); + CHECK_EQ(matRot->getHeight(), width_); + CHECK_EQ(matRot->getWidth(), height_); } - MatrixPtr cpuMat = std::make_shared(height_, width_); - cpuMat->copyFrom(*this); - - MatrixPtr cpuMatRot = std::make_shared(width_, height_); - cpuMat->rotate(cpuMatRot, false, clockWise); - - matRot->copyFrom(*cpuMatRot); + real* dataRot = matRot->getData(); + real* data 
= getData(); + hl_matrix_rotate(data, dataRot, height_, width_, clockWise); } MatrixPtr GpuMatrix::getInverse() { @@ -1723,6 +1723,8 @@ void CpuMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) { matTrans = std::make_shared(width_, height_); } else { CHECK(matTrans != NULL); + CHECK_EQ(matTrans->getHeight(), width_); + CHECK_EQ(matTrans->getWidth(), height_); } real* dataTrans = matTrans->getData(); real* data = getData(); @@ -1741,18 +1743,18 @@ void CpuMatrix::rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) { matRot = std::make_shared(width_, height_); } else { CHECK(matRot != NULL); + CHECK_EQ(matRot->getHeight(), width_); + CHECK_EQ(matRot->getWidth(), height_); } real* dataRot = matRot->getData(); real* data = getData(); - int lda = getStride(); - int ldc = matRot->getStride(); for (size_t i = 0; i < height_; i++) { for (size_t j = 0; j < width_; j++) { if (clockWise) { - dataRot[j * ldc + i] = data[(height_ - i - 1) * lda + j]; + dataRot[j * height_ + i] = data[(height_ - i - 1) * width_ + j]; } else { - dataRot[j * ldc + i] = data[i * lda + (width_ - j - 1)]; + dataRot[j * height_ + i] = data[i * width_ + (width_ - j - 1)]; } } } diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h index d123f827c9..c92c0a272d 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -377,9 +377,19 @@ public: } /** - * @brief rotate clock-wise. + * @brief rotate 90 degrees in clock-wise if clockWise=true; + * otherwise rotate in anti clock-wise + * clock-wise: + * \f[ + * y(j,i) = x(M-i-1,j) + * \f] + * anti clock-wise: + * \f[ + * y(j,i) = x(i, N-1-j) + * \f] + * where \f$x\f$ is (M x N) input, and \f$y\f$ is (N x M) output. * - * allocate matTrans' memory outside, then set memAlloc as false; + * allocate matRot' memory outside, then set memAlloc as false; * else set as true. 
*/ virtual void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) { diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp index c062ca1080..e024f2cf1b 100644 --- a/paddle/math/tests/test_matrixCompare.cpp +++ b/paddle/math/tests/test_matrixCompare.cpp @@ -176,11 +176,29 @@ void testMatrixTranspose(int height, int width) { cpu->randomizeUniform(); gpu->copyFrom(*cpu); cpu->transpose(cpuT, false); - gpu->transpose(gpuT, false); + gpu->transpose(gpuT, true); TensorCheckEqual(*cpuT, *gpuT); } +void testMatrixRotate(int height, int width) { + MatrixPtr cpu = std::make_shared(height, width); + MatrixPtr gpu = std::make_shared(height, width); + MatrixPtr cpuR = std::make_shared(width, height); + MatrixPtr gpuR = std::make_shared(width, height); + + cpu->randomizeUniform(); + gpu->copyFrom(*cpu); + + cpu->rotate(cpuR, false, true); + gpu->rotate(gpuR, true, true); + TensorCheckEqual(*cpuR, *gpuR); + + cpu->rotate(cpuR, true, false); + gpu->rotate(gpuR, false, false); + TensorCheckEqual(*cpuR, *gpuR); +} + void testMatrixInverse(int height) { MatrixPtr cpu = std::make_shared(height, height); MatrixPtr gpu = std::make_shared(height, height); @@ -215,6 +233,7 @@ TEST(Matrix, unary) { testMatrixZeroAtOffset(height, width); testMatrixGetSum(height, width); testMatrixTranspose(height, width); + testMatrixRotate(height, width); } // inverse testMatrixInverse(height); -- GitLab