From b3ac51ff90093aaa1168a3f75b4e931c5b34eb9e Mon Sep 17 00:00:00 2001
From: dangqingqing
Date: Sat, 3 Jun 2017 22:48:07 +0800
Subject: [PATCH] GPU implementation of row conv.

---
 paddle/function/CMakeLists.txt    |   1 +
 paddle/function/RowConvOp.cpp     |  37 +++-
 paddle/function/RowConvOpGpu.cu   | 329 ++++++++++++++++++++++++++++++
 paddle/function/RowConvOpTest.cpp |  69 +++++++
 4 files changed, 432 insertions(+), 4 deletions(-)
 create mode 100644 paddle/function/RowConvOpGpu.cu
 create mode 100644 paddle/function/RowConvOpTest.cpp

diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 233a53709a8..1f54ac1231c 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -28,6 +28,7 @@ if(WITH_TESTING)
     add_simple_unittest(PadOpTest)
     add_simple_unittest(MulOpTest)
     add_simple_unittest(CosSimOpTest)
+    add_simple_unittest(RowConvOpTest)
   endif()
 endif()
diff --git a/paddle/function/RowConvOp.cpp b/paddle/function/RowConvOp.cpp
index f92b286c697..24b7e3cdffe 100644
--- a/paddle/function/RowConvOp.cpp
+++ b/paddle/function/RowConvOp.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "RowConvOp.h"
+#include <iostream>
 #include "paddle/math/Vector.h"

 namespace paddle {
@@ -127,10 +128,8 @@ public:
     RowConv<Device>(outMat, inMat, wMat, seqId);
   }
 };
-
 /**
- * \brief The backward propagation of padding Function. Remove the elements
- * in the padding positions of forward.
+ * \brief The backward propagation of row convolution Function.
  *
  * Argument in this Function:
  */
@@ -158,7 +157,37 @@ public:
         : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);

     const auto seqId = in.getSequenceId().vector<int, Device>();
+    std::cout << "in:" << std::endl;
+    for (int i = 0; i < outGMat.getHeight(); ++i) {
+      for (int j = 0; j < outGMat.getWidth(); ++j) {
+        std::cout << outGMat.getElement(i, j) << " ";
+      }
+      std::cout << std::endl;
+    }
+
+    std::cout << "w:" << std::endl;
+    for (int i = 0; i < wMat.getHeight(); ++i) {
+      for (int j = 0; j < wMat.getWidth(); ++j) {
+        std::cout << wMat.getElement(i, j) << " ";
+      }
+      std::cout << std::endl;
+    }
+
+    std::cout << "seqId:" << std::endl;
+    for (int i = 0; i < seqId.getSize(); ++i) {
+      std::cout << seqId.getElement(i) << " ";
+    }
+    std::cout << std::endl;
+
     RowConvGrad<Device>(outGMat, inMat, wMat, inGMat, wGMat, seqId);
+
+    std::cout << std::endl << "out:" << std::endl;
+    for (int i = 0; i < inGMat.getHeight(); ++i) {
+      for (int j = 0; j < inGMat.getWidth(); ++j) {
+        std::cout << inGMat.getElement(i, j) << " ";
+      }
+      std::cout << std::endl;
+    }
   }
 };

@@ -166,7 +195,7 @@ REGISTER_TYPED_FUNC(RowConv, CPU, RowConvFunc);
 REGISTER_TYPED_FUNC(RowConvGrad, CPU, RowConvGradFunc);
 #ifndef PADDLE_ONLY_CPU
 REGISTER_TYPED_FUNC(RowConv, GPU, RowConvFunc);
-REGISTER_TYPED_FUNC(RowConvGrad, GPU, PadGradFunc);
+REGISTER_TYPED_FUNC(RowConvGrad, GPU, RowConvGradFunc);
 #endif

 }  // namespace paddle
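For reference while reading the new CUDA file: for each sequence [start, end) described by the sequence-id vector, the forward pass accumulates, elementwise over the feature dimension d,

  y[j][d] += w[t][d] * x[j + t][d]   for t in [0, context) with j + t < end.

A minimal CPU-style sketch of that computation (illustrative only; the function name and the std::vector layout are hypothetical, not part of this patch):

  #include <vector>

  // Reference forward pass of row convolution: each output step mixes the
  // current and the next (context - 1) input steps of the same sequence.
  void rowConvRef(std::vector<std::vector<float>>& y,        // [steps][dim]
                  const std::vector<std::vector<float>>& x,  // [steps][dim]
                  const std::vector<std::vector<float>>& w,  // [context][dim]
                  const std::vector<int>& starts) {          // seq offsets
    const int context = static_cast<int>(w.size());
    const int dim = static_cast<int>(w[0].size());
    for (std::size_t i = 0; i + 1 < starts.size(); ++i) {
      for (int j = starts[i]; j < starts[i + 1]; ++j) {
        for (int t = 0; t < context && j + t < starts[i + 1]; ++t) {
          for (int d = 0; d < dim; ++d) {
            y[j][d] += w[t][d] * x[j + t][d];
          }
        }
      }
    }
  }

The kernels below parallelize this loop nest with one thread column per feature dimension and thread rows striding over the time steps.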
diff --git a/paddle/function/RowConvOpGpu.cu b/paddle/function/RowConvOpGpu.cu
new file mode 100644
index 00000000000..5b0e065a21e
--- /dev/null
+++ b/paddle/function/RowConvOpGpu.cu
@@ -0,0 +1,329 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_base.h"
+#include "RowConvOp.h"
+
+namespace paddle {
+
+template <int BLOCK_H, int BLOCK_W>
+__global__ void KeRowConv(real* y, const real* x, const real* w,
+                          const int* starts, const int height, const int width,
+                          const int numSeq, const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+
+  __shared__ real sw[BLOCK_H][BLOCK_W];
+
+  for (int i = tidy; i < context; i += blky) {
+    sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0;
+  }
+
+  __syncthreads();
+
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = tidy; j < steps; j += blky) {
+      real sum = 0;
+      int off = (start + j) * width;
+      for (int t = 0; t < context; ++t) {
+        if ((start + j + t) < end) {
+          int xoff = off + t * width;
+          real xVal = gidx + tidx < width ? x[xoff + gidx + tidx] : 0.0;
+          sum += sw[t][tidx] * xVal;
+        }
+      }
+      if (gidx + tidx < width) {
+        y[off + gidx + tidx] += sum;
+      }
+    }
+  }
+}
+
+__global__ void KeRowConv2(real* y, const real* x, const real* w,
+                           const int* starts, const int height,
+                           const int width, const int numSeq,
+                           const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = tidy; j < steps; j += blky) {
+      int off = (start + j) * width;
+      real sum = 0;
+      for (int t = 0; t < context && (start + j + t) < end; ++t) {
+        int xoff = off + t * width;
+        real xd = gidx + tidx < width ? x[xoff + gidx + tidx] : 0.0;
+        real wd = gidx + tidx < width ? w[t * width + gidx + tidx] : 0.0;
+        sum += wd * xd;
+      }
+      if (gidx + tidx < width) {
+        y[off + gidx + tidx] += sum;
+      }
+    }
+  }
+}
+
+template <>
+void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out,
+                              const GpuMatrix& in,
+                              const GpuMatrix& filter,
+                              const GpuIVector& seq) {
+  const size_t numSeq = seq.getSize() - 1;
+  const size_t contextLength = filter.getHeight();
+  const size_t height = in.getHeight();
+  const size_t width = in.getWidth();
+
+  LOG(INFO) << numSeq;
+  LOG(INFO) << contextLength;
+  LOG(INFO) << height;
+  LOG(INFO) << width;
+
+  real* y = out.getData();
+  const real* x = in.getData();
+  const real* w = filter.getData();
+  const int* starts = seq.getData();
+
+  dim3 dimBlock(32, 32);
+  dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
+  LOG(INFO) << dimGrid.x;
+
+  if (contextLength <= 32) {
+    KeRowConv<32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
+        y, x, w, starts, height, width, numSeq, contextLength);
+  } else {
+    KeRowConv2<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
+        y, x, w, starts, height, width, numSeq, contextLength);
+  }
+  CHECK_SYNC("RowConv");
+}
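+
+// The next two kernels compute the filter (weight) gradient. Both stage
+// tiles of x and dy in shared memory, transposed so that the 32 lanes of a
+// warp hold 32 consecutive time steps of a single feature dimension; each
+// lane multiplies x at context offset t against dy, and the products are
+// summed across the warp with __shfl_down. In isolation, that warp
+// reduction looks like this (illustrative sketch, assuming
+// blockDim.x == warpSize == 32):
+//
+//   real val = /* per-lane partial product */;
+//   for (int offset = 16; offset > 0; offset /= 2) {
+//     val += __shfl_down(val, offset);  // lane 0 ends up with the warp sum
+//   }
+//
+// KeRowConvBwWeight buffers the per-block partial sums in sh_dw (sized by
+// the CONTEXT template parameter) and flushes them to dw once at the end;
+// KeRowConvBwWeight2 adds into dw directly, so it has no compile-time bound
+// on the context length.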
+
+template <int BLOCK_H, int BLOCK_W, int CONTEXT>
+__global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
+                                  const int* starts, const int height,
+                                  const int width, const int numSeq,
+                                  const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+
+  __shared__ real sh_x[BLOCK_H][BLOCK_W];
+  __shared__ real sh_dy[BLOCK_H][BLOCK_W];
+  __shared__ real sh_dw[CONTEXT][BLOCK_W];
+
+  for (int t = tidy; t < context; t += blky) {
+    sh_dw[t][tidx] = 0.0;
+  }
+  __syncthreads();
+
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = tidy; j < steps; j += BLOCK_H) {
+      int xoff = gidx + tidx;
+      int yoff = start + j;
+
+      // transpose
+      sh_x[tidx][tidy] =
+          xoff < width && yoff < end ? x[yoff * width + xoff] : 0.0;
+      sh_dy[tidx][tidy] =
+          xoff < width && yoff < end ? dy[yoff * width + xoff] : 0.0;
+      __syncthreads();
+
+      for (int t = 0; t < context; t++) {
+        real val = tidx + t < blockDim.x
+                       ? sh_x[tidy][tidx + t] * sh_dy[tidy][tidx]
+                       : 0.0;
+        // warp size and blockDim.x is 32.
+        for (int offset = 16; offset > 0; offset /= 2) {
+          val += __shfl_down(val, offset);
+        }
+        if (tidx == 0) {
+          sh_dw[t][tidy] += val;
+        }
+        __syncthreads();
+      }
+    }
+  }
+
+  for (int t = tidy; t < context && (gidx + tidx) < width; t += blky) {
+    dw[t * width + gidx + tidx] += sh_dw[t][tidx];
+  }
+}
+
+template <int BLOCK_H, int BLOCK_W>
+__global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy,
+                                   const int* starts, const int height,
+                                   const int width, const int numSeq,
+                                   const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int gidx = blockIdx.x * blockDim.x;
+
+  __shared__ real sh_x[BLOCK_H][BLOCK_W];
+  __shared__ real sh_dy[BLOCK_H][BLOCK_W];
+
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = 0; j < steps; j += BLOCK_H) {
+      int xoff = gidx + tidx;
+      int yoff = start + j;
+
+      // transpose
+      sh_x[tidx][tidy] =
+          xoff < width && yoff < end ? x[yoff * width + xoff] : 0.0;
+      sh_dy[tidx][tidy] =
+          xoff < width && yoff < end ? dy[yoff * width + xoff] : 0.0;
+      __syncthreads();
+
+      for (int t = 0; t < context; t++) {
+        real val = tidx + t < blockDim.x
+                       ? sh_x[tidy][tidx + t] * sh_dy[tidy][tidx]
+                       : 0.0;
+        // warp size and blockDim.x is 32.
+        for (int offset = 16; offset > 0; offset /= 2) {
+          val += __shfl_down(val, offset);
+        }
+        if (tidx == 0 && (gidx + tidy) < width) {
+          dw[t * width + gidx + tidy] += val;
+        }
+      }
+    }
+  }
+}
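+
+// The next two kernels compute the input (data) gradient, which mirrors the
+// forward pass: within a sequence, dx[j][d] += w[t][d] * dy[j - t][d] summed
+// over t in [0, context) with j - t >= 0. KeRowConvBwData caches the filter
+// in shared memory exactly as KeRowConv does and serves small context
+// lengths; KeRowConvBwData2 reads the filter from global memory and handles
+// the general case.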
+
+template <int BLOCK_H, int BLOCK_W>
+__global__ void KeRowConvBwData(real* dx, const real* w, const real* dy,
+                                const int* starts, const int height,
+                                const int width, const int numSeq,
+                                const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+
+  __shared__ real sw[BLOCK_H][BLOCK_W];
+
+  for (int i = tidy; i < context; i += blky) {
+    sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0;
+  }
+
+  __syncthreads();
+
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = tidy; j < steps; j += blky) {
+      real sum = 0;
+      int off = (start + j) * width;
+      for (int t = 0; t < context && (j - t) >= 0; ++t) {
+        int dyOff = off - t * width;
+        real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0;
+        sum += sw[t][tidx] * dyVal;
+      }
+      if (gidx + tidx < width) {
+        dx[off + gidx + tidx] += sum;
+      }
+    }
+  }
+}
+
+__global__ void KeRowConvBwData2(real* dx, const real* w, const real* dy,
+                                 const int* starts, const int height,
+                                 const int width, const int numSeq,
+                                 const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = tidy; j < steps; j += blky) {
+      real sum = 0;
+      int off = (start + j) * width;
+      for (int t = 0; t < context && (j - t) >= 0; ++t) {
+        int dyOff = off - t * width;
+        real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0;
+        real wVal = gidx + tidx < width ? w[t * width + gidx + tidx] : 0.0;
+        sum += wVal * dyVal;
+      }
+      if (gidx + tidx < width) {
+        dx[off + gidx + tidx] += sum;
+      }
+    }
+  }
+}
+
+template <>
+void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
+                                  const GpuMatrix& in,
+                                  const GpuMatrix& filter,
+                                  GpuMatrix& inG,
+                                  GpuMatrix& filterG,
+                                  const GpuIVector& seq) {
+  const size_t numSeq = seq.getSize() - 1;
+  const size_t contextLength = filter.getHeight();
+  const size_t height = in.getHeight();
+  const size_t width = in.getWidth();
+
+  const real* dy = outG.getData();
+  const real* x = in.getData();
+  const real* w = filter.getData();
+  real* dx = inG.getData();
+  real* dw = filterG.getData();
+  const int* starts = seq.getData();
+
+  dim3 dimBlock(32, 32);
+  dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
+
+  if (contextLength <= 16) {
+    KeRowConvBwWeight<32, 32, 16><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
+        dw, x, dy, starts, height, width, numSeq, contextLength);
+  } else {
+    KeRowConvBwWeight2<32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
+        dw, x, dy, starts, height, width, numSeq, contextLength);
+  }
+
+  dim3 dimBlock2(32, 32);
+  dim3 dimGrid2(DIVUP(width, dimBlock2.x), 1);
+  if (contextLength <= 64) {
+    KeRowConvBwData<32, 64><<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>(
+        dx, w, dy, starts, height, width, numSeq, contextLength);
+  } else {
+    KeRowConvBwData2<<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>(
+        dx, w, dy, starts, height, width, numSeq, contextLength);
+  }
+
+  CHECK_SYNC("RowConvGrad");
+}
+
+}  // namespace paddle
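A note on the new test: FunctionCompare (from FunctionTest.h) runs the CPU and GPU registrations of the named function on identical inputs and checks that the outputs agree. As committed, only a single forward configuration is enabled; the commented-out loops suggest the full sweep below, reconstructed here for reference (not part of the patch, and the backward check presumably stays disabled until RowConvGrad is verified):

  for (size_t numSamples : {17, 129}) {
    for (size_t dim : {16, 248}) {
      for (size_t context : {3, 7, 65}) {
        testRowConvFw(numSamples, dim, context);
        testRowConvBw(numSamples, dim, context);
      }
    }
  }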
diff --git a/paddle/function/RowConvOpTest.cpp b/paddle/function/RowConvOpTest.cpp
new file mode 100644
index 00000000000..9898df1a974
--- /dev/null
+++ b/paddle/function/RowConvOpTest.cpp
@@ -0,0 +1,69 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+
+namespace paddle {
+
+void testRowConvFw(size_t batchSize, size_t dim, size_t contextLength) {
+  FunctionCompare test("RowConv", FuncConfig());
+
+  test.addSequence(SequenceIdArg(TensorShape{batchSize}));
+  test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{contextLength, dim}));
+
+  test.addOutputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}),
+                  ADD_TO);
+
+  test.run();
+}
+
+void testRowConvBw(size_t batchSize, size_t dim, size_t contextLength) {
+  FunctionCompare test("RowConvGrad", FuncConfig());
+
+  test.addSequence(SequenceIdArg(TensorShape{batchSize}));
+  test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}));
+  test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{contextLength, dim}));
+
+  test.addOutputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}),
+                  ADD_TO);
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{contextLength, dim}),
+                  ADD_TO);
+
+  test.run();
+}
+
+TEST(RowConv, real) {
+  // for (size_t numSamples : {17, 129}) {
+  //   for (size_t dim : {16, 248}) {
+  //     for (size_t context : {3, 7, 65}) {
+  LOG(INFO) << "===========";
+  // for (size_t numSamples : {17}) {
+  //   for (size_t dim : {16}) {
+  //     for (size_t context : {3}) {
+  size_t numSamples = 17;
+  size_t dim = 16;
+  size_t context = 3;
+  LOG(INFO) << " numSamples=" << numSamples << " dim=" << dim
+            << " context length=" << context;
+  testRowConvFw(numSamples, dim, context);
+  // testRowConvBw(numSamples, dim, context);
+  //     }
+  //   }
+  // }
+}
+
+}  // namespace paddle
-- 
GitLab