From b3ac51ff90093aaa1168a3f75b4e931c5b34eb9e Mon Sep 17 00:00:00 2001
From: dangqingqing
Date: Sat, 3 Jun 2017 22:48:07 +0800
Subject: [PATCH] GPU implementation of row conv.

---
 paddle/function/CMakeLists.txt    |   1 +
 paddle/function/RowConvOp.cpp     |  37 +++-
 paddle/function/RowConvOpGpu.cu   | 329 ++++++++++++++++++++++++++++++
 paddle/function/RowConvOpTest.cpp |  69 +++++++
 4 files changed, 432 insertions(+), 4 deletions(-)
 create mode 100644 paddle/function/RowConvOpGpu.cu
 create mode 100644 paddle/function/RowConvOpTest.cpp

diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 233a53709a..1f54ac1231 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -28,6 +28,7 @@ if(WITH_TESTING)
     add_simple_unittest(PadOpTest)
     add_simple_unittest(MulOpTest)
     add_simple_unittest(CosSimOpTest)
+    add_simple_unittest(RowConvOpTest)
   endif()
 endif()

diff --git a/paddle/function/RowConvOp.cpp b/paddle/function/RowConvOp.cpp
index f92b286c69..24b7e3cdff 100644
--- a/paddle/function/RowConvOp.cpp
+++ b/paddle/function/RowConvOp.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "RowConvOp.h"
+#include <iostream>
 #include "paddle/math/Vector.h"
 
 namespace paddle {
@@ -127,10 +128,8 @@ public:
     RowConv(outMat, inMat, wMat, seqId);
   }
 };
-
 /**
- * \brief The backward propagation of padding Function. Remove the elements
- *        in the padding positions of forward.
+ * \brief The backward of row convolution: gradients w.r.t. input and filter.
  *
  * Argument in this Function:
  */
@@ -158,7 +157,37 @@ public:
         : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
 
     const auto seqId = in.getSequenceId().vector<int, Device>();
+
+    // Debug output (temporary): dump the backward arguments.
+    std::cout << "outG:" << std::endl;
+    for (size_t i = 0; i < outGMat.getHeight(); ++i) {
+      for (size_t j = 0; j < outGMat.getWidth(); ++j) {
+        std::cout << outGMat.getElement(i, j) << " ";
+      }
+      std::cout << std::endl;
+    }
+
+    std::cout << "w:" << std::endl;
+    for (size_t i = 0; i < wMat.getHeight(); ++i) {
+      for (size_t j = 0; j < wMat.getWidth(); ++j) {
+        std::cout << wMat.getElement(i, j) << " ";
+      }
+      std::cout << std::endl;
+    }
+
+    std::cout << "seqId:" << std::endl;
+    for (size_t i = 0; i < seqId.getSize(); ++i) {
+      std::cout << seqId.getElement(i) << " ";
+    }
+    std::cout << std::endl;
+
     RowConvGrad(outGMat, inMat, wMat, inGMat, wGMat, seqId);
+
+    std::cout << std::endl << "inG:" << std::endl;
+    for (size_t i = 0; i < inGMat.getHeight(); ++i) {
+      for (size_t j = 0; j < inGMat.getWidth(); ++j) {
+        std::cout << inGMat.getElement(i, j) << " ";
+      }
+      std::cout << std::endl;
+    }
   }
 };
 
@@ -166,7 +195,7 @@ REGISTER_TYPED_FUNC(RowConv, CPU, RowConvFunc);
 REGISTER_TYPED_FUNC(RowConvGrad, CPU, RowConvGradFunc);
 #ifndef PADDLE_ONLY_CPU
 REGISTER_TYPED_FUNC(RowConv, GPU, RowConvFunc);
-REGISTER_TYPED_FUNC(RowConvGrad, GPU, PadGradFunc);
+REGISTER_TYPED_FUNC(RowConvGrad, GPU, RowConvGradFunc);
 #endif
 
 } // namespace paddle
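Note: for context on what these kernels compute, row convolution (the
look-ahead convolution used in DeepSpeech2-style models) slides a
[context x width] filter forward in time within each sequence,
independently per feature dimension. A minimal CPU sketch of the
semantics, with illustrative names that are not part of this patch:

#include <vector>

// One sequence; x and y are row-major [steps x width], w is
// [context x width]. y accumulates (+=), matching the ADD_TO mode
// used by the test below.
void rowConvRef(std::vector<float>& y, const std::vector<float>& x,
                const std::vector<float>& w, int steps, int width,
                int context) {
  for (int i = 0; i < steps; ++i) {
    for (int d = 0; d < width; ++d) {
      float sum = 0.0f;
      // Mix the current row with the next (context - 1) rows,
      // clipped at the end of the sequence.
      for (int t = 0; t < context && i + t < steps; ++t) {
        sum += w[t * width + d] * x[(i + t) * width + d];
      }
      y[i * width + d] += sum;
    }
  }
}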
diff --git a/paddle/function/RowConvOpGpu.cu b/paddle/function/RowConvOpGpu.cu
new file mode 100644
index 0000000000..5b0e065a21
--- /dev/null
+++ b/paddle/function/RowConvOpGpu.cu
@@ -0,0 +1,329 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_base.h"
+#include "RowConvOp.h"
+
+namespace paddle {
+
+// Forward kernel for contextLength <= 32: the filter block for this
+// column tile is staged once into shared memory and reused for every
+// time step of every sequence.
+template <int BLOCK_H, int BLOCK_W>
+__global__ void KeRowConv(real* y, const real* x, const real* w,
+                          const int* starts, const int height,
+                          const int width, const int numSeq,
+                          const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+
+  __shared__ real sw[BLOCK_H][BLOCK_W];
+
+  for (int i = tidy; i < context; i += blky) {
+    sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0;
+  }
+
+  __syncthreads();
+
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = tidy; j < steps; j += blky) {
+      real sum = 0;
+      int off = (start + j) * width;
+      for (int t = 0; t < context; ++t) {
+        if ((start + j + t) < end) {
+          int xoff = off + t * width;
+          real xVal = gidx + tidx < width ? x[xoff + gidx + tidx] : 0.0;
+          sum += sw[t][tidx] * xVal;
+        }
+      }
+      if (gidx + tidx < width) {
+        y[off + gidx + tidx] += sum;
+      }
+    }
+  }
+}
+
+// Fallback forward kernel for arbitrary contextLength: reads the
+// filter directly from global memory.
+__global__ void KeRowConv2(real* y, const real* x, const real* w,
+                           const int* starts, const int height,
+                           const int width, const int numSeq,
+                           const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = tidy; j < steps; j += blky) {
+      int off = (start + j) * width;
+      real sum = 0;
+      for (int t = 0; t < context && (start + j + t) < end; ++t) {
+        int xoff = off + t * width;
+        real xd = gidx + tidx < width ? x[xoff + gidx + tidx] : 0.0;
+        real wd = gidx + tidx < width ? w[t * width + gidx + tidx] : 0.0;
+        sum += wd * xd;
+      }
+      if (gidx + tidx < width) {
+        y[off + gidx + tidx] += sum;
+      }
+    }
+  }
+}
+
+template <>
+void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out,
+                              const GpuMatrix& in,
+                              const GpuMatrix& filter,
+                              const GpuIVector& seq) {
+  const size_t numSeq = seq.getSize() - 1;
+  const size_t contextLength = filter.getHeight();
+  const size_t height = in.getHeight();
+  const size_t width = in.getWidth();
+
+  // Debug output (temporary).
+  LOG(INFO) << "numSeq=" << numSeq << " contextLength=" << contextLength
+            << " height=" << height << " width=" << width;
+
+  real* y = out.getData();
+  const real* x = in.getData();
+  const real* w = filter.getData();
+  const int* starts = seq.getData();
+
+  dim3 dimBlock(32, 32);
+  dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
+  LOG(INFO) << "dimGrid.x=" << dimGrid.x;
+
+  if (contextLength <= 32) {
+    KeRowConv<32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
+        (y, x, w, starts, height, width, numSeq, contextLength);
+  } else {
+    KeRowConv2<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
+        (y, x, w, starts, height, width, numSeq, contextLength);
+  }
+  CHECK_SYNC("RowConv");
+}
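Note: both forward kernels clip the filter window at the end of each
sequence via the (start + j + t) < end guard, so the last rows use
fewer taps. A tiny self-contained illustration with made-up numbers:

#include <cstdio>

int main() {
  const float x[4] = {1, 2, 3, 4};        // one sequence, width = 1
  const float w[3] = {0.5f, 0.3f, 0.2f};  // context = 3
  float y[4] = {0, 0, 0, 0};
  for (int i = 0; i < 4; ++i) {
    for (int t = 0; t < 3 && i + t < 4; ++t) {
      y[i] += w[t] * x[i + t];
    }
  }
  // The last two rows are clipped:
  // y = {1.7, 2.7, 3*0.5 + 4*0.3 = 2.7, 4*0.5 = 2.0}
  for (int i = 0; i < 4; ++i) printf("%.1f ", y[i]);
  return 0;
}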
+
+// Backward kernel for the filter gradient, specialized for small
+// context (the CONTEXT template argument is an upper bound on the
+// runtime contextLength). Partial products are reduced within each
+// warp and accumulated in shared memory.
+template <int BLOCK_H, int BLOCK_W, int CONTEXT>
+__global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
+                                  const int* starts, const int height,
+                                  const int width, const int numSeq,
+                                  const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+
+  __shared__ real sh_x[BLOCK_H][BLOCK_W];
+  __shared__ real sh_dy[BLOCK_H][BLOCK_W];
+  __shared__ real sh_dw[CONTEXT][BLOCK_W];
+
+  for (int t = tidy; t < context; t += blky) {
+    sh_dw[t][tidx] = 0.0;
+  }
+  __syncthreads();
+
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = tidy; j < steps; j += BLOCK_H) {
+      int xoff = gidx + tidx;
+      int yoff = start + j;
+
+      // transpose
+      sh_x[tidx][tidy] =
+          xoff < width && yoff < end ? x[yoff * width + xoff] : 0.0;
+      sh_dy[tidx][tidy] =
+          xoff < width && yoff < end ? dy[yoff * width + xoff] : 0.0;
+      __syncthreads();
+
+      for (int t = 0; t < context; t++) {
+        real val = tidx + t < blockDim.x
+                       ? sh_x[tidy][tidx + t] * sh_dy[tidy][tidx]
+                       : 0.0;
+        // Warp size and blockDim.x are both 32.
+        for (int offset = 16; offset > 0; offset /= 2) {
+          val += __shfl_down(val, offset);
+        }
+        if (tidx == 0) {
+          sh_dw[t][tidy] += val;
+        }
+        __syncthreads();
+      }
+    }
+  }
+
+  for (int t = tidy; t < context && (gidx + tidx) < width; t += blky) {
+    dw[t * width + gidx + tidx] += sh_dw[t][tidx];
+  }
+}
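Note: the inner loop above is the standard warp-shuffle sum. As a
standalone idiom, assuming blockDim.x equals the warp size of 32 (on
CUDA 9+ one would use __shfl_down_sync with a lane mask instead):

__device__ float warpSum32(float val) {
  // Each step folds the upper half of the active lanes onto the lower
  // half: 16, 8, 4, 2, 1.
  for (int offset = 16; offset > 0; offset /= 2) {
    val += __shfl_down(val, offset);
  }
  return val;  // lane 0 now holds the sum over all 32 lanes
}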
+
+// Fallback filter-gradient kernel for larger context: warp-reduced
+// partial sums are accumulated straight into global memory.
+template <int BLOCK_H, int BLOCK_W>
+__global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy,
+                                   const int* starts, const int height,
+                                   const int width, const int numSeq,
+                                   const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int gidx = blockIdx.x * blockDim.x;
+
+  __shared__ real sh_x[BLOCK_H][BLOCK_W];
+  __shared__ real sh_dy[BLOCK_H][BLOCK_W];
+
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = tidy; j < steps; j += BLOCK_H) {
+      int xoff = gidx + tidx;
+      int yoff = start + j;
+
+      // transpose
+      sh_x[tidx][tidy] =
+          xoff < width && yoff < end ? x[yoff * width + xoff] : 0.0;
+      sh_dy[tidx][tidy] =
+          xoff < width && yoff < end ? dy[yoff * width + xoff] : 0.0;
+      __syncthreads();
+
+      for (int t = 0; t < context; t++) {
+        real val = tidx + t < blockDim.x
+                       ? sh_x[tidy][tidx + t] * sh_dy[tidy][tidx]
+                       : 0.0;
+        // Warp size and blockDim.x are both 32.
+        for (int offset = 16; offset > 0; offset /= 2) {
+          val += __shfl_down(val, offset);
+        }
+        if (tidx == 0 && (gidx + tidy) < width) {
+          dw[t * width + gidx + tidy] += val;
+        }
+      }
+      // Guard the shared tiles before the next iteration overwrites them.
+      __syncthreads();
+    }
+  }
+}
+
+// Backward kernel for the data gradient, small context: the filter
+// block is cached in shared memory.
+template <int BLOCK_H, int BLOCK_W>
+__global__ void KeRowConvBwData(real* dx, const real* w, const real* dy,
+                                const int* starts, const int height,
+                                const int width, const int numSeq,
+                                const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+
+  __shared__ real sw[BLOCK_H][BLOCK_W];
+
+  for (int i = tidy; i < context; i += blky) {
+    sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0;
+  }
+
+  __syncthreads();
+
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = tidy; j < steps; j += blky) {
+      real sum = 0;
+      int off = (start + j) * width;
+      for (int t = 0; t < context && (j - t) >= 0; ++t) {
+        int dyOff = off - t * width;
+        real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0;
+        sum += sw[t][tidx] * dyVal;
+      }
+      if (gidx + tidx < width) {
+        dx[off + gidx + tidx] += sum;
+      }
+    }
+  }
+}
+
+// Fallback data-gradient kernel for arbitrary context: reads the
+// filter directly from global memory.
+__global__ void KeRowConvBwData2(real* dx, const real* w, const real* dy,
+                                 const int* starts, const int height,
+                                 const int width, const int numSeq,
+                                 const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = tidy; j < steps; j += blky) {
+      real sum = 0;
+      int off = (start + j) * width;
+      for (int t = 0; t < context && (j - t) >= 0; ++t) {
+        int dyOff = off - t * width;
+        real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0;
+        real wVal = gidx + tidx < width ? w[t * width + gidx + tidx] : 0.0;
+        sum += wVal * dyVal;
+      }
+      if (gidx + tidx < width) {
+        dx[off + gidx + tidx] += sum;
+      }
+    }
+  }
+}
+
+template <>
+void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
+                                  const GpuMatrix& in,
+                                  const GpuMatrix& filter,
+                                  GpuMatrix& inG,
+                                  GpuMatrix& filterG,
+                                  const GpuIVector& seq) {
+  const size_t numSeq = seq.getSize() - 1;
+  const size_t contextLength = filter.getHeight();
+  const size_t height = in.getHeight();
+  const size_t width = in.getWidth();
+
+  const real* dy = outG.getData();
+  const real* x = in.getData();
+  const real* w = filter.getData();
+  real* dx = inG.getData();
+  real* dw = filterG.getData();
+  const int* starts = seq.getData();
+
+  dim3 dimBlock(32, 32);
+  dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
+
+  if (contextLength <= 16) {
+    KeRowConvBwWeight<32, 32, 16><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
+        (dw, x, dy, starts, height, width, numSeq, contextLength);
+  } else {
+    KeRowConvBwWeight2<32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
+        (dw, x, dy, starts, height, width, numSeq, contextLength);
+  }
+
+  dim3 dimBlock2(32, 32);
+  dim3 dimGrid2(DIVUP(width, dimBlock2.x), 1);
+  if (contextLength <= 64) {
+    KeRowConvBwData<32, 64><<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>
+        (dx, w, dy, starts, height, width, numSeq, contextLength);
+  } else {
+    KeRowConvBwData2<<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>
+        (dx, w, dy, starts, height, width, numSeq, contextLength);
+  }
+
+  CHECK_SYNC("RowConvGrad");
+}
+
+} // namespace paddle
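Note: the four backward kernels are instances of the chain rule for
y[j] = sum_t w[t] * x[j + t], applied per sequence and per feature
dimension: dw[t] accumulates dy[j] * x[j + t], and dx[j] accumulates
w[t] * dy[j - t]. A minimal CPU sketch with illustrative names that
are not part of the patch:

#include <vector>

// One sequence; dy, x, dx are [steps x width], w and dw are
// [context x width]. Gradients accumulate (+=), as in the kernels.
void rowConvGradRef(const std::vector<float>& dy,
                    const std::vector<float>& x,
                    const std::vector<float>& w, std::vector<float>& dx,
                    std::vector<float>& dw, int steps, int width,
                    int context) {
  for (int d = 0; d < width; ++d) {
    // Filter gradient (what KeRowConvBwWeight/-2 reduce across warps).
    for (int t = 0; t < context; ++t) {
      for (int j = 0; j + t < steps; ++j) {
        dw[t * width + d] += dy[j * width + d] * x[(j + t) * width + d];
      }
    }
    // Data gradient (what KeRowConvBwData/-2 compute directly).
    for (int j = 0; j < steps; ++j) {
      for (int t = 0; t < context && j - t >= 0; ++t) {
        dx[j * width + d] += w[t * width + d] * dy[(j - t) * width + d];
      }
    }
  }
}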
diff --git a/paddle/function/RowConvOpTest.cpp b/paddle/function/RowConvOpTest.cpp
new file mode 100644
index 0000000000..9898df1a97
--- /dev/null
+++ b/paddle/function/RowConvOpTest.cpp
@@ -0,0 +1,69 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+
+namespace paddle {
+
+void testRowConvFw(size_t batchSize, size_t dim, size_t contextLength) {
+  FunctionCompare test("RowConv", FuncConfig());
+
+  test.addSequence(SequenceIdArg(TensorShape{batchSize}));
+  test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{contextLength, dim}));
+
+  test.addOutputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}),
+                  ADD_TO);
+
+  test.run();
+}
+
+void testRowConvBw(size_t batchSize, size_t dim, size_t contextLength) {
+  FunctionCompare test("RowConvGrad", FuncConfig());
+
+  test.addSequence(SequenceIdArg(TensorShape{batchSize}));
+  test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}));
+  test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{contextLength, dim}));
+
+  test.addOutputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}),
+                  ADD_TO);
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{contextLength, dim}),
+                  ADD_TO);
+
+  test.run();
+}
+
+TEST(RowConv, real) {
+  // TODO(qingqing): restore the full sweep once RowConvGrad is verified:
+  //   for (size_t numSamples : {17, 129})
+  //     for (size_t dim : {16, 248})
+  //       for (size_t context : {3, 7, 65})
+  // For now, run a single small configuration while debugging.
+  size_t numSamples = 17;
+  size_t dim = 16;
+  size_t context = 3;
+  LOG(INFO) << " numSamples=" << numSamples << " dim=" << dim
+            << " context length=" << context;
+  testRowConvFw(numSamples, dim, context);
+  // testRowConvBw(numSamples, dim, context);
+}
+
+} // namespace paddle
--
GitLab
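Note: once the backward CPU/GPU comparison passes, the TEST body can
presumably be restored to the full sweep sketched in the TODO comment
above, along the lines of:

TEST(RowConv, real) {
  for (size_t numSamples : {17, 129}) {
    for (size_t dim : {16, 248}) {
      for (size_t context : {3, 7, 65}) {
        LOG(INFO) << " numSamples=" << numSamples << " dim=" << dim
                  << " context length=" << context;
        testRowConvFw(numSamples, dim, context);
        testRowConvBw(numSamples, dim, context);
      }
    }
  }
}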