提交 b3ac51ff 编写于 作者: D dangqingqing

GPU implementation of row conv.

上级 a1815867
......@@ -28,6 +28,7 @@ if(WITH_TESTING)
add_simple_unittest(PadOpTest)
add_simple_unittest(MulOpTest)
add_simple_unittest(CosSimOpTest)
add_simple_unittest(RowConvOpTest)
endif()
endif()
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "RowConvOp.h"
#include <iostream>
#include "paddle/math/Vector.h"
namespace paddle {
......@@ -127,10 +128,8 @@ public:
RowConv<Device>(outMat, inMat, wMat, seqId);
}
};
/**
* \brief The backward propagation of padding Function. Remove the elements
* in the padding positions of forward.
* \brief TODO(qingqing)
*
* Argument in this Function:
*/
......@@ -158,7 +157,37 @@ public:
: typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
const auto seqId = in.getSequenceId().vector<int, Device>();
std::cout << "in:" << std::endl;
for (int i = 0; i < inMat.getHeight(); ++i) {
for (int j = 0; j < inMat.getWidth(); ++j) {
std::cout << outGMat.getElement(i, j) << " ";
}
std::cout << std::endl;
}
std::cout << "w:" << std::endl;
for (int i = 0; i < wMat.getHeight(); ++i) {
for (int j = 0; j < wMat.getWidth(); ++j) {
std::cout << wMat.getElement(i, j) << " ";
}
std::cout << std::endl;
}
std::cout << "w:" << std::endl;
for (int i = 0; i < seqId.getSize(); ++i) {
std::cout << seqId.getElement(i) << " ";
}
std::cout << std::endl;
RowConvGrad<Device>(outGMat, inMat, wMat, inGMat, wGMat, seqId);
std::cout << std::endl << "out:" << std::endl;
for (int i = 0; i < inGMat.getHeight(); ++i) {
for (int j = 0; j < inGMat.getWidth(); ++j) {
std::cout << inGMat.getElement(i, j) << " ";
}
std::cout << std::endl;
}
}
};
......@@ -166,7 +195,7 @@ REGISTER_TYPED_FUNC(RowConv, CPU, RowConvFunc);
REGISTER_TYPED_FUNC(RowConvGrad, CPU, RowConvGradFunc);
#ifndef PADDLE_ONLY_CPU
REGISTER_TYPED_FUNC(RowConv, GPU, RowConvFunc);
REGISTER_TYPED_FUNC(RowConvGrad, GPU, PadGradFunc);
REGISTER_TYPED_FUNC(RowConvGrad, GPU, RowConvGradFunc);
#endif
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "hl_base.h"
#include "RowConvOp.h"
namespace paddle {
template<int BLOCK_H, int BLOCK_W>
__global__ void KeRowConv(real* y, const real* x, const real* w,
const int* starts, const int height, const int width,
const int numSeq, const int context) {
const int tidx = threadIdx.x;
const int tidy = threadIdx.y;
const int blky = blockDim.y;
const int gidx = blockIdx.x * blockDim.x;
__shared__ real sw[BLOCK_H][BLOCK_W];
for (int i = tidy; i < context; i += blky) {
sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0;
}
__syncthreads();
for (int i = 0; i < numSeq; ++i) {
const int start = starts[i];
const int end = starts[i + 1];
const int steps = end - start;
for (int j = tidy; j < steps; j += blky) {
real sum = 0;
int off = (start + j) * width;
for (int t = 0; t < context; ++t) {
if ((start + j + t) < end) {
int xoff = off + t * width;
real xVal = gidx + tidx < width ? x[xoff + gidx + tidx] : 0.0;
sum += sw[t][tidx] * xVal;
}
}
if (gidx + tidx < width) {
y[off + gidx + tidx] += sum;
}
}
}
}
__global__ void KeRowConv2(real* y, const real* x, const real* w,
const int* starts, const int height, const int width,
const int numSeq, const int context) {
const int tidx = threadIdx.x;
const int tidy = threadIdx.y;
const int blky = blockDim.y;
const int gidx = blockIdx.x * blockDim.x;
for (int i = 0; i < numSeq; ++i) {
const int start = starts[i];
const int end = starts[i + 1];
const int steps = end - start;
for (int j = tidy; j < steps; j += blky) {
int off = (start + j) * width;
real sum = 0;
for (int t = 0; t < context && (start + j + t) < end; ++t) {
int xoff = off + t * width;
real xd = gidx + tidx < width ? x[xoff + gidx + tidx] : 0.0;
real wd = gidx + tidx < width ? w[t * width + gidx + tidx] : 0.0;
sum += wd * xd;
}
if (gidx + tidx < width) {
y[off + gidx + tidx] += sum;
}
}
}
}
template <>
void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out,
const GpuMatrix& in,
const GpuMatrix& filter,
const GpuIVector& seq) {
const size_t numSeq = seq.getSize() - 1;
const size_t contextLength = filter.getHeight();
const size_t height = in.getHeight();
const size_t width = in.getWidth();
LOG(INFO) << numSeq;
LOG(INFO) << contextLength;
LOG(INFO) << height;
LOG(INFO) << width;
real* y = out.getData();
const real* x = in.getData();
const real* w = filter.getData();
const int* starts = seq.getData();
dim3 dimBlock(32, 32);
dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
LOG(INFO) << dimGrid.x;
if (contextLength <= 32) {
KeRowConv<32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
(y, x, w, starts, height, width, numSeq, contextLength);
} else {
KeRowConv2<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
(y, x, w, starts, height, width, numSeq, contextLength);
}
CHECK_SYNC("RowConv");
}
template<int BLOCK_H, int BLOCK_W, int CONTEXT>
__global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
const int* starts, const int height, const int width, const int numSeq,
const int context) {
const int tidx = threadIdx.x;
const int tidy = threadIdx.y;
const int blky = blockDim.y;
const int gidx = blockIdx.x * blockDim.x;
__shared__ real sh_x[BLOCK_H][BLOCK_W];
__shared__ real sh_dy[BLOCK_H][BLOCK_W];
__shared__ real sh_dw[CONTEXT][BLOCK_W];
for (int t = tidy; t < context; t += blky) {
sh_dw[t][tidx] = 0.0;
}
__syncthreads();
for (int i = 0; i < numSeq; ++i) {
const int start = starts[i];
const int end = starts[i + 1];
const int steps = end - start;
for (int j = tidy; j < steps; j += BLOCK_H) {
int xoff = gidx + tidx;
int yoff = start + j;
// transpose
sh_x[tidx][tidy] = xoff < width && yoff < end ? x[yoff * width + xoff] : 0.0;
sh_dy[tidx][tidy] = xoff < width && yoff < end ? dy[yoff * width + xoff] : 0.0;
__syncthreads();
for (int t = 0; t < context; t++) {
real val = tidx + t < blockDim.x ? sh_x[tidy][tidx + t] * sh_dy[tidy][tidx]: 0.0;
// warp size and blockDim.x is 32.
for (int offset = 16; offset > 0; offset /= 2) {
val += __shfl_down(val, offset);
}
if (tidx == 0) {
sh_dw[t][tidy] += val;
}
__syncthreads();
}
}
}
for (int t = tidy; t < context && (gidx + tidx) < width; t += blky) {
dw[t * width + gidx + tidx] += sh_dw[t][tidx];
}
}
template<int BLOCK_H, int BLOCK_W>
__global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy,
const int* starts, const int height, const int width, const int numSeq,
const int context) {
const int tidx = threadIdx.x;
const int tidy = threadIdx.y;
const int gidx = blockIdx.x * blockDim.x;
__shared__ real sh_x[BLOCK_H][BLOCK_W];
__shared__ real sh_dy[BLOCK_H][BLOCK_W];
for (int i = 0; i < numSeq; ++i) {
const int start = starts[i];
const int end = starts[i + 1];
const int steps = end - start;
for (int j = 0; j < steps; j += BLOCK_H) {
int xoff = gidx + tidx;
int yoff = start + j;
// transpose
sh_x[tidx][tidy] = xoff < width && yoff < end ? x[yoff * width + xoff] : 0.0;
sh_dy[tidx][tidy] = xoff < width && yoff < end ? dy[yoff * width + xoff] : 0.0;
__syncthreads();
for (int t = 0; t < context; t++) {
real val = tidx + t < blockDim.x ? sh_x[tidy][tidx + t] * sh_dy[tidy][tidx]: 0.0;
// warp size and blockDim.x is 32.
for (int offset = 16; offset > 0; offset /= 2) {
val += __shfl_down(val, offset);
}
if (tidx == 0 && (gidx + tidy) < width) {
dw[t*width + gidx + tidy] += val;
}
}
}
}
}
template<int BLOCK_H, int BLOCK_W>
__global__ void KeRowConvBwData(real* dx, const real* w, const real* dy,
const int* starts, const int height, const int width, const int numSeq,
const int context) {
const int tidx = threadIdx.x;
const int tidy = threadIdx.y;
const int blky = blockDim.y;
const int gidx = blockIdx.x * blockDim.x;
__shared__ real sw[BLOCK_H][BLOCK_W];
for (int i = tidy; i < context; i += blky) {
sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0;
}
__syncthreads();
for (int i = 0; i < numSeq; ++i) {
const int start = starts[i];
const int end = starts[i + 1];
const int steps = end - start;
for (int j = tidy; j < steps; j += blky) {
real sum = 0;
int off = (start + j) * width;
for (int t = 0; t < context && (j - t) >= 0; ++t) {
int dyOff = off - t * width;
real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0;
sum += sw[t][tidx] * dyVal;
}
if (gidx + tidx < width) {
dx[off + gidx + tidx] += sum;
}
}
}
}
__global__ void KeRowConvBwData2(real* dx, const real* w, const real* dy,
const int* starts, const int height, const int width, const int numSeq,
const int context) {
const int tidx = threadIdx.x;
const int tidy = threadIdx.y;
const int blky = blockDim.y;
const int gidx = blockIdx.x * blockDim.x;
for (int i = 0; i < numSeq; ++i) {
const int start = starts[i];
const int end = starts[i + 1];
const int steps = end - start;
for (int j = tidy; j < steps; j += blky) {
real sum = 0;
int off = (start + j) * width;
for (int t = 0; t < context && (j - t) >= 0; ++t) {
int dyOff = off - t * width;
real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0;
real wVal = gidx + tidx < width ? w[t * width + gidx + tidx] : 0.0;
sum += wVal * dyVal;
}
if (gidx + tidx < width) {
dx[off + gidx + tidx] += sum;
}
}
}
}
template <>
void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
const GpuMatrix& in,
const GpuMatrix& filter,
GpuMatrix& inG,
GpuMatrix& filterG,
const GpuIVector& seq) {
const size_t numSeq = seq.getSize() - 1;
const size_t contextLength = filter.getHeight();
const size_t height = in.getHeight();
const size_t width = in.getWidth();
const real* dy = outG.getData();
const real* x = in.getData();
const real* w = filter.getData();
real* dx = inG.getData();
real* dw = filterG.getData();
const int* starts = seq.getData();
dim3 dimBlock(32, 32);
dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
if (contextLength <= 16) {
KeRowConvBwWeight<32, 32, 16>
<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
(dw, x, dy, starts, height, width, numSeq, contextLength);
} else {
KeRowConvBwWeight2<32, 32>
<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
(dw, x, dy, starts, height, width, numSeq, contextLength);
}
dim3 dimBlock2(32, 32);
dim3 dimGrid2(DIVUP(width, dimBlock2.x), 1);
if (contextLength <= 64) {
KeRowConvBwData<32, 64>
<<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>
(dx, w, dy, starts, height, width, numSeq, contextLength);
} else {
KeRowConvBwData2
<<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>
(dx, w, dy, starts, height, width, numSeq, contextLength);
}
CHECK_SYNC("RowConvGrad");
}
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "FunctionTest.h"
namespace paddle {
void testRowConvFw(size_t batchSize, size_t dim, size_t contextLength) {
FunctionCompare test("RowConv", FuncConfig());
test.addSequence(SequenceIdArg(TensorShape{batchSize}));
test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}));
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{contextLength, dim}));
test.addOutputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}),
ADD_TO);
test.run();
}
void testRowConvBw(size_t batchSize, size_t dim, size_t contextLength) {
FunctionCompare test("RowConvGrad", FuncConfig());
test.addSequence(SequenceIdArg(TensorShape{batchSize}));
test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}));
test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}));
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{contextLength, dim}));
test.addOutputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}),
ADD_TO);
test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{contextLength, dim}),
ADD_TO);
test.run();
}
TEST(RowConv, real) {
// for (size_t numSamples : {17, 129}) {
// for (size_t dim : {16, 248}) {
// for (size_t context: {3, 7, 65}) {
LOG(INFO) << "===========";
// for (size_t numSamples : {17}) {
// for (size_t dim : {16}) {
// for (size_t context: {3}) {
size_t numSamples = 17;
size_t dim = 16;
size_t context = 3;
LOG(INFO) << " numSamples=" << numSamples << " dim=" << dim
<< " context length=" << context;
testRowConvFw(numSamples, dim, context);
// testRowConvBw(numSamples, dim, context);
// }
// }
// }
}
} // namespace paddle
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册