From cc5f0951ec8a83366038f2497133eaad9241fb47 Mon Sep 17 00:00:00 2001
From: dangqingqing
Date: Fri, 28 Jul 2017 22:38:20 +0800
Subject: [PATCH] Fix bug for WarpCTCLayer.

---
 paddle/cuda/src/hl_cuda_sequence.cu      |  3 +-
 paddle/math/tests/test_matrixCompare.cpp | 73 ++++++++++++++++++++++++
 2 files changed, 74 insertions(+), 2 deletions(-)

diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu
index 4f650ce03c..c728219849 100644
--- a/paddle/cuda/src/hl_cuda_sequence.cu
+++ b/paddle/cuda/src/hl_cuda_sequence.cu
@@ -269,8 +269,7 @@ void hl_sequence2batch_copy_padding(real* batch,
   int blockDimY = CUDA_BLOCK_SIZE / blockDimX;
   dim3 threads(blockDimX, blockDimY);
 
-  int gridDimX = (maxSequenceLength * blockDimX + CUDA_BLOCK_SIZE - 1) /
-                 CUDA_BLOCK_SIZE;
+  int gridDimX = (maxSequenceLength + blockDimY - 1)/blockDimY;
   int gridDimY = numSequences;
   dim3 grid(gridDimX, gridDimY);
 
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index 354f58df39..a0101d3f30 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -30,6 +30,8 @@ using namespace std;  // NOLINT
 using autotest::TensorCheckEqual;
 using autotest::TensorCheckErr;
 
+// clang-format off
+
 void testMatrixMaxSequence(int batchSize, int inputDim) {
   // forward
   MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
@@ -1141,4 +1143,75 @@ TEST(CpuMatrix, copyFrom) {
   TensorCheckEqual(cpu, copy);
 }
 
+void testBatch2seqPadding(int batchSize, int inputDim) {
+  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
+  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
+  cpuInput->randomizeUniform();
+  gpuInput->copyFrom(*cpuInput);
+
+  IVectorPtr cpuSequence;
+  generateSequenceStartPositions(batchSize, cpuSequence);
+  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
+  gpuSequence->copyFrom(*cpuSequence);
+
+  int newBatchSize = cpuSequence->getSize() - 1;
+  MatrixPtr
cpuOutput = std::make_shared<CpuMatrix>(newBatchSize, inputDim);
+  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(newBatchSize, inputDim);
+  cpuOutput->zero();
+  gpuOutput->zero();
+
+
+  size_t maxSeqLen = 0;
+  size_t numSeq = cpuSequence->getSize() - 1;
+  maxSeqLen = *std::max_element(
+      cpuSequence->getData(), cpuSequence->getData() + numSeq);
+
+  MatrixPtr cBatch = std::make_shared<CpuMatrix>(numSeq * maxSeqLen, inputDim);
+  MatrixPtr gBatch = std::make_shared<GpuMatrix>(numSeq * maxSeqLen, inputDim);
+  MatrixPtr cCheck = std::make_shared<CpuMatrix>(numSeq * maxSeqLen, inputDim);
+
+  hl_sequence2batch_copy_padding(gBatch->getData(),
+                                 gpuInput->getData(),
+                                 cpuSequence->getData(),
+                                 inputDim,
+                                 maxSeqLen,
+                                 numSeq,
+                                 false,
+                                 true);
+  cCheck->copyFrom(*gBatch);
+
+  // CPU
+
+  int* seqStart = cpuSequence->getData();
+  float* batchData = cBatch->getData();
+  float* seqData = cpuInput->getData();
+  for (size_t i = 0; i < maxSeqLen; i++) {
+    for (size_t j = 0; j < numSeq; j++) {
+      size_t sequenceStart = seqStart[j];
+      size_t sequenceLength = seqStart[j + 1] - seqStart[j];
+      if (i < sequenceLength) {
+        memcpy(batchData + (i * numSeq + j) * inputDim,
+               seqData + (sequenceStart + i) * inputDim,
+               inputDim * sizeof(real));
+      } else {
+        memset(batchData + (i * numSeq + j) * inputDim,
+               0,
+               inputDim * sizeof(real));
+      }
+    }
+  }
+
+  TensorCheckErr(*cBatch, *cCheck);
+}
+
+
+TEST(Matrix, warpCTC) {
+  for (auto batchSize : {51, 1285, 3884}) {
+    for (auto inputDim : {32, 512, 3026}) {
+      VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim;
+      testBatch2seqPadding(batchSize, inputDim);
+    }
+  }
+}
+
 #endif
-- 
GitLab