Fix bug for WarpCTCLayer.

cc5f0951 · dangqingqing · 44486b6f · cc5f0951 · cc5f0951
隐藏空白更改
内联并排

Showing 2 changed files with 74 additions and 2 deletions

paddle/cuda/src/hl_cuda_sequence.cu paddle/cuda/src/hl_cuda_sequence.cu +1 -2

paddle/math/tests/test_matrixCompare.cpp paddle/math/tests/test_matrixCompare.cpp +73 -0

未找到文件。
--- a/paddle/cuda/src/hl_cuda_sequence.cu
+++ b/paddle/cuda/src/hl_cuda_sequence.cu
@@ -269,8 +269,7 @@ void hl_sequence2batch_copy_padding(real* batch,
  int blockDimY = CUDA_BLOCK_SIZE / blockDimX;
  dim3 threads(blockDimX, blockDimY);
-  int gridDimX = (maxSequenceLength * blockDimX + CUDA_BLOCK_SIZE - 1) /
-      CUDA_BLOCK_SIZE;
+  int gridDimX = (maxSequenceLength + blockDimY - 1)/blockDimY;
  int gridDimY = numSequences;
  dim3 grid(gridDimX, gridDimY);

--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -30,6 +30,8 @@ using namespace std;     // NOLINT
 using autotest::TensorCheckEqual;
 using autotest::TensorCheckErr;
+// clang-format off
 void testMatrixMaxSequence(int batchSize, int inputDim) {
  // forward
  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
@@ -1141,4 +1143,75 @@ TEST(CpuMatrix, copyFrom) {
  TensorCheckEqual(cpu, copy);
 }
+/**
+ * Compares hl_sequence2batch_copy_padding against a CPU reference that
+ * performs the same padded sequence-to-batch copy.
+ *
+ * Reference layout: the row for time step i of sequence j lives at row
+ * (i * numSeq + j) of the batch; steps past the end of a sequence are
+ * zero-filled.
+ *
+ * @param batchSize  number of rows in the randomly filled input matrices;
+ *                   also handed to generateSequenceStartPositions.
+ * @param inputDim   number of values per row.
+ */
+void testBatch2seqPadding(int batchSize, int inputDim) {
+  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
+  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
+  cpuInput->randomizeUniform();
+  gpuInput->copyFrom(*cpuInput);
+  // Sequence start offsets, plus a second copy for the GPU routine.
+  // NOTE(review): the `true` flag presumably requests a GPU-resident
+  // vector -- confirm against IVector::create.
+  IVectorPtr cpuSequence;
+  generateSequenceStartPositions(batchSize, cpuSequence);
+  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
+  gpuSequence->copyFrom(*cpuSequence);
+  const int* seqStart = cpuSequence->getData();
+  const size_t numSeq = cpuSequence->getSize() - 1;
+  // The padded batch needs one slot per time step of the LONGEST sequence,
+  // so take the maximum of the lengths (differences of consecutive start
+  // offsets), not the maximum of the offsets themselves.
+  size_t maxSeqLen = 0;
+  for (size_t j = 0; j < numSeq; j++) {
+    const size_t length = static_cast<size_t>(seqStart[j + 1] - seqStart[j]);
+    maxSeqLen = std::max(maxSeqLen, length);
+  }
+  MatrixPtr cBatch = std::make_shared<CpuMatrix>(numSeq * maxSeqLen, inputDim);
+  MatrixPtr gBatch = std::make_shared<GpuMatrix>(numSeq * maxSeqLen, inputDim);
+  MatrixPtr cCheck = std::make_shared<CpuMatrix>(numSeq * maxSeqLen, inputDim);
+  // GPU path. The start offsets come from gpuSequence so the routine does
+  // not depend on the host vector's buffer being reachable from the device.
+  // NOTE(review): the trailing flags are presumably normByTimes=false and
+  // seq2batch=true -- confirm against the declaration in hl_sequence.h.
+  hl_sequence2batch_copy_padding(gBatch->getData(),
+                                 gpuInput->getData(),
+                                 gpuSequence->getData(),
+                                 inputDim,
+                                 maxSeqLen,
+                                 numSeq,
+                                 false,
+                                 true);
+  cCheck->copyFrom(*gBatch);
+  // CPU reference. Buffers are typed `real` to agree with the sizeof(real)
+  // byte counts below, whatever `real` is configured to be.
+  real* batchData = cBatch->getData();
+  const real* seqData = cpuInput->getData();
+  const size_t rowBytes = inputDim * sizeof(real);
+  for (size_t i = 0; i < maxSeqLen; i++) {
+    for (size_t j = 0; j < numSeq; j++) {
+      const size_t sequenceStart = seqStart[j];
+      const size_t sequenceLength = seqStart[j + 1] - seqStart[j];
+      real* dst = batchData + (i * numSeq + j) * inputDim;
+      if (i < sequenceLength) {
+        memcpy(dst, seqData + (sequenceStart + i) * inputDim, rowBytes);
+      } else {
+        // Past the end of this sequence: pad with zeros.
+        memset(dst, 0, rowBytes);
+      }
+    }
+  }
+  TensorCheckErr(*cBatch, *cCheck);
+}
+// Runs the padded sequence-to-batch comparison over every combination of
+// the batch sizes and input widths listed below.
+TEST(Matrix, warpCTC) {
+  const int batchSizes[] = {51, 1285, 3884};
+  const int inputDims[] = {32, 512, 3026};
+  for (int batchSize : batchSizes) {
+    for (int inputDim : inputDims) {
+      VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim;
+      testBatch2seqPadding(batchSize, inputDim);
+    }
+  }
+}
 #endif