From 84cb542c13905f5113a6e91d86129ec7a1a90e06 Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Fri, 15 Dec 2017 17:23:03 +0800
Subject: [PATCH] use intel openmp to speedup seq2batch when WITH_MKL

---
 paddle/gserver/layers/SequenceToBatch.cpp | 25 +++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/paddle/gserver/layers/SequenceToBatch.cpp b/paddle/gserver/layers/SequenceToBatch.cpp
index 5fa7b6f4881..7892d475a62 100644
--- a/paddle/gserver/layers/SequenceToBatch.cpp
+++ b/paddle/gserver/layers/SequenceToBatch.cpp
@@ -171,12 +171,25 @@ void SequenceToBatch::sequence2BatchCopy(Matrix &batch,
     hl_sequence2batch_copy(
         batchData, seqData, idxData, seqWidth, batchCount, seq2batch);
   } else {
-    for (int i = 0; i < batchCount; ++i) {
-      if (seq2batch) {
-        memcpy(batch.rowBuf(i),
-               sequence.rowBuf(idxData[i]),
-               seqWidth * sizeof(real));
-      } else {
+    if (seq2batch) {
+      const int blockMemSize = 8 * 1024;
+      const int blockSize = blockMemSize / sizeof(real);
+#ifdef PADDLE_USE_MKLML
+#pragma omp parallel for collapse(2)
+#endif
+      for (int i = 0; i < batchCount; ++i) {
+        for (int j = 0; j < seqWidth; j += blockSize) {
+          memcpy(batch.rowBuf(i) + j,
+                 sequence.rowBuf(idxData[i]) + j,
+                 (j + blockSize > seqWidth) ? (seqWidth - j) * sizeof(real)
+                                            : blockMemSize);
+        }
+      }
+    } else {
+#ifdef PADDLE_USE_MKLML
+#pragma omp parallel for
+#endif
+      for (int i = 0; i < batchCount; ++i) {
         memcpy(sequence.rowBuf(idxData[i]),
                batch.rowBuf(i),
                seqWidth * sizeof(real));
-- 
GitLab
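
The core of the change: the old seq2batch path copied one full row per memcpy with no parallelism; the new path splits each row into 8 KiB blocks and collapses the (row, block) loops into one OpenMP iteration space, so threads stay busy even when batchCount is small relative to the thread count. Below is a minimal, self-contained C++ sketch of that blocked-copy technique, not Paddle's code: `real`, `gatherRows`, `kBlockMemSize`, and the driver in `main` are hypothetical names for illustration, and the pragma is unconditional here rather than guarded by PADDLE_USE_MKLML as in the patch.

// Minimal sketch (not Paddle's code) of the blocked, OpenMP-parallel row
// gather introduced by the patch above. `real`, `gatherRows`, and
// `kBlockMemSize` are illustrative stand-ins.
// Build: g++ -O2 -fopenmp sketch.cpp  (without -fopenmp the pragma is
// ignored and the copy runs serially, with identical results)
#include <cstdio>
#include <cstring>
#include <vector>

using real = float;  // Paddle's `real` is float or double depending on build

// Copy `count` rows of `width` elements so that row i of `dst` comes from
// row idx[i] of `src`. Each row is copied in 8 KiB blocks; collapse(2)
// flattens the (row, block) loops into a single iteration space so OpenMP
// can distribute count * ceil(width / kBlockSize) chunks across threads.
void gatherRows(real *dst, const real *src, const int *idx,
                int count, int width) {
  const int kBlockMemSize = 8 * 1024;                   // bytes per memcpy
  const int kBlockSize = kBlockMemSize / sizeof(real);  // elements per memcpy
#pragma omp parallel for collapse(2)
  for (int i = 0; i < count; ++i) {
    for (int j = 0; j < width; j += kBlockSize) {
      // The last block of a row may be shorter than a full block.
      const size_t bytes = (j + kBlockSize > width)
                               ? (width - j) * sizeof(real)
                               : (size_t)kBlockMemSize;
      std::memcpy(dst + (size_t)i * width + j,
                  src + (size_t)idx[i] * width + j,
                  bytes);
    }
  }
}

int main() {
  const int count = 4, width = 5000;  // each row spans several 8 KiB blocks
  std::vector<real> src((size_t)count * width), dst((size_t)count * width);
  std::vector<int> idx = {2, 0, 3, 1};  // arbitrary row permutation
  for (size_t k = 0; k < src.size(); ++k) src[k] = (real)k;
  gatherRows(dst.data(), src.data(), idx.data(), count, width);
  // Row 0 of dst should start where row idx[0] = 2 of src starts.
  std::printf("dst[0] = %.0f (expected %d)\n", (double)dst[0], 2 * width);
  return 0;
}

The 8 KiB block size mirrors the patch's blockMemSize; the patch does not state the rationale, but presumably it balances per-memcpy overhead against scheduling granularity. Note the reverse (batch-to-sequence) branch in the patch parallelizes only over rows, since different sequence rows may alias the same destination if idxData repeats indices, which rules out finer-grained splitting there being necessary for load balance alone.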