diff --git a/paddle/gserver/layers/SequenceToBatch.cpp b/paddle/gserver/layers/SequenceToBatch.cpp index 5fa7b6f4881b9582b540a5b1bfe849220cc2a4ea..7892d475a62c32b9763bfb999d176dcef500ebb2 100644 --- a/paddle/gserver/layers/SequenceToBatch.cpp +++ b/paddle/gserver/layers/SequenceToBatch.cpp @@ -171,12 +171,25 @@ void SequenceToBatch::sequence2BatchCopy(Matrix &batch, hl_sequence2batch_copy( batchData, seqData, idxData, seqWidth, batchCount, seq2batch); } else { - for (int i = 0; i < batchCount; ++i) { - if (seq2batch) { - memcpy(batch.rowBuf(i), - sequence.rowBuf(idxData[i]), - seqWidth * sizeof(real)); - } else { + if (seq2batch) { + const int blockMemSize = 8 * 1024; + const int blockSize = blockMemSize / sizeof(real); +#ifdef PADDLE_USE_MKLML +#pragma omp parallel for collapse(2) +#endif + for (int i = 0; i < batchCount; ++i) { + for (int j = 0; j < seqWidth; j += blockSize) { + memcpy(batch.rowBuf(i) + j, + sequence.rowBuf(idxData[i]) + j, + (j + blockSize > seqWidth) ? (seqWidth - j) * sizeof(real) + : blockMemSize); + } + } + } else { +#ifdef PADDLE_USE_MKLML +#pragma omp parallel for +#endif + for (int i = 0; i < batchCount; ++i) { memcpy(sequence.rowBuf(idxData[i]), batch.rowBuf(i), seqWidth * sizeof(real));