提交 84cb542c 编写于 作者: T tensor-tang

use intel openmp to speedup seq2batch when WITH_MKL

上级 480a5446
...@@ -171,12 +171,25 @@ void SequenceToBatch::sequence2BatchCopy(Matrix &batch, ...@@ -171,12 +171,25 @@ void SequenceToBatch::sequence2BatchCopy(Matrix &batch,
hl_sequence2batch_copy( hl_sequence2batch_copy(
batchData, seqData, idxData, seqWidth, batchCount, seq2batch); batchData, seqData, idxData, seqWidth, batchCount, seq2batch);
} else { } else {
for (int i = 0; i < batchCount; ++i) {
if (seq2batch) { if (seq2batch) {
memcpy(batch.rowBuf(i), const int blockMemSize = 8 * 1024;
sequence.rowBuf(idxData[i]), const int blockSize = blockMemSize / sizeof(real);
seqWidth * sizeof(real)); #ifdef PADDLE_USE_MKLML
#pragma omp parallel for collapse(2)
#endif
for (int i = 0; i < batchCount; ++i) {
for (int j = 0; j < seqWidth; j += blockSize) {
memcpy(batch.rowBuf(i) + j,
sequence.rowBuf(idxData[i]) + j,
(j + blockSize > seqWidth) ? (seqWidth - j) * sizeof(real)
: blockMemSize);
}
}
} else { } else {
#ifdef PADDLE_USE_MKLML
#pragma omp parallel for
#endif
for (int i = 0; i < batchCount; ++i) {
memcpy(sequence.rowBuf(idxData[i]), memcpy(sequence.rowBuf(idxData[i]),
batch.rowBuf(i), batch.rowBuf(i),
seqWidth * sizeof(real)); seqWidth * sizeof(real));
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册