未验证 提交 5ac72d95 编写于 作者: Tao Luo 提交者: GitHub

Merge pull request #6622 from tensor-tang/omp

Use Intel OpenMP to speed up seq2batch when WITH_MKL
...@@ -171,12 +171,31 @@ void SequenceToBatch::sequence2BatchCopy(Matrix &batch, ...@@ -171,12 +171,31 @@ void SequenceToBatch::sequence2BatchCopy(Matrix &batch,
hl_sequence2batch_copy( hl_sequence2batch_copy(
batchData, seqData, idxData, seqWidth, batchCount, seq2batch); batchData, seqData, idxData, seqWidth, batchCount, seq2batch);
} else { } else {
for (int i = 0; i < batchCount; ++i) {
if (seq2batch) { if (seq2batch) {
#ifdef PADDLE_USE_MKLML
const int blockMemSize = 8 * 1024;
const int blockSize = blockMemSize / sizeof(real);
#pragma omp parallel for collapse(2)
for (int i = 0; i < batchCount; ++i) {
for (int j = 0; j < seqWidth; j += blockSize) {
memcpy(batch.rowBuf(i) + j,
sequence.rowBuf(idxData[i]) + j,
(j + blockSize > seqWidth) ? (seqWidth - j) * sizeof(real)
: blockMemSize);
}
}
#else
for (int i = 0; i < batchCount; ++i) {
memcpy(batch.rowBuf(i), memcpy(batch.rowBuf(i),
sequence.rowBuf(idxData[i]), sequence.rowBuf(idxData[i]),
seqWidth * sizeof(real)); seqWidth * sizeof(real));
}
#endif
} else { } else {
#ifdef PADDLE_USE_MKLML
#pragma omp parallel for
#endif
for (int i = 0; i < batchCount; ++i) {
memcpy(sequence.rowBuf(idxData[i]), memcpy(sequence.rowBuf(idxData[i]),
batch.rowBuf(i), batch.rowBuf(i),
seqWidth * sizeof(real)); seqWidth * sizeof(real));
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册