use prefetch to load next mem into cache (#21206)

* use prefetch to load next mem into cache test=develop * remove hard code memcpy om pyramid_hash_ff test=develop

use prefetch to load next mem into cache (#21206)
* use prefetch to load next mem into cache test=develop * remove hard code memcpy om pyramid_hash_ff test=develop
b19e1a1b · Leo Zhao · Tao Luo · 691ced87 · b19e1a1b
隐藏空白更改
内联并排

Showing with 13 addition and 6 deletion

paddle/fluid/operators/pyramid_hash_op.cc paddle/fluid/operators/pyramid_hash_op.cc +13 -6

未找到文件。
--- a/paddle/fluid/operators/pyramid_hash_op.cc
+++ b/paddle/fluid/operators/pyramid_hash_op.cc
@@ -161,14 +161,21 @@ class CPUPyramidHashOPKernel : public framework::OpKernel<T> {
  void hash_embedding_ff(const T* hash_id, int len, T* top_pos,
                         const T* weights, int _num_emb, int _rand_len,
                         int _space_len) const {
+    unsigned int pos1 = XXH32(hash_id, len * sizeof(T), 0) % _space_len;
+    unsigned int pos2 = XXH32(hash_id, len * sizeof(T), _rand_len) % _space_len;
+
    for (unsigned int j = 0; j != _num_emb; j += _rand_len) {
-      unsigned int pos = XXH32(hash_id, len * sizeof(T), j) % _space_len;
-      if (_rand_len == 16) {
-        memcpy(top_pos + j, const_cast<float*>(weights + pos), 16 * sizeof(T));
-      } else {
-        memcpy(top_pos + j, const_cast<float*>(weights + pos),
-               _rand_len * sizeof(T));
+      if (j + _rand_len < _num_emb) {
+        __builtin_prefetch(weights + pos2);
+        __builtin_prefetch(top_pos + j + _rand_len);
      }
+
+      unsigned int pos3 =
+          XXH32(hash_id, len * sizeof(T), j + 2 * _rand_len) % _space_len;
+      memcpy(top_pos + j, const_cast<float*>(weights + pos1),
+             _rand_len * sizeof(T));
+      pos1 = pos2;
+      pos2 = pos3;
    }
  }