bugfix of FMA port to FAST_FLOAT

Eight float values fit in a single 256-bit vector (8 × 32 bits), in contrast to four double values (4 × 64 bits). [sw] Format commit message and use float instead of TFloat.

bugfix of FMA port to FAST_FLOAT
Eight float values fit in a single 256-bit vector (8 × 32 bits), in contrast to four double values (4 × 64 bits). [sw] Format commit message and use float instead of TFloat.
24a29b79 · Ger Hobbelt · Stefan Weil · 472f5d90 · 24a29b79
Hide whitespace changes
Inline / Side-by-side

Showing 1 changed file with 8 additions and 8 deletions

src/arch/dotproductfma.cpp src/arch/dotproductfma.cpp +8 -8

No files found.
--- a/src/arch/dotproductfma.cpp
+++ b/src/arch/dotproductfma.cpp
@@ -31,26 +31,26 @@ namespace tesseract {
 // Uses Intel FMA intrinsics to access the SIMD instruction set.
 #if defined(FAST_FLOAT)
 float DotProductFMA(const float *u, const float *v, int n) {
-  const unsigned quot = n / 8;
-  const unsigned rem = n % 8;
+  const unsigned quot = n / 16;
+  const unsigned rem = n % 16;
  __m256 t0 = _mm256_setzero_ps();
  __m256 t1 = _mm256_setzero_ps();
  for (unsigned k = 0; k < quot; k++) {
    __m256 f0 = _mm256_loadu_ps(u);
    __m256 f1 = _mm256_loadu_ps(v);
    t0 = _mm256_fmadd_ps(f0, f1, t0);
-    u += 4;
-    v += 4;
+    u += 8;
+    v += 8;
    __m256 f2 = _mm256_loadu_ps(u);
    __m256 f3 = _mm256_loadu_ps(v);
    t1 = _mm256_fmadd_ps(f2, f3, t1);
-    u += 4;
-    v += 4;
+    u += 8;
+    v += 8;
  }
  t0 = _mm256_hadd_ps(t0, t1);
-  alignas(32) float tmp[4];
+  alignas(32) float tmp[8];
  _mm256_store_ps(tmp, t0);
-  float result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
+  float result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
  for (unsigned k = 0; k < rem; k++) {
    result += *u++ * *v++;
  }