From ea7d4be3f81c6b7a2523e04f4c1f124b97497740 Mon Sep 17 00:00:00 2001
From: Andrew Ryrie
Date: Mon, 29 Nov 2021 21:43:00 +0000
Subject: [PATCH] Merge pull request #20658 from smbz:lstm_optimisation

* dnn: LSTM optimisation

This uses the AVX-optimised fastGEMM1T for matrix multiplications where
available, instead of the standard cv::gemm.

fastGEMM1T is already used by the fully-connected layer.  This commit
involves two minor modifications:
 - Use unaligned access.  I don't believe this involves any performance
   hit on modern CPUs (Nehalem and Bulldozer onwards) in the case where
   the address is actually aligned.
 - Allow for weight matrices where the number of columns is not a
   multiple of 8.

I have not enabled AVX-512 as I don't have an AVX-512 CPU to test on.

* Fix warning about initialisation order

* Remove C++11 syntax

* Fix build when AVX(2) is not available

In this case the CV_TRY_X macros are defined to 0, rather than being
undefined.

* Minor changes as requested:

 - Don't check hardware support for AVX(2) when dispatch is disabled
   for these
 - Add braces

* Fix out-of-bounds access in fully connected layer

The old tail handling in fastGEMM1T implicitly rounded vecsize up to
the next multiple of 8, and the fully connected layer implements
padding up to the next multiple of 8 to cope with this.  The new tail
handling does not round the vecsize upwards like this, but it does
require that the vecsize is at least 8.  To adapt to the new tail
handling, the fully connected layer now rounds vecsize itself at the
same time as adding the padding (which makes more sense anyway).  This
also means that the fully connected layer always passes a vecsize of
at least 8 to fastGEMM1T, which fixes the out-of-bounds access
problems.

* Improve tail mask handling

 - Use static array for generating tail masks (as requested)
 - Apply tail mask to the weights as well as the input vectors to
   prevent spurious propagation of NaNs/Infs

* Revert whitespace change

* Improve readability of conditions for using AVX

* dnn(lstm): minor coding style changes, replaced left aligned load
---
 modules/dnn/perf/perf_recurrent.cpp           |  90 +++++++++++++++
 .../dnn/src/layers/fully_connected_layer.cpp  |   6 +-
 modules/dnn/src/layers/layers_common.simd.hpp |  67 +++++++++---
 modules/dnn/src/layers/recurrent_layers.cpp   | 103 +++++++++++++++++-
 4 files changed, 246 insertions(+), 20 deletions(-)
 create mode 100644 modules/dnn/perf/perf_recurrent.cpp

diff --git a/modules/dnn/perf/perf_recurrent.cpp b/modules/dnn/perf/perf_recurrent.cpp
new file mode 100644
index 0000000000..fe2a51886c
--- /dev/null
+++ b/modules/dnn/perf/perf_recurrent.cpp
@@ -0,0 +1,90 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "perf_precomp.hpp"
+
+namespace opencv_test {
+
+struct LstmParams {
+    // Batch size
+    int nrSamples;
+
+    // Size of the input vector
+    int inputSize;
+
+    // Size of the internal state vector
+    int hiddenSize;
+
+    // Number of timesteps for the LSTM
+    int nrSteps;
+};
+
+static inline void PrintTo(const LstmParams& params, ::std::ostream* os) {
+    (*os) << "BATCH=" << params.nrSamples
+          << ", IN=" << params.inputSize
+          << ", HIDDEN=" << params.hiddenSize
+          << ", TS=" << params.nrSteps;
+}
+
+static const LstmParams testLstmConfigs[] = {
+    {1, 192, 192, 100},
+    {1, 1024, 192, 100},
+    {1, 64, 192, 100},
+    {1, 192, 512, 100},
+    {64, 192, 192, 2},
+    {64, 1024, 192, 2},
+    {64, 64, 192, 2},
+    {64, 192, 512, 2},
+    {128, 192, 192, 2},
+    {128, 1024, 192, 2},
+    {128, 64, 192, 2},
+    {128, 192, 512, 2}
+};
+
+class Layer_LSTM : public TestBaseWithParam<LstmParams> {};
+
+PERF_TEST_P_(Layer_LSTM, lstm) {
+    const LstmParams& params = GetParam();
+    LayerParams lp;
+    lp.type = "LSTM";
+    lp.name = "testLstm";
+    lp.set("produce_cell_output", false);
+    lp.set("use_timestamp_dim", true);
+
+    Mat weightH(params.hiddenSize * 4, params.hiddenSize, CV_32FC1, cv::Scalar(0));
+    Mat weightX(params.hiddenSize * 4, params.inputSize, CV_32FC1, cv::Scalar(0));
+    Mat bias(params.hiddenSize * 4, 1, CV_32FC1, cv::Scalar(0));
+    Mat hInternal(params.nrSteps, params.hiddenSize, CV_32FC1, cv::Scalar(0));
+    Mat cInternal(params.nrSteps, params.hiddenSize, CV_32FC1, cv::Scalar(0));
+    lp.blobs.push_back(weightH);
+    lp.blobs.push_back(weightX);
+    lp.blobs.push_back(bias);
+    lp.blobs.push_back(hInternal);
+    lp.blobs.push_back(cInternal);
+
+    std::vector<int> inputDims;
+    inputDims.push_back(params.nrSamples);
+    inputDims.push_back(params.nrSteps);
+    inputDims.push_back(params.inputSize);
+    Mat input(inputDims.size(), inputDims.data(), CV_32FC1);
+    input = cv::Scalar(0);
+
+    Net net;
+    net.addLayerToPrev(lp.name, lp.type, lp);
+    net.setInput(input);
+
+    // Warm up
+    std::vector<Mat> outputs(2);
+    net.forward(outputs, "testLstm");
+
+    TEST_CYCLE()
+    {
+        net.forward(outputs, "testLstm");
+    }
+    SANITY_CHECK_NOTHING();
+}
+
+INSTANTIATE_TEST_CASE_P(/**/, Layer_LSTM, testing::ValuesIn(testLstmConfigs));
+
+} // namespace
diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp
index e25ca5a68f..5acce939f1 100644
--- a/modules/dnn/src/layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/layers/fully_connected_layer.cpp
@@ -222,17 +222,17 @@ public:
         #if CV_TRY_AVX512_SKX
             if( useAVX512 )
-                opt_AVX512_SKX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
+                opt_AVX512_SKX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize_aligned);
             else
         #endif
         #if CV_TRY_AVX2
             if( useAVX2 )
-                opt_AVX2::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
+                opt_AVX2::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize_aligned);
             else
         #endif
         #if CV_TRY_AVX
             if( useAVX )
-                opt_AVX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
+                opt_AVX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize_aligned);
             else
         #endif
             {
diff --git a/modules/dnn/src/layers/layers_common.simd.hpp b/modules/dnn/src/layers/layers_common.simd.hpp
index 706695a7b2..accc644676 100644
--- a/modules/dnn/src/layers/layers_common.simd.hpp
+++ b/modules/dnn/src/layers/layers_common.simd.hpp
@@ -550,13 +550,24 @@ void fastDepthwiseConv( const float* wptr,
     _mm256_zeroupper();
 }
 
+// Used to generate the mask used when calculating tails
+static const uint32_t tailMaskArray[15] = {
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0xffffffffUL, 0xffffffffUL, 0xffffffffUL, 0xffffffffUL, 0xffffffffUL, 0xffffffffUL, 0xffffffffUL
+};
+
 // dst = vec * weights^t + bias
+// Requires that vecsize is at least 8 or equal to 0 to avoid memory access problems. Does not require alignment.
 void fastGEMM1T( const float* vec, const float* weights,
                  size_t wstep, const float* bias,
                  float* dst, int nvecs, int vecsize )
 {
     int i = 0;
 
+    CV_Assert(vecsize >= 8 || vecsize == 0);
+
+    __m256 tailMask = _mm256_loadu_ps(reinterpret_cast<const float*>(tailMaskArray) + (vecsize % 8));
+
     for( ; i <= nvecs - 8; i += 8 )
     {
         const float* wptr = weights + i*wstep;
@@ -565,18 +576,36 @@ void fastGEMM1T( const float* vec, const float* weights,
             vs4 = _mm256_setzero_ps(), vs5 = _mm256_setzero_ps(),
             vs6 = _mm256_setzero_ps(), vs7 = _mm256_setzero_ps();
 
-        for( int k = 0; k < vecsize; k += 8, wptr += 8 )
+        int k = 0;
+        for( ; k <= vecsize-8; k += 8, wptr += 8 )
         {
-            __m256 v = _mm256_load_ps(vec + k);
-
-            vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);
-            vs1 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep), v, vs1);
-            vs2 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*2), v, vs2);
-            vs3 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*3), v, vs3);
-            vs4 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*4), v, vs4);
-            vs5 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*5), v, vs5);
-            vs6 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*6), v, vs6);
-            vs7 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*7), v, vs7);
+            __m256 v = _mm256_loadu_ps(vec + k);
+
+            vs0 = _mm256_fmadd_ps(_mm256_loadu_ps(wptr), v, vs0);
+            vs1 = _mm256_fmadd_ps(_mm256_loadu_ps(wptr + wstep), v, vs1);
+            vs2 = _mm256_fmadd_ps(_mm256_loadu_ps(wptr + wstep*2), v, vs2);
+            vs3 = _mm256_fmadd_ps(_mm256_loadu_ps(wptr + wstep*3), v, vs3);
+            vs4 = _mm256_fmadd_ps(_mm256_loadu_ps(wptr + wstep*4), v, vs4);
+            vs5 = _mm256_fmadd_ps(_mm256_loadu_ps(wptr + wstep*5), v, vs5);
+            vs6 = _mm256_fmadd_ps(_mm256_loadu_ps(wptr + wstep*6), v, vs6);
+            vs7 = _mm256_fmadd_ps(_mm256_loadu_ps(wptr + wstep*7), v, vs7);
+        }
+
+        if (k != vecsize) {
+            // Tail
+            k = vecsize - 8;
+            wptr = weights + i * wstep + k;
+            __m256 v = _mm256_loadu_ps(vec + k);
+            v = _mm256_and_ps(v, tailMask);
+
+            vs0 = _mm256_fmadd_ps(_mm256_and_ps(_mm256_loadu_ps(wptr), tailMask), v, vs0);
+            vs1 = _mm256_fmadd_ps(_mm256_and_ps(_mm256_loadu_ps(wptr + wstep), tailMask), v, vs1);
+            vs2 = _mm256_fmadd_ps(_mm256_and_ps(_mm256_loadu_ps(wptr + wstep * 2), tailMask), v, vs2);
+            vs3 = _mm256_fmadd_ps(_mm256_and_ps(_mm256_loadu_ps(wptr + wstep * 3), tailMask), v, vs3);
+            vs4 = _mm256_fmadd_ps(_mm256_and_ps(_mm256_loadu_ps(wptr + wstep * 4), tailMask), v, vs4);
+            vs5 = _mm256_fmadd_ps(_mm256_and_ps(_mm256_loadu_ps(wptr + wstep * 5), tailMask), v, vs5);
+            vs6 = _mm256_fmadd_ps(_mm256_and_ps(_mm256_loadu_ps(wptr + wstep * 6), tailMask), v, vs6);
+            vs7 = _mm256_fmadd_ps(_mm256_and_ps(_mm256_loadu_ps(wptr + wstep * 7), tailMask), v, vs7);
         }
 
         __m256 s0 = _mm256_hadd_ps(_mm256_hadd_ps(vs0, vs1), _mm256_hadd_ps(vs2, vs3));
@@ -598,10 +627,20 @@ void fastGEMM1T( const float* vec, const float* weights,
         const float* wptr = weights + i*wstep;
         __m256 vs0 = _mm256_setzero_ps();
 
-        for( int k = 0; k < vecsize; k += 8, wptr += 8 )
+        int k = 0;
+        for( ; k <= vecsize-8; k += 8, wptr += 8 )
         {
-            __m256 v = _mm256_load_ps(vec + k);
-            vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);
+            __m256 v = _mm256_loadu_ps(vec + k);
+            vs0 = _mm256_fmadd_ps(_mm256_loadu_ps(wptr), v, vs0);
+        }
+
+        if (k != vecsize) {
+            // Tail
+            k = vecsize - 8;
+            wptr = weights + i * wstep + k;
+            __m256 v = _mm256_loadu_ps(vec + k);
+            v = _mm256_and_ps(v, tailMask);
+            vs0 = _mm256_fmadd_ps(_mm256_and_ps(_mm256_loadu_ps(wptr), tailMask), v, vs0);
         }
 
         __m256 s0 = _mm256_hadd_ps(_mm256_hadd_ps(vs0, vs0), vs0);
diff --git a/modules/dnn/src/layers/recurrent_layers.cpp b/modules/dnn/src/layers/recurrent_layers.cpp
index 9088c13390..21dafa142d 100644
--- a/modules/dnn/src/layers/recurrent_layers.cpp
+++ b/modules/dnn/src/layers/recurrent_layers.cpp
@@ -46,6 +46,8 @@
 #include 
 #include 
 
+#include "layers_common.hpp"
+
 namespace cv
 {
 namespace dnn
@@ -118,10 +120,23 @@ class LSTMLayerImpl CV_FINAL : public LSTMLayer
     ActivationFunction g_activation;
     ActivationFunction h_activation;
 
+#if CV_TRY_AVX
+    bool useAVX;
+#endif
+#if CV_TRY_AVX2
+    bool useAVX2;
+#endif
+
 public:
 
     LSTMLayerImpl(const LayerParams& params)
         : numTimeStamps(0), numSamples(0)
+#if CV_TRY_AVX
+        , useAVX(checkHardwareSupport(CPU_AVX))
+#endif
+#if CV_TRY_AVX2
+        , useAVX2(checkHardwareSupport(CPU_AVX2))
+#endif
     {
         setParamsFrom(params);
@@ -343,6 +358,15 @@ public:
             hOutTs = hOutTs.colRange(i * hOutTs.cols / numDirs, (i + 1) * hOutTs.cols / numDirs);
         Mat cOutTs = produceCellOutput ? output[1].reshape(1, numSamplesTotal) : Mat();
 
+#if CV_TRY_AVX2 || CV_TRY_AVX
+        bool canUseAvx = gates.isContinuous() && bias.isContinuous()
+                         && Wx.depth() == CV_32F && gates.depth() == CV_32F
+                         && bias.depth() == CV_32F && Wx.cols >= 8;
+        bool canUseAvx_hInternal = hInternal.isContinuous() && gates.isContinuous() && bias.isContinuous()
+                                   && Wh.depth() == CV_32F && hInternal.depth() == CV_32F && gates.depth() == CV_32F
+                                   && Wh.cols >= 8;
+#endif
+
         int tsStart, tsEnd, tsInc;
         if (reverse || i == 1) {
             tsStart = numTimeStamps - 1;
@@ -359,9 +383,82 @@ public:
             Range curRowRange(ts*numSamples, (ts + 1)*numSamples);
             Mat xCurr = xTs.rowRange(curRowRange);
 
-            gemm(xCurr, Wx, 1, gates, 0, gates, GEMM_2_T);      // Wx * x_t
-            gemm(hInternal, Wh, 1, gates, 1, gates, GEMM_2_T);  //+Wh * h_{t-1}
-            gemm(dummyOnes, bias, 1, gates, 1, gates);          //+b
+#if CV_TRY_AVX2
+            if (useAVX2 && canUseAvx && xCurr.isContinuous())
+            {
+                for (int n = 0; n < xCurr.rows; n++) {
+                    opt_AVX2::fastGEMM1T(
+                        xCurr.ptr<float>(n),
+                        Wx.ptr<float>(),
+                        Wx.step1(),
+                        bias.ptr<float>(),
+                        gates.ptr<float>(n),
+                        Wx.rows,
+                        Wx.cols
+                    );
+                }
+            }
+            else
+#endif
+#if CV_TRY_AVX
+            if (useAVX && canUseAvx && xCurr.isContinuous())
+            {
+                for (int n = 0; n < xCurr.rows; n++) {
+                    opt_AVX::fastGEMM1T(
+                        xCurr.ptr<float>(n),
+                        Wx.ptr<float>(),
+                        Wx.step1(),
+                        bias.ptr<float>(),
+                        gates.ptr<float>(n),
+                        Wx.rows,
+                        Wx.cols
+                    );
+                }
+            }
+            else
+#endif
+            {
+                gemm(xCurr, Wx, 1, gates, 0, gates, GEMM_2_T);  // Wx * x_t
+                gemm(dummyOnes, bias, 1, gates, 1, gates);      //+b
+            }
+
+#if CV_TRY_AVX2
+            if (useAVX2 && canUseAvx_hInternal)
+            {
+                for (int n = 0; n < hInternal.rows; n++) {
+                    opt_AVX2::fastGEMM1T(
+                        hInternal.ptr<float>(n),
+                        Wh.ptr<float>(),
+                        Wh.step1(),
+                        gates.ptr<float>(n),
+                        gates.ptr<float>(n),
+                        Wh.rows,
+                        Wh.cols
+                    );
+                }
+            }
+            else
+#endif
+#if CV_TRY_AVX
+            if (useAVX && canUseAvx_hInternal)
+            {
+                for (int n = 0; n < hInternal.rows; n++) {
+                    opt_AVX::fastGEMM1T(
+                        hInternal.ptr<float>(n),
+                        Wh.ptr<float>(),
+                        Wh.step1(),
+                        gates.ptr<float>(n),
+                        gates.ptr<float>(n),
+                        Wh.rows,
+                        Wh.cols
+                    );
+                }
+            }
+            else
+#endif
+            {
+                gemm(hInternal, Wh, 1, gates, 1, gates, GEMM_2_T);  //+Wh * h_{t-1}
+            }
 
             Mat gateI = gates.colRange(0*numOut, 1*numOut);
             Mat gateF = gates.colRange(1*numOut, 2*numOut);
-- 
GitLab
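Editor's note (not part of the patch): the tail-mask technique described in the commit message can be illustrated outside of OpenCV with a small standalone sketch. The idea is to reload the last 8 floats of a row whose length is not a multiple of 8 and zero out the lanes already covered by the main 8-wide loop, masking both operands so that stray NaNs/Infs in the overlap cannot propagate. The names below (dotTail, main) and the compile flags (-mavx2 -mfma) are assumptions made for this example and assume an AVX2+FMA capable CPU; this is a minimal sketch of the same trick applied to a plain dot product, not the code added by the patch.

// Minimal sketch of masked tail handling for an AVX2 dot product.
// Build with, e.g.: g++ -O2 -mavx2 -mfma tail_mask_demo.cpp
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

// 8 zero lanes followed by 7 all-ones lanes; an offset of (n % 8) selects a mask
// that keeps only the lanes the main 8-wide loop has not processed yet.
static const uint32_t tailMask[15] = {
    0, 0, 0, 0, 0, 0, 0, 0,
    0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu
};

// Dot product of two float vectors of length n (requires n >= 8), using unaligned
// loads and a masked reload of the last 8 elements for the tail.
static float dotTail(const float* a, const float* b, int n)
{
    __m256 acc = _mm256_setzero_ps();
    int k = 0;
    for (; k <= n - 8; k += 8)
        acc = _mm256_fmadd_ps(_mm256_loadu_ps(a + k), _mm256_loadu_ps(b + k), acc);

    if (k != n)
    {
        // Reload the last 8 elements and zero the lanes already processed,
        // masking both inputs so garbage in the overlap cannot propagate.
        __m256 mask = _mm256_loadu_ps(reinterpret_cast<const float*>(tailMask) + (n % 8));
        __m256 va = _mm256_and_ps(_mm256_loadu_ps(a + n - 8), mask);
        __m256 vb = _mm256_and_ps(_mm256_loadu_ps(b + n - 8), mask);
        acc = _mm256_fmadd_ps(va, vb, acc);
    }

    // Horizontal sum of the accumulator.
    float buf[8];
    _mm256_storeu_ps(buf, acc);
    float s = 0.f;
    for (int j = 0; j < 8; j++) s += buf[j];
    return s;
}

int main()
{
    float a[11], b[11];
    for (int i = 0; i < 11; i++) { a[i] = 1.f; b[i] = float(i); }
    printf("%f\n", dotTail(a, b, 11));  // expected: 0 + 1 + ... + 10 = 55
    return 0;
}

Masking both operands (not just the input vector) matters because 0 * NaN is still NaN: if only one side were zeroed, a NaN or Inf sitting in the already-processed lanes of the other side would leak into the accumulator, which is exactly the spurious propagation the commit message mentions.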