Optimize CRF Decoding with AVX/AVX2/AVX512F instruction (#12767)

* Optimize CRF decoding with AVX/AVX2 instruction * Enable the AVX2 flags for compiling * Clean the code and decrease the count of multiply calculation * Add the support of AVX512 instruction to optimize CRF Decoding * Clean the code * Enable the AVX512f flags for compiling * Clean the code for the invaluable switch * Fixed the issue to check AVX512F status * Clean the code * Add some explanation of the key points

Optimize CRF Decoding with AVX/AVX2/AVX512F instruction (#12767)
* Optimize CRF decoding with AVX/AVX2 instruction * Enable the AVX2 flags for compiling * Clean the code and decrease the count of multiply calculation * Add the support of AVX512 instruction to optimize CRF Decoding * Clean the code * Enable the AVX512f flags for compiling * Clean the code for the invaluable switch * Fixed the issue to check AVX512F status * Clean the code * Add some explanation of the key points
084d4a9e · Yihua Xu · Tao Luo · 00463fdf · 084d4a9e · 084d4a9e
隐藏空白更改
内联并排

Showing with 212 addition and 2 deletion

cmake/configure.cmake cmake/configure.cmake +5 -1

cmake/simd.cmake cmake/simd.cmake +13 -1

paddle/fluid/operators/crf_decoding_op.h paddle/fluid/operators/crf_decoding_op.h +194 -0

未找到文件。
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -50,7 +50,11 @@ if(NOT WITH_PROFILER)
 endif(NOT WITH_PROFILER)
 if(NOT CMAKE_CROSSCOMPILING)
-    if(WITH_AVX AND AVX_FOUND)
+    if(WITH_AVX AND AVX512F_FOUND)
+        set(SIMD_FLAG ${AVX512F_FLAG})
+    elseif(WITH_AVX AND AVX2_FOUND)
+        set(SIMD_FLAG ${AVX2_FLAG})
+    elseif(WITH_AVX AND AVX_FOUND)
        set(SIMD_FLAG ${AVX_FLAG})
    elseif(SSE3_FOUND)
        set(SIMD_FLAG ${SSE3_FLAG})

--- a/cmake/simd.cmake
+++ b/cmake/simd.cmake
@@ -10,6 +10,7 @@ if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID
    set(SSE3_FLAG "-msse3")
    set(AVX_FLAG "-mavx")
    set(AVX2_FLAG "-mavx2")
+    set(AVX512F_FLAG "-mavx512f")
 elseif(MSVC)
    set(MMX_FLAG "/arch:MMX")
    set(SSE2_FLAG "/arch:SSE2")
@@ -81,5 +82,16 @@ int main()
    return 0;
 }" AVX2_FOUND)
+# Check AVX512F
+set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
+set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+CHECK_CXX_SOURCE_RUNS("
+#include <immintrin.h>
+int main()
+{
+    __m512i a = _mm512_undefined_epi32();
+    return 0;
+}" AVX512F_FOUND)
 set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
-mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND)
+mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND)
--- a/paddle/fluid/operators/crf_decoding_op.h
+++ b/paddle/fluid/operators/crf_decoding_op.h
@@ -85,6 +85,199 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
    int* track_value =
        track.mutable_data<int>(emission_dims, platform::CPUPlace());
+#ifdef __AVX__
+// It use the AVX or AVX512 instruction to deal the data as the vector of 8 or
+// 16 elements per iteration. Then it can implement the parallel processing.
+// Only optimize for float type.
+#ifdef __AVX512F__
+    size_t step_size = 16;
+#else
+    size_t step_size = 8;
+#endif
+    if (std::is_same<T, float>::value && (tag_num >= step_size)) {
+      size_t steps = tag_num / step_size;
+      size_t remain = tag_num % step_size;
+      int last_offset = static_cast<int>(remain) - static_cast<int>(step_size);
+      // Setup the alpha initial value.
+      size_t i_offset = 0;
+      for (size_t i = 0; i <= steps; ++i) {
+#ifdef __AVX512F__
+        // Declare the variable for the content of weights, input and alpha
+        // values.
+        __m512 w_content, x_content, alpha_content;
+        // Load the relevant data into the variables from un-aligned address.
+        w_content = _mm512_loadu_ps((const float*)(w + i_offset));
+        x_content = _mm512_loadu_ps((const float*)(x + i_offset));
+        alpha_content = _mm512_add_ps(w_content, x_content);
+        // Save the alpha value.
+        _mm512_storeu_ps(reinterpret_cast<float*>(alpha_value + i_offset),
+                         alpha_content);
+#else
+        // Declare the variable for the content of weights, input and alpha
+        // values.
+        __m256 w_content, x_content, alpha_content;
+        // Load the relevant data into the variables from un-aligned address.
+        w_content = _mm256_loadu_ps((const float*)(w + i_offset));
+        x_content = _mm256_loadu_ps((const float*)(x + i_offset));
+        alpha_content = _mm256_add_ps(w_content, x_content);
+        // Save the alpha value.
+        _mm256_storeu_ps(reinterpret_cast<float*>(alpha_value + i_offset),
+                         alpha_content);
+#endif
+        i_offset += step_size;
+        if (i == steps - 1) {
+          if (remain > 0) {
+            i_offset += last_offset;
+          } else {
+            break;
+          }
+        }
+      }
+      // Use the column-major strategy to get the location of maximum score.
+      size_t seq_offset = 0;
+      for (size_t k = 1; k < seq_len; ++k) {
+        size_t j_offset = 0;
+        for (size_t j = 0; j <= steps; ++j) {
+#ifdef __AVX512F__
+          // Initialize the variables of maximum score and location.
+          __m512 max_score = _mm512_set1_ps(-std::numeric_limits<T>::max());
+          __m512i max_j = _mm512_setzero_si512();
+#else
+          // Initialize the variables of maximum score and location.
+          __m256 max_score = _mm256_set1_ps(-std::numeric_limits<T>::max());
+          __m256i max_j = _mm256_set1_epi32(0);
+#endif
+          // Calculate the offset of transition_weights.
+          size_t trans_offset = state_trans_base_idx * tag_num + j_offset;
+          for (size_t i = 0; i < tag_num; ++i) {
+#ifdef __AVX512F__
+            // Initalize the content of alpha variable with related offset.
+            __m512 alpha_content =
+                _mm512_set1_ps(*(const float*)(alpha_value + seq_offset + i));
+            // Obtain the content of weights from un-aligned address.
+            __m512 w_content =
+                _mm512_loadu_ps((const float*)(w + trans_offset));
+            __m512 score_v = _mm512_add_ps(alpha_content, w_content);
+            __mmask16 mask = _mm512_cmp_ps_mask(score_v, max_score, _CMP_GT_OS);
+            // According to the mask value, it update the index of the max_score
+            // location.
+            max_j = _mm512_mask_set1_epi32(max_j, mask, i);
+            // Update the max_score value.
+            max_score = _mm512_max_ps(max_score, score_v);
+#else
+            // Initalize the content of alpha variable with related offset.
+            __m256 alpha_content = _mm256_broadcast_ss(
+                (const float*)(alpha_value + seq_offset + i));
+            // Obtain the content of weights from un-aligned address.
+            __m256 w_content =
+                _mm256_loadu_ps((const float*)(w + trans_offset));
+            __m256 score_v = _mm256_add_ps(alpha_content, w_content);
+            __m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS);
+#ifdef __AVX2__
+            // According to the mask value, it update the index of the max_score
+            // location.
+            max_j = _mm256_or_si256(
+                _mm256_andnot_si256((__m256i)mask, max_j),
+                _mm256_and_si256((__m256i)mask, _mm256_set1_epi32(i)));
+#else
+            __m128i lo_max_j = _mm256_extractf128_si256(max_j, 0);
+            __m128i hi_max_j = _mm256_extractf128_si256(max_j, 1);
+            __m128i lo_mask = _mm256_extractf128_si256((__m256i)mask, 0);
+            __m128i hi_mask = _mm256_extractf128_si256((__m256i)mask, 1);
+            lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j);
+            hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j);
+            lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i));
+            hi_mask = _mm_and_si128(hi_mask, _mm_set1_epi32(i));
+            lo_max_j = _mm_or_si128(lo_mask, lo_max_j);
+            hi_max_j = _mm_or_si128(hi_mask, hi_max_j);
+            // According to the mask value, it update the index of the max_score
+            // location.
+            max_j = _mm256_insertf128_si256(max_j, lo_max_j, 0);
+            max_j = _mm256_insertf128_si256(max_j, hi_max_j, 1);
+#endif
+            // Update the max_score value.
+            max_score = _mm256_max_ps(max_score, score_v);
+#endif
+            trans_offset += tag_num;
+          }
+#ifdef __AVX512F__
+          // Update the alpha and track values.
+          __m512 x_content = _mm512_loadu_ps(
+              (const float*)(x + seq_offset + tag_num + j_offset));
+          max_score = _mm512_add_ps(max_score, x_content);
+          _mm512_storeu_ps(reinterpret_cast<float*>(alpha_value + seq_offset +
+                                                    tag_num + j_offset),
+                           max_score);
+          _mm512_storeu_si512(
+              reinterpret_cast<__m512i*>(track_value + seq_offset + tag_num +
+                                         j_offset),
+              max_j);
+#else
+          // Update the alpha and track values.
+          __m256 x_content = _mm256_loadu_ps(
+              (const float*)(x + seq_offset + tag_num + j_offset));
+          max_score = _mm256_add_ps(max_score, x_content);
+          _mm256_storeu_ps(reinterpret_cast<float*>(alpha_value + seq_offset +
+                                                    tag_num + j_offset),
+                           max_score);
+          _mm256_storeu_si256(
+              reinterpret_cast<__m256i*>(track_value + seq_offset + tag_num +
+                                         j_offset),
+              max_j);
+#endif
+          // Calculate the offset of next step
+          j_offset += step_size;
+          if (j == steps - 1) {
+            if (remain > 0) {
+              j_offset += last_offset;
+            } else {
+              break;
+            }
+          }
+        }
+        seq_offset += tag_num;
+      }
+    } else {
+      for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i];
+      for (size_t k = 1; k < seq_len; ++k) {
+        for (size_t i = 0; i < tag_num; ++i) {
+          T max_score = -std::numeric_limits<T>::max();
+          int max_j = 0;
+          for (size_t j = 0; j < tag_num; ++j) {
+            T score = alpha_value[(k - 1) * tag_num + j] +
+                      w[(j + state_trans_base_idx) * tag_num + i];
+            if (score > max_score) {
+              max_score = score;
+              max_j = j;
+            }
+          }
+          alpha_value[k * tag_num + i] = max_score + x[k * tag_num + i];
+          track_value[k * tag_num + i] = max_j;
+        }
+      }
+    }
+#else
    for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i];
    for (size_t k = 1; k < seq_len; ++k) {
@@ -105,6 +298,7 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
      }
    }
+#endif
    T max_score = -std::numeric_limits<T>::max();
    int max_i = 0;
    for (size_t i = 0; i < tag_num; ++i) {