diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 6d8e68df9b39539d389df2cf2cba23dee038021c..0a8d891b081deaa1142a63f952667bfb516c5215 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -50,7 +50,11 @@ if(NOT WITH_PROFILER) endif(NOT WITH_PROFILER) if(NOT CMAKE_CROSSCOMPILING) - if(WITH_AVX AND AVX_FOUND) + if(WITH_AVX AND AVX512F_FOUND) + set(SIMD_FLAG ${AVX512F_FLAG}) + elseif(WITH_AVX AND AVX2_FOUND) + set(SIMD_FLAG ${AVX2_FLAG}) + elseif(WITH_AVX AND AVX_FOUND) set(SIMD_FLAG ${AVX_FLAG}) elseif(SSE3_FOUND) set(SIMD_FLAG ${SSE3_FLAG}) diff --git a/cmake/simd.cmake b/cmake/simd.cmake index 53c2de332ea74b06d1bd6e5bb119cad6af27ed01..3eacf4d86aa0385eddb690d72e85e3384929bb99 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -10,6 +10,7 @@ if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID set(SSE3_FLAG "-msse3") set(AVX_FLAG "-mavx") set(AVX2_FLAG "-mavx2") + set(AVX512F_FLAG "-mavx512f") elseif(MSVC) set(MMX_FLAG "/arch:MMX") set(SSE2_FLAG "/arch:SSE2") @@ -81,5 +82,16 @@ int main() return 0; }" AVX2_FOUND) +# Check AVX512F +set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) +set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + __m512i a = _mm512_undefined_epi32(); + return 0; +}" AVX512F_FOUND) + set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) -mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND) +mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND) diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h index 3f5fab3b382bea97f43e4bc1b2cd436c956ba264..8181897c3d3844bda5574e85a08b2af038fcd664 100644 --- a/paddle/fluid/operators/crf_decoding_op.h +++ b/paddle/fluid/operators/crf_decoding_op.h @@ -85,6 +85,199 @@ class CRFDecodingOpKernel : public framework::OpKernel { int* track_value = track.mutable_data(emission_dims, platform::CPUPlace()); +#ifdef __AVX__ +// It use the AVX or AVX512 instruction to deal the data as the vector of 8 or +// 16 elements per iteration. Then it can implement the parallel processing. +// Only optimize for float type. +#ifdef __AVX512F__ + size_t step_size = 16; +#else + size_t step_size = 8; +#endif + if (std::is_same::value && (tag_num >= step_size)) { + size_t steps = tag_num / step_size; + size_t remain = tag_num % step_size; + int last_offset = static_cast(remain) - static_cast(step_size); + + // Setup the alpha initial value. + size_t i_offset = 0; + for (size_t i = 0; i <= steps; ++i) { +#ifdef __AVX512F__ + // Declare the variable for the content of weights, input and alpha + // values. + __m512 w_content, x_content, alpha_content; + + // Load the relevant data into the variables from un-aligned address. + w_content = _mm512_loadu_ps((const float*)(w + i_offset)); + x_content = _mm512_loadu_ps((const float*)(x + i_offset)); + alpha_content = _mm512_add_ps(w_content, x_content); + + // Save the alpha value. + _mm512_storeu_ps(reinterpret_cast(alpha_value + i_offset), + alpha_content); +#else + // Declare the variable for the content of weights, input and alpha + // values. + __m256 w_content, x_content, alpha_content; + + // Load the relevant data into the variables from un-aligned address. + w_content = _mm256_loadu_ps((const float*)(w + i_offset)); + x_content = _mm256_loadu_ps((const float*)(x + i_offset)); + alpha_content = _mm256_add_ps(w_content, x_content); + + // Save the alpha value. + _mm256_storeu_ps(reinterpret_cast(alpha_value + i_offset), + alpha_content); +#endif + i_offset += step_size; + if (i == steps - 1) { + if (remain > 0) { + i_offset += last_offset; + } else { + break; + } + } + } + + // Use the column-major strategy to get the location of maximum score. + size_t seq_offset = 0; + for (size_t k = 1; k < seq_len; ++k) { + size_t j_offset = 0; + for (size_t j = 0; j <= steps; ++j) { +#ifdef __AVX512F__ + // Initialize the variables of maximum score and location. + __m512 max_score = _mm512_set1_ps(-std::numeric_limits::max()); + __m512i max_j = _mm512_setzero_si512(); +#else + // Initialize the variables of maximum score and location. + __m256 max_score = _mm256_set1_ps(-std::numeric_limits::max()); + __m256i max_j = _mm256_set1_epi32(0); +#endif + // Calculate the offset of transition_weights. + size_t trans_offset = state_trans_base_idx * tag_num + j_offset; + for (size_t i = 0; i < tag_num; ++i) { +#ifdef __AVX512F__ + // Initalize the content of alpha variable with related offset. + __m512 alpha_content = + _mm512_set1_ps(*(const float*)(alpha_value + seq_offset + i)); + // Obtain the content of weights from un-aligned address. + __m512 w_content = + _mm512_loadu_ps((const float*)(w + trans_offset)); + + __m512 score_v = _mm512_add_ps(alpha_content, w_content); + + __mmask16 mask = _mm512_cmp_ps_mask(score_v, max_score, _CMP_GT_OS); + + // According to the mask value, it update the index of the max_score + // location. + max_j = _mm512_mask_set1_epi32(max_j, mask, i); + + // Update the max_score value. + max_score = _mm512_max_ps(max_score, score_v); +#else + // Initalize the content of alpha variable with related offset. + __m256 alpha_content = _mm256_broadcast_ss( + (const float*)(alpha_value + seq_offset + i)); + // Obtain the content of weights from un-aligned address. + __m256 w_content = + _mm256_loadu_ps((const float*)(w + trans_offset)); + __m256 score_v = _mm256_add_ps(alpha_content, w_content); + + __m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS); + +#ifdef __AVX2__ + // According to the mask value, it update the index of the max_score + // location. + max_j = _mm256_or_si256( + _mm256_andnot_si256((__m256i)mask, max_j), + _mm256_and_si256((__m256i)mask, _mm256_set1_epi32(i))); +#else + __m128i lo_max_j = _mm256_extractf128_si256(max_j, 0); + __m128i hi_max_j = _mm256_extractf128_si256(max_j, 1); + __m128i lo_mask = _mm256_extractf128_si256((__m256i)mask, 0); + __m128i hi_mask = _mm256_extractf128_si256((__m256i)mask, 1); + + lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j); + hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j); + lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i)); + hi_mask = _mm_and_si128(hi_mask, _mm_set1_epi32(i)); + + lo_max_j = _mm_or_si128(lo_mask, lo_max_j); + hi_max_j = _mm_or_si128(hi_mask, hi_max_j); + + // According to the mask value, it update the index of the max_score + // location. + max_j = _mm256_insertf128_si256(max_j, lo_max_j, 0); + max_j = _mm256_insertf128_si256(max_j, hi_max_j, 1); +#endif + + // Update the max_score value. + max_score = _mm256_max_ps(max_score, score_v); +#endif + trans_offset += tag_num; + } + +#ifdef __AVX512F__ + // Update the alpha and track values. + __m512 x_content = _mm512_loadu_ps( + (const float*)(x + seq_offset + tag_num + j_offset)); + max_score = _mm512_add_ps(max_score, x_content); + _mm512_storeu_ps(reinterpret_cast(alpha_value + seq_offset + + tag_num + j_offset), + max_score); + _mm512_storeu_si512( + reinterpret_cast<__m512i*>(track_value + seq_offset + tag_num + + j_offset), + max_j); +#else + // Update the alpha and track values. + __m256 x_content = _mm256_loadu_ps( + (const float*)(x + seq_offset + tag_num + j_offset)); + max_score = _mm256_add_ps(max_score, x_content); + _mm256_storeu_ps(reinterpret_cast(alpha_value + seq_offset + + tag_num + j_offset), + max_score); + _mm256_storeu_si256( + reinterpret_cast<__m256i*>(track_value + seq_offset + tag_num + + j_offset), + max_j); +#endif + + // Calculate the offset of next step + j_offset += step_size; + if (j == steps - 1) { + if (remain > 0) { + j_offset += last_offset; + } else { + break; + } + } + } + + seq_offset += tag_num; + } + } else { + for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i]; + + for (size_t k = 1; k < seq_len; ++k) { + for (size_t i = 0; i < tag_num; ++i) { + T max_score = -std::numeric_limits::max(); + int max_j = 0; + for (size_t j = 0; j < tag_num; ++j) { + T score = alpha_value[(k - 1) * tag_num + j] + + w[(j + state_trans_base_idx) * tag_num + i]; + if (score > max_score) { + max_score = score; + max_j = j; + } + } + + alpha_value[k * tag_num + i] = max_score + x[k * tag_num + i]; + track_value[k * tag_num + i] = max_j; + } + } + } +#else for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i]; for (size_t k = 1; k < seq_len; ++k) { @@ -105,6 +298,7 @@ class CRFDecodingOpKernel : public framework::OpKernel { } } +#endif T max_score = -std::numeric_limits::max(); int max_i = 0; for (size_t i = 0; i < tag_num; ++i) {