diff --git a/paddle/fluid/operators/jit/README.md b/paddle/fluid/operators/jit/README.md
index ce31f18b63c9aa6b3a9134f184890abdf0a3beb1..1264bc96ee6e79721a285ab0325f08f3c9654f59 100644
--- a/paddle/fluid/operators/jit/README.md
+++ b/paddle/fluid/operators/jit/README.md
@@ -19,6 +19,10 @@ PaddlePaddle/Paddle/paddle/fluid/
 │   ├── ...
 │   ├── mkl/
 │   │   └── ...
+│   ├── mkldnn/
+│   │   └── ...
+│   ├── intrinsic/
+│   │   └── ...
 │   └── openblas/
 │       └── ...
 └── refer/
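The README tree above distinguishes gen/, more/, and refer/ implementations of each kernel; at lookup time a "more" kernel such as the intrinsic CRF decoder is preferred over the reference one when its UseMe() predicate accepts the attribute (here, the tag number). The following is a stripped-down conceptual sketch of that dispatch pattern, with hypothetical names; Paddle's real KernelImpl, KernelTuples, and registry live elsewhere in the jit directory and differ in detail.

#include <vector>

// Hypothetical, simplified version of the dispatch pattern: a kernel carries
// a function pointer plus a UseMe() predicate, and lookup walks the
// registered candidates, falling back to the reference implementation.
using CRFDecodingFunc = void (*)(const int seq_len, const float* x,
                                 const float* w, float* alpha, int* track,
                                 int tag_num);

struct KernelBase {
  CRFDecodingFunc func = nullptr;
  virtual bool UseMe(int attr) const = 0;
  virtual ~KernelBase() = default;
};

CRFDecodingFunc Get(const std::vector<KernelBase*>& candidates, int tag_num,
                    CRFDecodingFunc refer) {
  for (KernelBase* k : candidates) {
    if (k->UseMe(tag_num)) return k->func;  // first better match wins
  }
  return refer;  // always-available reference kernel
}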
diff --git a/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt b/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt
index c4a50138636a377d1fbbe14bfa6fd915717b4223..de83d80e7757ad161c810bf17f456d143f3fe597 100644
--- a/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt
@@ -6,4 +6,3 @@ set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_intrinsic PARENT_SCOPE)
 
 # use mkl kernels by name and type
 USE_JITKERNEL_MORE(crfdecoding, intrinsic)
-USE_JITKERNEL_MORE(layernorm, intrinsic)
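The crf_decoding.cc change below rewrites the file (previously a copy of the mkl kernels) into an AVX/AVX512 implementation of the Viterbi forward pass of linear-chain CRF decoding. For orientation, here is a scalar sketch of the recurrence being vectorized, assuming the weight layout implied by the kernel's offsets (row 0 of w holds start transitions and rows 2 onward the transition matrix); the function name is hypothetical and this is an illustration, not Paddle's refer kernel.

#include <limits>

// x: seq_len x tag_num emission scores; w: (tag_num + 2) x tag_num weights;
// alpha: seq_len x tag_num best path scores; track: back-pointers.
void crf_decoding_scalar(int seq_len, const float* x, const float* w,
                         float* alpha, int* track, int tag_num) {
  const int state_trans_base_idx = 2;
  for (int j = 0; j < tag_num; ++j) {
    alpha[j] = w[j] + x[j];  // start transition + first emission
  }
  for (int k = 1; k < seq_len; ++k) {
    for (int j = 0; j < tag_num; ++j) {
      float max_score = -std::numeric_limits<float>::max();
      int max_i = 0;
      for (int i = 0; i < tag_num; ++i) {
        // score of arriving at tag j from tag i
        float score = alpha[(k - 1) * tag_num + i] +
                      w[(i + state_trans_base_idx) * tag_num + j];
        if (score > max_score) {
          max_score = score;
          max_i = i;
        }
      }
      alpha[k * tag_num + j] = max_score + x[k * tag_num + j];
      track[k * tag_num + j] = max_i;
    }
  }
}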
diff --git a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc
index 016fca38686f7682a6b65a94fa44953f26738aec..17b5eaf13dfe69539fa8965e85f46d78bfd54f25 100644
--- a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc
+++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc
@@ -13,7 +13,7 @@
  * limitations under the License. */
 
 #include "paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h"
-#include "paddle/fluid/operators/jit/refer/refer.h"
+#include <limits>
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"
 
@@ -21,118 +21,151 @@ namespace paddle {
 namespace operators {
 namespace jit {
 namespace more {
-namespace mkl {
-
-template <>
-void VMul<float>(const float* x, const float* y, float* z, int n) {
-  platform::dynload::vsMul(n, x, y, z);
-}
-
-template <>
-void VMul<double>(const double* x, const double* y, double* z, int n) {
-  platform::dynload::vdMul(n, x, y, z);
-}
-
-template <>
-void VAdd<float>(const float* x, const float* y, float* z, int n) {
-  platform::dynload::vsAdd(n, x, y, z);
-}
-
-template <>
-void VAdd<double>(const double* x, const double* y, double* z, int n) {
-  platform::dynload::vdAdd(n, x, y, z);
-}
-
-template <>
-void VScal<float>(const float* a, const float* x, float* y, int n) {
-  if (x == y) {
-    platform::dynload::cblas_sscal(n, *a, y, 1);
-  } else {
-    refer::VScal<float>(a, x, y, n);
+namespace intrinsic {
+
+void CRFDecoding(const int seq_len, const float* x, const float* w,
+                 float* alpha, int* track, int tag_num) {
+  const int step_size =
+      platform::MayIUse(platform::avx512f) ? ZMM_FLOAT_BLOCK : YMM_FLOAT_BLOCK;
+  const int end = tag_num / step_size;
+  const int rest = tag_num % step_size;
+  /* Setup the alpha initial value.*/
+  int i_offset = 0;
+  int last_offset = rest - step_size;
+  for (int i = 0; i <= end; ++i) {
+#ifdef __AVX512F__
+    // Declare the variables for the content of weights, input and alpha.
+    __m512 w_content, x_content, alpha_content;
+    // Load the relevant data into the variables from un-aligned addresses.
+    w_content = _mm512_loadu_ps(w + i_offset);
+    x_content = _mm512_loadu_ps(x + i_offset);
+    alpha_content = _mm512_add_ps(w_content, x_content);
+    // Save the alpha value.
+    _mm512_storeu_ps(alpha + i_offset, alpha_content);
+#else
+    // AVX or AVX2
+    // Declare the variables for the content of weights, input and alpha.
+    __m256 w_content, x_content, alpha_content;
+    // Load the relevant data into the variables from un-aligned addresses.
+    w_content = _mm256_loadu_ps(w + i_offset);
+    x_content = _mm256_loadu_ps(x + i_offset);
+    alpha_content = _mm256_add_ps(w_content, x_content);
+    _mm256_storeu_ps(alpha + i_offset, alpha_content);
+#endif
+    i_offset += step_size;
+    if (i == end - 1) {
+      if (rest > 0) {
+        i_offset += last_offset;
+      } else {
+        break;
+      }
+    }
   }
-}
-
-template <>
-void VScal<double>(const double* a, const double* x, double* y, int n) {
-  if (x == y) {
-    platform::dynload::cblas_dscal(n, *a, y, 1);
-  } else {
-    refer::VScal<double>(a, x, y, n);
+  // Use the column-major strategy to get the location of maximum score.
+  int seq_offset = 0;
+  constexpr int state_trans_base_idx = 2;
+  for (int k = 1; k < seq_len; ++k) {
+    int j_offset = 0;
+    for (int j = 0; j <= end; ++j) {
+/* Initialize the variables of maximum score and location.*/
+#ifdef __AVX512F__
+      __m512 max_score = _mm512_set1_ps(-std::numeric_limits<float>::max());
+      __m512i max_j = _mm512_setzero_si512();
+#else
+      __m256 max_score = _mm256_set1_ps(-std::numeric_limits<float>::max());
+      __m256i max_j = _mm256_set1_epi32(0);
+#endif
+      /* Calculate the offset of transition_weights.*/
+      int trans_offset = state_trans_base_idx * tag_num + j_offset;
+      for (int i = 0; i < tag_num; ++i) {
+/* Initialize the content of alpha variable with related offset.*/
+#ifdef __AVX512F__
+        __m512 alpha_content = _mm512_set1_ps(*(alpha + seq_offset + i));
+        /* Obtain the content of weights from un-aligned address.*/
+        __m512 w_content = _mm512_loadu_ps(w + trans_offset);
+        __m512 score_v = _mm512_add_ps(alpha_content, w_content);
+        __mmask16 mask = _mm512_cmp_ps_mask(score_v, max_score, _CMP_GT_OS);
+        /* AVX512 instructions.*/
+        max_j = _mm512_mask_set1_epi32(max_j, mask, i);
+        /* Update the max_score value.*/
+        max_score = _mm512_max_ps(max_score, score_v);
+
+#else
+        __m256 alpha_content = _mm256_broadcast_ss(alpha + seq_offset + i);
+        /* Obtain the content of weights from un-aligned address.*/
+        __m256 w_content = _mm256_loadu_ps(w + trans_offset);
+        __m256 score_v = _mm256_add_ps(alpha_content, w_content);
+        __m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS);
+/* According to the mask value, update the index of the max_score.*/
+#ifdef __AVX2__
+        max_j = _mm256_or_si256(
+            _mm256_andnot_si256((__m256i)mask, max_j),
+            _mm256_and_si256((__m256i)mask, _mm256_set1_epi32(i)));
+#else
+        __m128i lo_max_j = _mm256_extractf128_si256(max_j, 0);
+        __m128i hi_max_j = _mm256_extractf128_si256(max_j, 1);
+        __m128i lo_mask =
+            _mm256_extractf128_si256(*(__m256i*)&mask, 0);  // NOLINT
+        __m128i hi_mask =
+            _mm256_extractf128_si256(*(__m256i*)&mask, 1);  // NOLINT
+        lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j);
+        hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j);
+        lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i));
+        hi_mask = _mm_and_si128(hi_mask, _mm_set1_epi32(i));
+        lo_max_j = _mm_or_si128(lo_mask, lo_max_j);
+        hi_max_j = _mm_or_si128(hi_mask, hi_max_j);
+        max_j = _mm256_insertf128_si256(max_j, lo_max_j, 0);
+        max_j = _mm256_insertf128_si256(max_j, hi_max_j, 1);
+#endif
+        /* Update the max_score value.*/
+        max_score = _mm256_max_ps(max_score, score_v);
+
+#endif
+
+        trans_offset += tag_num;
+      }
+      /* Update the alpha and track values. */
+#ifdef __AVX512F__
+      __m512 x_content =
+          _mm512_loadu_ps(x + seq_offset + tag_num + j_offset);
+      max_score = _mm512_add_ps(max_score, x_content);
+      _mm512_storeu_ps(alpha + seq_offset + tag_num + j_offset, max_score);
+      _mm512_storeu_si512(reinterpret_cast<__m512i*>(track + seq_offset +
+                                                     tag_num + j_offset),
+                          max_j);
+#else
+      __m256 x_content = _mm256_loadu_ps(x + seq_offset + tag_num + j_offset);
+      max_score = _mm256_add_ps(max_score, x_content);
+      _mm256_storeu_ps(alpha + seq_offset + tag_num + j_offset, max_score);
+      _mm256_storeu_si256(
+          reinterpret_cast<__m256i*>(track + seq_offset + tag_num + j_offset),
+          max_j);
+#endif
+
+      /* Calculate the offset of the next step.*/
+      j_offset += step_size;
+      if (j == end - 1) {
+        if (rest > 0) {
+          j_offset += last_offset;
+        } else {
+          break;
+        }
+      }
+    }
+    seq_offset += tag_num;
   }
 }
 
-template <>
-void VExp<float>(const float* x, float* y, int n) {
-  platform::dynload::vsExp(n, x, y);
-}
-
-template <>
-void VExp<double>(const double* x, double* y, int n) {
-  platform::dynload::vdExp(n, x, y);
-}
-
-// TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512
-template <>
-bool VMulKernel<float>::UseMe(int d) const {
-  return platform::MayIUse(platform::avx512f) && d > 512;
+bool CRFDecodingKernel::UseMe(int d) const {
+  return platform::MayIUse(platform::avx);
 }
 
-template <>
-bool VAddKernel<float>::UseMe(int d) const {
-  return platform::MayIUse(platform::avx512f) && d > 512;
-}
-
-template <>
-bool VScalKernel<float>::UseMe(int d) const {
-  return platform::MayIUse(platform::avx512f) && d > 512;
-}
-
-template <>
-bool VExpKernel<float>::UseMe(int d) const {
-  return d > 7;
-}
-
-template <>
-bool VSigmoidKernel<float>::UseMe(int d) const {
-  return d > 7;
-}
-
-template <>
-bool VTanhKernel<float>::UseMe(int d) const {
-  return d > 7;
-}
-
-#define AWALYS_USE_ME_WITH_DOUBLE(func)           \
-  template <>                                     \
-  bool func##Kernel<double>::UseMe(int d) const { \
-    return true;                                  \
-  }
-
-AWALYS_USE_ME_WITH_DOUBLE(VMul);
-AWALYS_USE_ME_WITH_DOUBLE(VAdd);
-AWALYS_USE_ME_WITH_DOUBLE(VScal);
-AWALYS_USE_ME_WITH_DOUBLE(VExp);
-AWALYS_USE_ME_WITH_DOUBLE(VSigmoid);
-AWALYS_USE_ME_WITH_DOUBLE(VTanh);
-
-#undef AWALYS_USE_ME_WITH_DOUBLE
-}  // namespace mkl
+}  // namespace intrinsic
 }  // namespace more
 }  // namespace jit
 }  // namespace operators
 }  // namespace paddle
 
-namespace mkl = paddle::operators::jit::more::mkl;
-
-#define REGISTER_MKL_KERNEL(key, func)                        \
-  REGISTER_JITKERNEL_MORE(key, mkl, mkl::func##Kernel<float>, \
-                          mkl::func##Kernel<double>)
-
-REGISTER_MKL_KERNEL(vmul, VMul);
-REGISTER_MKL_KERNEL(vadd, VAdd);
-REGISTER_MKL_KERNEL(vscal, VScal);
-REGISTER_MKL_KERNEL(vexp, VExp);
-REGISTER_MKL_KERNEL(vsigmoid, VSigmoid);
-REGISTER_MKL_KERNEL(vtanh, VTanh);
+namespace intrinsic = paddle::operators::jit::more::intrinsic;
 
-#undef REGISTER_MKL_KERNEL
+REGISTER_JITKERNEL_MORE(crfdecoding, intrinsic, intrinsic::CRFDecodingKernel);
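In the plain-AVX fallback above, 256-bit integer logic instructions are unavailable, so the per-lane select max_j = mask ? i : max_j is performed on two 128-bit halves with andnot/and/or. The following self-contained sketch shows the same select idiom on scalar lanes (illustration only; the helper name is hypothetical).

#include <cstdint>
#include <cstdio>

// Branchless lane select: result = mask ? b : a, where mask is all-ones or
// all-zeros per lane -- the same andnot/and/or idiom the kernel applies with
// _mm_andnot_si128/_mm_and_si128/_mm_or_si128 on each 128-bit half.
static uint32_t select_lane(uint32_t mask, uint32_t a, uint32_t b) {
  return (~mask & a) | (mask & b);
}

int main() {
  // Emulate one argmax update step: lanes where the compare produced
  // all-ones take the new index i; the rest keep their previous index.
  uint32_t max_j[4] = {0, 1, 2, 3};  // previous best indices
  uint32_t mask[4] = {0xFFFFFFFFu, 0, 0xFFFFFFFFu, 0};  // score_v > max_score
  uint32_t i = 7;                    // candidate index being tested
  for (int lane = 0; lane < 4; ++lane) {
    max_j[lane] = select_lane(mask[lane], max_j[lane], i);
  }
  std::printf("%u %u %u %u\n", max_j[0], max_j[1], max_j[2], max_j[3]);
  // prints: 7 1 7 3
  return 0;
}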
diff --git a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h
index bf209d2f9d23dbd1376b0ea2e2ac007280dded4c..a4081cfc34b0d5c59456c8d3aafce06790b5b060 100644
--- a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h
+++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h
@@ -21,68 +21,18 @@ namespace paddle {
 namespace operators {
 namespace jit {
 namespace more {
-namespace mkl {
+namespace intrinsic {
 
-template <typename T>
-void VMul(const T* x, const T* y, T* z, int n);
+void CRFDecoding(const int seq_len, const float* x, const float* w,
+                 float* alpha, int* track, int tag_num);
 
-template <typename T>
-void VAdd(const T* x, const T* y, T* z, int n);
+class CRFDecodingKernel : public KernelImpl<CRFDecodingTuples<float>> {
+ public:
+  CRFDecodingKernel() { this->func = CRFDecoding; }
+  bool UseMe(typename CRFDecodingTuples<float>::attr_type) const override;
+};
 
-template <typename T>
-void VScal(const T* a, const T* x, T* y, int n);
-
-template <typename T>
-void VExp(const T* x, T* y, int n);
-
-template <typename T>
-void VSigmoid(const T* x, T* y, int n) {
-  const T min = SIGMOID_THRESHOLD_MIN;
-  const T max = SIGMOID_THRESHOLD_MAX;
-  for (int i = 0; i < n; ++i) {
-    y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
-    y[i] = static_cast<T>(0) - y[i];
-  }
-  VExp(y, y, n);
-  for (int i = 0; i < n; ++i) {
-    y[i] = static_cast<T>(1) / (static_cast<T>(1) + y[i]);
-  }
-}
-
-template <typename T>
-void VTanh(const T* x, T* y, int n) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = static_cast<T>(2) * x[i];
-  }
-  VSigmoid(y, y, n);
-  for (int i = 0; i < n; ++i) {
-    y[i] = static_cast<T>(2) * y[i] - static_cast<T>(1);
-  }
-}
-
-#define DECLARE_MKL_KERNEL(name, tuples)                      \
-  template <typename T>                                       \
-  class name##Kernel : public KernelImpl<tuples<T>> {         \
-   public:                                                    \
-    name##Kernel() { this->func = name<T>; }                  \
-    bool UseMe(typename tuples<T>::attr_type) const override; \
-  }
-
-// XYZN
-DECLARE_MKL_KERNEL(VMul, XYZNTuples);
-DECLARE_MKL_KERNEL(VAdd, XYZNTuples);
-
-// AXYN
-DECLARE_MKL_KERNEL(VScal, AXYNTuples);
-
-// XYN
-DECLARE_MKL_KERNEL(VExp, XYNTuples);
-DECLARE_MKL_KERNEL(VSigmoid, XYNTuples);
-DECLARE_MKL_KERNEL(VTanh, XYNTuples);
-
-#undef DECLARE_MKL_KERNEL
-
-}  // namespace mkl
+}  // namespace intrinsic
 }  // namespace more
 }  // namespace jit
 }  // namespace operators
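A subtlety shared by both loops of the CRFDecoding kernel above is the tail handling: when tag_num is not a multiple of the vector width, last_offset = rest - step_size is negative, so the final block is shifted left to overlap the previous one rather than read past the end of the array. A small self-contained demonstration of the offsets this produces (illustration only):

#include <cstdio>

// Demonstrates the overlapping-tail offset pattern used by the kernel:
// process the full blocks, then shift the last block left so it ends
// exactly at tag_num instead of overrunning the array.
int main() {
  const int tag_num = 20;
  const int step_size = 8;                   // e.g. YMM_FLOAT_BLOCK
  const int end = tag_num / step_size;       // 2 full blocks
  const int rest = tag_num % step_size;      // 4 leftover elements
  const int last_offset = rest - step_size;  // -4: step back for the tail
  int i_offset = 0;
  for (int i = 0; i <= end; ++i) {
    std::printf("block %d covers [%d, %d)\n", i, i_offset,
                i_offset + step_size);
    i_offset += step_size;
    if (i == end - 1) {
      if (rest > 0) {
        i_offset += last_offset;  // overlap so the tail block stays in bounds
      } else {
        break;                    // tag_num divides evenly; no tail block
      }
    }
  }
  // prints: [0,8), [8,16), [12,20) -- the last block overlaps [12,16)
  return 0;
}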