From efbdad059634bef022d4a3f5b00aef6ef8e88ed6 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Thu, 24 Oct 2019 15:35:34 +0800 Subject: [PATCH] make search_compute support avx default (#20779) * make search_compute support avx only * clean search_compute.h * rename sse_axpy to avx_axpy test=develop * update CMakeLists.txt test=develop --- paddle/fluid/operators/CMakeLists.txt | 2 +- .../fluid/operators/match_matrix_tensor_op.cc | 4 +-- paddle/fluid/operators/search_compute.h | 33 +------------------ .../fluid/tests/unittests/CMakeLists.txt | 4 +-- 4 files changed, 6 insertions(+), 37 deletions(-) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 35c4268a7f9..07931de4ffd 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -49,7 +49,7 @@ if (WITH_DISTRIBUTE) endif() SET(OP_ONLY_MKL "") -if (NOT WITH_MKL) +if (NOT WITH_MKL OR NOT WITH_AVX) SET(OP_ONLY_MKL ${OP_ONLY_MKL} match_matrix_tensor_op) SET(OP_ONLY_MKL ${OP_ONLY_MKL} var_conv_2d_op) endif() diff --git a/paddle/fluid/operators/match_matrix_tensor_op.cc b/paddle/fluid/operators/match_matrix_tensor_op.cc index e9a645d2e0b..d30f396f9c1 100644 --- a/paddle/fluid/operators/match_matrix_tensor_op.cc +++ b/paddle/fluid/operators/match_matrix_tensor_op.cc @@ -286,8 +286,8 @@ class CPUMatchMatrixTensorOPGradKernel : public framework::OpKernel { auto* r_data = bottom_r_data + (offset_r[b] + j) * dim_in; auto* r_diff = bottom_r_diff + (offset_r[b] + j) * dim_in; if (diff != 0.0) { - sse_axpy(r_data, l_trans_diff, dim_in, diff); - sse_axpy(l_trans_data, r_diff, dim_in, diff); + avx_axpy(r_data, l_trans_diff, dim_in, diff); + avx_axpy(l_trans_data, r_diff, dim_in, diff); } } } diff --git a/paddle/fluid/operators/search_compute.h b/paddle/fluid/operators/search_compute.h index c795f1e390b..995c85d9ff4 100644 --- a/paddle/fluid/operators/search_compute.h +++ b/paddle/fluid/operators/search_compute.h @@ -73,22 +73,10 @@ void call_gemm_batched(const framework::ExecutionContext& ctx, } } -#ifndef TYPE_USE_FLOAT -#define TYPE_USE_FLOAT -#endif -#ifndef USE_SSE -#define USE_SSE -#endif - -#if defined(TYPE_USE_FLOAT) - #define __m256x __m256 -#define __m128x __m128 static const unsigned int AVX_STEP_SIZE = 8; -static const unsigned int SSE_STEP_SIZE = 4; static const unsigned int AVX_CUT_LEN_MASK = 7U; -static const unsigned int SSE_CUT_LEN_MASK = 3U; #define _mm256_mul_px _mm256_mul_ps #define _mm256_add_px _mm256_add_ps @@ -96,20 +84,11 @@ static const unsigned int SSE_CUT_LEN_MASK = 3U; #define _mm256_store_px _mm256_storeu_ps #define _mm256_broadcast_sx _mm256_broadcast_ss -#define _mm_add_px _mm_add_ps -#define _mm_mul_px _mm_mul_ps -#define _mm_load_px _mm_loadu_ps -#define _mm_store_px _mm_storeu_ps -#define _mm_load1_px _mm_load1_ps - -#endif - template -inline void sse_axpy(const T* x, T* y, size_t len, const T alpha) { +inline void avx_axpy(const T* x, T* y, size_t len, const T alpha) { unsigned int jjj, lll; jjj = lll = 0; -#if defined(USE_AVX) lll = len & ~AVX_CUT_LEN_MASK; __m256x mm_alpha = _mm256_broadcast_sx(&alpha); for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { @@ -119,16 +98,6 @@ inline void sse_axpy(const T* x, T* y, size_t len, const T alpha) { _mm256_mul_px(mm_alpha, _mm256_load_px(x + jjj)))); } -#elif defined(USE_SSE) - lll = len & ~SSE_CUT_LEN_MASK; - __m128x mm_alpha = _mm_load1_px(&alpha); - for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { - _mm_store_px(y + jjj, - _mm_add_px(_mm_load_px(y + jjj), - _mm_mul_px(mm_alpha, _mm_load_px(x + jjj)))); - } - -#endif for (; jjj < len; jjj++) { y[jjj] += alpha * x[jjj]; } diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 7215e3dde47..f77f018f65b 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -70,10 +70,10 @@ if(NOT WITH_MKLML) list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op) endif() -if(NOT WITH_MKL) +if(NOT WITH_MKL OR NOT WITH_AVX) list(REMOVE_ITEM TEST_OPS test_match_matrix_tensor_op) list(REMOVE_ITEM TEST_OPS test_var_conv_2d) -endif(NOT WITH_MKL) +endif() if(WITH_GPU OR NOT WITH_MKLML) # matmul with multiple heads need MKL support -- GitLab