diff --git a/CMakeLists.txt b/CMakeLists.txt index f03cda950a927ff42b1241d4c2a07f0aa29fa43f..24ff21b60c5e6e92452aca1ebda17eff3aaee621 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,6 +25,7 @@ find_package(Git REQUIRED) find_package(Threads REQUIRED) include(system) +include(simd) ################################ Configurations ####################################### option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) @@ -64,7 +65,6 @@ include(external/openblas) # download, build, install openblas include(external/swig) # download, build, install swig include(external/warpctc) # download, build, install warpctc -include(simd) # set simd flag include(package) # set paddle packages include(cpplint) # set paddle c++ style include(ccache) # set ccache for compilation diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 6f450c61291b6503629806c10010c96e3b8f13f4..994ed28ba5fef77e74680e587b0b5f0dd71d23f9 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -32,6 +32,16 @@ if(NOT WITH_PROFILER) add_definitions(-DPADDLE_DISABLE_PROFILER) endif(NOT WITH_PROFILER) +if(NEON_FOUND) + set(SIMD_FLAG ${NEON_FLAG}) +else(NEON_FOUND) + if(WITH_AVX) + set(SIMD_FLAG ${AVX_FLAG}) + else(WITH_AVX) + set(SIMD_FLAG ${SSE3_FLAG}) + endif(WITH_AVX) +endif(NEON_FOUND) + if(NOT WITH_GPU) add_definitions(-DPADDLE_ONLY_CPU) add_definitions(-DHPPL_STUB_FUNC) @@ -48,11 +58,7 @@ else() message(FATAL_ERROR "Paddle need cudnn to compile") endif() - if(WITH_AVX) - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${AVX_FLAG}") - else(WITH_AVX) - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}") - endif(WITH_AVX) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}") # Include cuda and cudnn include_directories(${CUDNN_INCLUDE_DIR}) diff --git a/cmake/simd.cmake b/cmake/simd.cmake index 4e4c4e6e437339d9a5a3abde4ea2d27580b9e88b..31776e5f7f08fa5a0945b872f707a1cd0995214d 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ 
-88,14 +88,5 @@ int main() return 0; }" NEON_FOUND) -if(NEON_FOUND) - set(SIMD_FLAG ${NEON_FLAG}) -else(NEON_FOUND) - if(WITH_AVX) - set(SIMD_FLAG ${AVX_FLAG}) - else(WITH_AVX) - set(SIMD_FLAG ${SSE3_FLAG}) - endif(WITH_AVX) -endif(NEON_FOUND) - +set(CMAKE_REQUIRED_FLAGS "") mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND NEON_FOUND) diff --git a/paddle/math/SIMDFunctions.cpp b/paddle/math/SIMDFunctions.cpp index 60d37cef4000e9bd9ea78e5d72f065dadb4b4b61..d66d543a61450b47b7758b50eaecc107c6fe3576 100644 --- a/paddle/math/SIMDFunctions.cpp +++ b/paddle/math/SIMDFunctions.cpp @@ -13,122 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "SIMDFunctions.h" -#ifdef __SSE__ +#ifdef __SSE3__ #include <immintrin.h> #endif #include <algorithm> -#ifdef __SSE__ -static void addto_sse(float* a, const float* b, size_t len) { - int offset = len % 16; - __m128 ma0, ma1, ma2, ma3; - __m128 mb0, mb1, mb2, mb3; - - for (unsigned int k = 0; k < len / 16; k++, a += 16, b += 16) { - ma0 = _mm_load_ps(a); - ma1 = _mm_load_ps(a + 4); - ma2 = _mm_load_ps(a + 8); - ma3 = _mm_load_ps(a + 12); - - mb0 = _mm_load_ps(b); - mb1 = _mm_load_ps(b + 4); - mb2 = _mm_load_ps(b + 8); - mb3 = _mm_load_ps(b + 12); - - ma0 = _mm_add_ps(ma0, mb0); - ma1 = _mm_add_ps(ma1, mb1); - ma2 = _mm_add_ps(ma2, mb2); - ma3 = _mm_add_ps(ma3, mb3); - - _mm_store_ps(a, ma0); - _mm_store_ps(a + 4, ma1); - _mm_store_ps(a + 8, ma2); - _mm_store_ps(a + 12, ma3); - } - - for (int i = 0; i < offset; i++) a[i] += b[i]; -} - -static void batch_addto_sse(float* a, const float* b[], int batch, size_t len) { - int offset = len % 16; - - __m128 ma0, ma1, ma2, ma3; - __m128 mb0, mb1, mb2, mb3; - - for (unsigned int k = 0; k < len / 16; k++, a += 16) { - ma0 = _mm_load_ps(a); - ma1 = _mm_load_ps(a + 4); - ma2 = _mm_load_ps(a + 8); - ma3 = _mm_load_ps(a + 12); - - for (int i = 0; i < batch; i++) { - mb0 = _mm_load_ps(b[i]); - mb1 = _mm_load_ps(b[i] + 4); - mb2 = 
_mm_load_ps(b[i] + 8); - mb3 = _mm_load_ps(b[i] + 12); - ma0 = _mm_add_ps(ma0, mb0); - ma1 = _mm_add_ps(ma1, mb1); - ma2 = _mm_add_ps(ma2, mb2); - ma3 = _mm_add_ps(ma3, mb3); - b[i] += 16; - } - - _mm_store_ps(a, ma0); - _mm_store_ps(a + 4, ma1); - _mm_store_ps(a + 8, ma2); - _mm_store_ps(a + 12, ma3); - } - - for (int i = 0; i < offset; i++) { - for (int k = 0; k < batch; k++) a[i] += b[k][i]; - } - return; -} - -static void col_max_sse(float* result, - const float* data, - int dim, - int numSamples) { - // first sample, direct copy - for (int d = 0; d < dim; ++d) { - result[d] = data[d]; - } - int offset = dim % 16; - __m128 ma0, ma1, ma2, ma3; - __m128 mb0, mb1, mb2, mb3; - // first 16n dims - for (int k = 0; k < dim / 16; k++, result += 16, data += 16) { - ma0 = _mm_load_ps(result); - ma1 = _mm_load_ps(result + 4); - ma2 = _mm_load_ps(result + 8); - ma3 = _mm_load_ps(result + 12); - for (int i = 1; i < numSamples; i++) { - mb0 = _mm_load_ps(data + i * dim); - mb1 = _mm_load_ps(data + i * dim + 4); - mb2 = _mm_load_ps(data + i * dim + 8); - mb3 = _mm_load_ps(data + i * dim + 12); - ma0 = _mm_max_ps(ma0, mb0); - ma1 = _mm_max_ps(ma1, mb1); - ma2 = _mm_max_ps(ma2, mb2); - ma3 = _mm_max_ps(ma3, mb3); - } - _mm_store_ps(result, ma0); - _mm_store_ps(result + 4, ma1); - _mm_store_ps(result + 8, ma2); - _mm_store_ps(result + 12, ma3); - } - // last dims - for (int d = 0; d < offset; ++d) { - float sm = data[d]; - for (int i = 1; i < numSamples; ++i) { - sm = std::max(sm, data[i * dim + d]); - } - result[d] = sm; - } -} - -#elif defined(__AVX__) - +#ifdef __AVX__ static void addto_avx(float* a, const float* b, size_t len) { int offset = len % 32; @@ -358,18 +248,128 @@ static void decayL1_avx( } } +#elif defined(__SSE3__) + +static void addto_sse(float* a, const float* b, size_t len) { + int offset = len % 16; + __m128 ma0, ma1, ma2, ma3; + __m128 mb0, mb1, mb2, mb3; + + for (unsigned int k = 0; k < len / 16; k++, a += 16, b += 16) { + ma0 = _mm_load_ps(a); + ma1 = 
_mm_load_ps(a + 4); + ma2 = _mm_load_ps(a + 8); + ma3 = _mm_load_ps(a + 12); + + mb0 = _mm_load_ps(b); + mb1 = _mm_load_ps(b + 4); + mb2 = _mm_load_ps(b + 8); + mb3 = _mm_load_ps(b + 12); + + ma0 = _mm_add_ps(ma0, mb0); + ma1 = _mm_add_ps(ma1, mb1); + ma2 = _mm_add_ps(ma2, mb2); + ma3 = _mm_add_ps(ma3, mb3); + + _mm_store_ps(a, ma0); + _mm_store_ps(a + 4, ma1); + _mm_store_ps(a + 8, ma2); + _mm_store_ps(a + 12, ma3); + } + + for (int i = 0; i < offset; i++) a[i] += b[i]; +} + +static void batch_addto_sse(float* a, const float* b[], int batch, size_t len) { + int offset = len % 16; + + __m128 ma0, ma1, ma2, ma3; + __m128 mb0, mb1, mb2, mb3; + + for (unsigned int k = 0; k < len / 16; k++, a += 16) { + ma0 = _mm_load_ps(a); + ma1 = _mm_load_ps(a + 4); + ma2 = _mm_load_ps(a + 8); + ma3 = _mm_load_ps(a + 12); + + for (int i = 0; i < batch; i++) { + mb0 = _mm_load_ps(b[i]); + mb1 = _mm_load_ps(b[i] + 4); + mb2 = _mm_load_ps(b[i] + 8); + mb3 = _mm_load_ps(b[i] + 12); + ma0 = _mm_add_ps(ma0, mb0); + ma1 = _mm_add_ps(ma1, mb1); + ma2 = _mm_add_ps(ma2, mb2); + ma3 = _mm_add_ps(ma3, mb3); + b[i] += 16; + } + + _mm_store_ps(a, ma0); + _mm_store_ps(a + 4, ma1); + _mm_store_ps(a + 8, ma2); + _mm_store_ps(a + 12, ma3); + } + + for (int i = 0; i < offset; i++) { + for (int k = 0; k < batch; k++) a[i] += b[k][i]; + } + return; +} + +static void col_max_sse(float* result, + const float* data, + int dim, + int numSamples) { + // first sample, direct copy + for (int d = 0; d < dim; ++d) { + result[d] = data[d]; + } + int offset = dim % 16; + __m128 ma0, ma1, ma2, ma3; + __m128 mb0, mb1, mb2, mb3; + // first 16n dims + for (int k = 0; k < dim / 16; k++, result += 16, data += 16) { + ma0 = _mm_load_ps(result); + ma1 = _mm_load_ps(result + 4); + ma2 = _mm_load_ps(result + 8); + ma3 = _mm_load_ps(result + 12); + for (int i = 1; i < numSamples; i++) { + mb0 = _mm_load_ps(data + i * dim); + mb1 = _mm_load_ps(data + i * dim + 4); + mb2 = _mm_load_ps(data + i * dim + 8); + mb3 = 
_mm_load_ps(data + i * dim + 12); + ma0 = _mm_max_ps(ma0, mb0); + ma1 = _mm_max_ps(ma1, mb1); + ma2 = _mm_max_ps(ma2, mb2); + ma3 = _mm_max_ps(ma3, mb3); + } + _mm_store_ps(result, ma0); + _mm_store_ps(result + 4, ma1); + _mm_store_ps(result + 8, ma2); + _mm_store_ps(result + 12, ma3); + } + // last dims + for (int d = 0; d < offset; ++d) { + float sm = data[d]; + for (int i = 1; i < numSamples; ++i) { + sm = std::max(sm, data[i * dim + d]); + } + result[d] = sm; + } +} + #endif -#ifdef __SSE__ -#define SIMD_INVOKE(func, ...) func##_sse(__VA_ARGS__) -#elif __AVX__ +#if defined(__AVX__) #define SIMD_INVOKE(func, ...) func##_avx(__VA_ARGS__) +#elif defined(__SSE3__) +#define SIMD_INVOKE(func, ...) func##_sse(__VA_ARGS__) #endif namespace paddle { namespace simd { namespace internal { -#ifdef __SSE__ +#ifdef __SSE3__ void addToImpl(float* a, const float* b, size_t len) { SIMD_INVOKE(addto, a, b, len); } @@ -390,8 +390,8 @@ void decayL1AvxImpl( float* dst, float* src, float* lr, float lambda, size_t len) { decayL1_avx(dst, src, lr, lambda, len); } - #endif + } // namespace internal } // namespace simd } // namespace paddle diff --git a/paddle/math/SIMDFunctions.h b/paddle/math/SIMDFunctions.h index 9df02faa9027dae7f00b08d0f8cea2d459b0324e..439f11b79d134d7054f45f2d0a70fc5a6fde6c13 100644 --- a/paddle/math/SIMDFunctions.h +++ b/paddle/math/SIMDFunctions.h @@ -128,7 +128,7 @@ void decayL1AvxImpl( template <> inline void addTo(float* a, const float* b, size_t len) { -#ifdef __SSE__ +#ifdef __SSE3__ internal::addToImpl(a, b, len); #else naive::addTo(a, b, len); @@ -137,7 +137,7 @@ inline void addTo(float* a, const float* b, size_t len) { template <> inline void batchAddTo(float* a, const float* b[], int batch, size_t len) { -#ifdef __SSE__ +#ifdef __SSE3__ internal::batchAddToImpl(a, b, batch, len); #else naive::batchAddTo(a, b, batch, len); @@ -146,7 +146,7 @@ inline void batchAddTo(float* a, const float* b[], int batch, size_t len) { template <> inline void colMax(float* 
result, const float* data, int dim, int numSamples) { -#ifdef __SSE__ +#ifdef __SSE3__ internal::colMaxImpl(result, data, dim, numSamples); #else naive::colMax(result, data, dim, numSamples);