Commit 38fa74ed authored by Liu Yiqun

Fix cmake error of failing to find UINT64_MAX.

Parent: f261dc6a
@@ -25,6 +25,7 @@ find_package(Git REQUIRED)
 find_package(Threads REQUIRED)
 include(system)
+include(simd)
 
 ################################ Configurations #######################################
 option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
@@ -64,7 +65,6 @@ include(external/openblas)   # download, build, install openblas
 include(external/swig)    # download, build, install swig
 include(external/warpctc) # download, build, install warpctc
-include(simd)             # set simd flag
 include(package)          # set paddle packages
 include(cpplint)          # set paddle c++ style
 include(ccache)           # set ccache for compilation
...
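Why the include(simd) call moves up here rather than staying with the other include()s below: the SIMD_FLAG selection added in the next hunk reads NEON_FOUND, NEON_FLAG, AVX_FLAG and SSE3_FLAG, all of which are produced by cmake/simd.cmake, so that module now has to run early, before the Configurations section and before the file in the next hunk consumes those results. A minimal sketch of the ordering this enables; the option name and help text below are an assumption and are not part of this hunk, only the variable names come from the diff.

include(simd)   # defines the MMX/SSE2/SSE3/AVX/AVX2/NEON *_FOUND results and the matching *_FLAG values

################################ Configurations #######################################
# Assumed shape of an option whose default consumes one of those results; it can only
# work if simd.cmake has already been included at this point.
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})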
@@ -32,6 +32,16 @@ if(NOT WITH_PROFILER)
     add_definitions(-DPADDLE_DISABLE_PROFILER)
 endif(NOT WITH_PROFILER)
 
+if(NEON_FOUND)
+    set(SIMD_FLAG ${NEON_FLAG})
+else(NEON_FOUND)
+    if(WITH_AVX)
+        set(SIMD_FLAG ${AVX_FLAG})
+    else(WITH_AVX)
+        set(SIMD_FLAG ${SSE3_FLAG})
+    endif(WITH_AVX)
+endif(NEON_FOUND)
+
 if(NOT WITH_GPU)
     add_definitions(-DPADDLE_ONLY_CPU)
     add_definitions(-DHPPL_STUB_FUNC)
@@ -48,11 +58,7 @@ else()
         message(FATAL_ERROR "Paddle need cudnn to compile")
     endif()
 
-    if(WITH_AVX)
-        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${AVX_FLAG}")
-    else(WITH_AVX)
-        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}")
-    endif(WITH_AVX)
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}")
 
     # Include cuda and cudnn
     include_directories(${CUDNN_INCLUDE_DIR})
...
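One detail in the hunk above: the chosen SIMD flag also has to reach the host compiler that nvcc drives, which is what the -Xcompiler pass-through does; without it, host code in .cu files would be built without the matching -m flags. A minimal sketch of that pattern, with an assumed flag value that is not taken from the diff:

set(SIMD_FLAG "-mavx")                                        # assumed result of the selection above
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}")
# nvcc forwards "-mavx" to the host C++ compiler it invokes, so host-side code in .cu
# files is compiled with the same instruction-set baseline as the rest of the build.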
@@ -88,14 +88,5 @@ int main()
     return 0;
 }" NEON_FOUND)
-
-if(NEON_FOUND)
-    set(SIMD_FLAG ${NEON_FLAG})
-else(NEON_FOUND)
-    if(WITH_AVX)
-        set(SIMD_FLAG ${AVX_FLAG})
-    else(WITH_AVX)
-        set(SIMD_FLAG ${SSE3_FLAG})
-    endif(WITH_AVX)
-endif(NEON_FOUND)
+set(CMAKE_REQUIRED_FLAGS "")
 
 mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND NEON_FOUND)
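The set(CMAKE_REQUIRED_FLAGS "") added above is what the commit title refers to: the check_cxx_source_runs()/check_cxx_source_compiles() helpers compile their test programs with whatever is currently in CMAKE_REQUIRED_FLAGS, and that variable stays set after the NEON probe finishes. Presumably a later feature test, one relying on UINT64_MAX from <stdint.h>, was then compiled with the leftover NEON flags on a non-ARM toolchain and failed for the wrong reason. A minimal sketch of the leak and the fix; the probe body and the NEON_FLAG value are placeholders, not the real probe shown above:

include(CheckCXXSourceRuns)

set(CMAKE_REQUIRED_FLAGS ${NEON_FLAG})          # e.g. "-mfpu=neon" on 32-bit ARM (assumed value)
check_cxx_source_runs("int main() { return 0; }" NEON_FOUND)
set(CMAKE_REQUIRED_FLAGS "")                    # reset, so the next check_cxx_* call starts clean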
@@ -13,122 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "SIMDFunctions.h"
-#ifdef __SSE__
+#ifdef __SSE3__
 #include <immintrin.h>
 #endif
 #include <algorithm>
 
-#ifdef __SSE__
+#ifdef __AVX__
-static void addto_sse(float* a, const float* b, size_t len) {
-  int offset = len % 16;
-  __m128 ma0, ma1, ma2, ma3;
-  __m128 mb0, mb1, mb2, mb3;
-  for (unsigned int k = 0; k < len / 16; k++, a += 16, b += 16) {
-    ma0 = _mm_load_ps(a);
-    ma1 = _mm_load_ps(a + 4);
-    ma2 = _mm_load_ps(a + 8);
-    ma3 = _mm_load_ps(a + 12);
-    mb0 = _mm_load_ps(b);
-    mb1 = _mm_load_ps(b + 4);
-    mb2 = _mm_load_ps(b + 8);
-    mb3 = _mm_load_ps(b + 12);
-    ma0 = _mm_add_ps(ma0, mb0);
-    ma1 = _mm_add_ps(ma1, mb1);
-    ma2 = _mm_add_ps(ma2, mb2);
-    ma3 = _mm_add_ps(ma3, mb3);
-    _mm_store_ps(a, ma0);
-    _mm_store_ps(a + 4, ma1);
-    _mm_store_ps(a + 8, ma2);
-    _mm_store_ps(a + 12, ma3);
-  }
-  for (int i = 0; i < offset; i++) a[i] += b[i];
-}
-
-static void batch_addto_sse(float* a, const float* b[], int batch, size_t len) {
-  int offset = len % 16;
-  __m128 ma0, ma1, ma2, ma3;
-  __m128 mb0, mb1, mb2, mb3;
-  for (unsigned int k = 0; k < len / 16; k++, a += 16) {
-    ma0 = _mm_load_ps(a);
-    ma1 = _mm_load_ps(a + 4);
-    ma2 = _mm_load_ps(a + 8);
-    ma3 = _mm_load_ps(a + 12);
-    for (int i = 0; i < batch; i++) {
-      mb0 = _mm_load_ps(b[i]);
-      mb1 = _mm_load_ps(b[i] + 4);
-      mb2 = _mm_load_ps(b[i] + 8);
-      mb3 = _mm_load_ps(b[i] + 12);
-      ma0 = _mm_add_ps(ma0, mb0);
-      ma1 = _mm_add_ps(ma1, mb1);
-      ma2 = _mm_add_ps(ma2, mb2);
-      ma3 = _mm_add_ps(ma3, mb3);
-      b[i] += 16;
-    }
-    _mm_store_ps(a, ma0);
-    _mm_store_ps(a + 4, ma1);
-    _mm_store_ps(a + 8, ma2);
-    _mm_store_ps(a + 12, ma3);
-  }
-  for (int i = 0; i < offset; i++) {
-    for (int k = 0; k < batch; k++) a[i] += b[k][i];
-  }
-  return;
-}
-
-static void col_max_sse(float* result,
-                        const float* data,
-                        int dim,
-                        int numSamples) {
-  // first sample, direct copy
-  for (int d = 0; d < dim; ++d) {
-    result[d] = data[d];
-  }
-  int offset = dim % 16;
-  __m128 ma0, ma1, ma2, ma3;
-  __m128 mb0, mb1, mb2, mb3;
-  // first 16n dims
-  for (int k = 0; k < dim / 16; k++, result += 16, data += 16) {
-    ma0 = _mm_load_ps(result);
-    ma1 = _mm_load_ps(result + 4);
-    ma2 = _mm_load_ps(result + 8);
-    ma3 = _mm_load_ps(result + 12);
-    for (int i = 1; i < numSamples; i++) {
-      mb0 = _mm_load_ps(data + i * dim);
-      mb1 = _mm_load_ps(data + i * dim + 4);
-      mb2 = _mm_load_ps(data + i * dim + 8);
-      mb3 = _mm_load_ps(data + i * dim + 12);
-      ma0 = _mm_max_ps(ma0, mb0);
-      ma1 = _mm_max_ps(ma1, mb1);
-      ma2 = _mm_max_ps(ma2, mb2);
-      ma3 = _mm_max_ps(ma3, mb3);
-    }
-    _mm_store_ps(result, ma0);
-    _mm_store_ps(result + 4, ma1);
-    _mm_store_ps(result + 8, ma2);
-    _mm_store_ps(result + 12, ma3);
-  }
-  // last dims
-  for (int d = 0; d < offset; ++d) {
-    float sm = data[d];
-    for (int i = 1; i < numSamples; ++i) {
-      sm = std::max(sm, data[i * dim + d]);
-    }
-    result[d] = sm;
-  }
-}
-#elif defined(__AVX__)
 
 static void addto_avx(float* a, const float* b, size_t len) {
   int offset = len % 32;
@@ -358,18 +248,128 @@ static void decayL1_avx(
   }
 }
+#elif defined(__SSE3__)
+
+static void addto_sse(float* a, const float* b, size_t len) {
+  int offset = len % 16;
+  __m128 ma0, ma1, ma2, ma3;
+  __m128 mb0, mb1, mb2, mb3;
+  for (unsigned int k = 0; k < len / 16; k++, a += 16, b += 16) {
+    ma0 = _mm_load_ps(a);
+    ma1 = _mm_load_ps(a + 4);
+    ma2 = _mm_load_ps(a + 8);
+    ma3 = _mm_load_ps(a + 12);
+    mb0 = _mm_load_ps(b);
+    mb1 = _mm_load_ps(b + 4);
+    mb2 = _mm_load_ps(b + 8);
+    mb3 = _mm_load_ps(b + 12);
+    ma0 = _mm_add_ps(ma0, mb0);
+    ma1 = _mm_add_ps(ma1, mb1);
+    ma2 = _mm_add_ps(ma2, mb2);
+    ma3 = _mm_add_ps(ma3, mb3);
+    _mm_store_ps(a, ma0);
+    _mm_store_ps(a + 4, ma1);
+    _mm_store_ps(a + 8, ma2);
+    _mm_store_ps(a + 12, ma3);
+  }
+  for (int i = 0; i < offset; i++) a[i] += b[i];
+}
+
+static void batch_addto_sse(float* a, const float* b[], int batch, size_t len) {
+  int offset = len % 16;
+  __m128 ma0, ma1, ma2, ma3;
+  __m128 mb0, mb1, mb2, mb3;
+  for (unsigned int k = 0; k < len / 16; k++, a += 16) {
+    ma0 = _mm_load_ps(a);
+    ma1 = _mm_load_ps(a + 4);
+    ma2 = _mm_load_ps(a + 8);
+    ma3 = _mm_load_ps(a + 12);
+    for (int i = 0; i < batch; i++) {
+      mb0 = _mm_load_ps(b[i]);
+      mb1 = _mm_load_ps(b[i] + 4);
+      mb2 = _mm_load_ps(b[i] + 8);
+      mb3 = _mm_load_ps(b[i] + 12);
+      ma0 = _mm_add_ps(ma0, mb0);
+      ma1 = _mm_add_ps(ma1, mb1);
+      ma2 = _mm_add_ps(ma2, mb2);
+      ma3 = _mm_add_ps(ma3, mb3);
+      b[i] += 16;
+    }
+    _mm_store_ps(a, ma0);
+    _mm_store_ps(a + 4, ma1);
+    _mm_store_ps(a + 8, ma2);
+    _mm_store_ps(a + 12, ma3);
+  }
+  for (int i = 0; i < offset; i++) {
+    for (int k = 0; k < batch; k++) a[i] += b[k][i];
+  }
+  return;
+}
+
+static void col_max_sse(float* result,
+                        const float* data,
+                        int dim,
+                        int numSamples) {
+  // first sample, direct copy
+  for (int d = 0; d < dim; ++d) {
+    result[d] = data[d];
+  }
+  int offset = dim % 16;
+  __m128 ma0, ma1, ma2, ma3;
+  __m128 mb0, mb1, mb2, mb3;
+  // first 16n dims
+  for (int k = 0; k < dim / 16; k++, result += 16, data += 16) {
+    ma0 = _mm_load_ps(result);
+    ma1 = _mm_load_ps(result + 4);
+    ma2 = _mm_load_ps(result + 8);
+    ma3 = _mm_load_ps(result + 12);
+    for (int i = 1; i < numSamples; i++) {
+      mb0 = _mm_load_ps(data + i * dim);
+      mb1 = _mm_load_ps(data + i * dim + 4);
+      mb2 = _mm_load_ps(data + i * dim + 8);
+      mb3 = _mm_load_ps(data + i * dim + 12);
+      ma0 = _mm_max_ps(ma0, mb0);
+      ma1 = _mm_max_ps(ma1, mb1);
+      ma2 = _mm_max_ps(ma2, mb2);
+      ma3 = _mm_max_ps(ma3, mb3);
+    }
+    _mm_store_ps(result, ma0);
+    _mm_store_ps(result + 4, ma1);
+    _mm_store_ps(result + 8, ma2);
+    _mm_store_ps(result + 12, ma3);
+  }
+  // last dims
+  for (int d = 0; d < offset; ++d) {
+    float sm = data[d];
+    for (int i = 1; i < numSamples; ++i) {
+      sm = std::max(sm, data[i * dim + d]);
+    }
+    result[d] = sm;
+  }
+}
 #endif
 
-#ifdef __SSE__
-#define SIMD_INVOKE(func, ...) func##_sse(__VA_ARGS__)
-#elif __AVX__
+#if defined(__AVX__)
 #define SIMD_INVOKE(func, ...) func##_avx(__VA_ARGS__)
+#elif defined(__SSE3__)
+#define SIMD_INVOKE(func, ...) func##_sse(__VA_ARGS__)
 #endif
 namespace paddle {
 namespace simd {
 namespace internal {
 
-#ifdef __SSE__
+#ifdef __SSE3__
 void addToImpl(float* a, const float* b, size_t len) {
   SIMD_INVOKE(addto, a, b, len);
 }
@@ -390,8 +390,8 @@ void decayL1AvxImpl(
     float* dst, float* src, float* lr, float lambda, size_t len) {
   decayL1_avx(dst, src, lr, lambda, len);
 }
 #endif
 
 } // namespace internal
 } // namespace simd
 } // namespace paddle
@@ -128,7 +128,7 @@ void decayL1AvxImpl(
 
 template <>
 inline void addTo(float* a, const float* b, size_t len) {
-#ifdef __SSE__
+#ifdef __SSE3__
   internal::addToImpl(a, b, len);
 #else
   naive::addTo(a, b, len);
@@ -137,7 +137,7 @@ inline void addTo(float* a, const float* b, size_t len) {
 
 template <>
 inline void batchAddTo(float* a, const float* b[], int batch, size_t len) {
-#ifdef __SSE__
+#ifdef __SSE3__
   internal::batchAddToImpl(a, b, batch, len);
 #else
   naive::batchAddTo(a, b, batch, len);
@@ -146,7 +146,7 @@ inline void batchAddTo(float* a, const float* b[], int batch, size_t len) {
 
 template <>
 inline void colMax(float* result, const float* data, int dim, int numSamples) {
-#ifdef __SSE__
+#ifdef __SSE3__
   internal::colMaxImpl(result, data, dim, numSamples);
 #else
   naive::colMax(result, data, dim, numSamples);
...
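The source changes in SIMDFunctions.cpp and SIMDFunctions.h mirror the CMake side: the AVX implementations are now preferred whenever __AVX__ is predefined (which GCC and Clang do for -mavx), with the hand-written SSE code kept as the __SSE3__ fallback (predefined for -msse3), instead of the old __SSE__-first ordering. A minimal configure-time sketch of the macro/flag relationship the new guards rely on; the flag value and the result variable name are illustrative, not part of the commit:

include(CheckCXXSourceCompiles)

set(CMAKE_REQUIRED_FLAGS "-msse3")              # assumed SSE3_FLAG value for gcc/clang
check_cxx_source_compiles("
#ifndef __SSE3__
#error __SSE3__ is not predefined by -msse3
#endif
int main() { return 0; }" SSE3_MACRO_PREDEFINED)
set(CMAKE_REQUIRED_FLAGS "")                    # reset for later checks, as in simd.cmake above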