diff --git a/paddle/cuda/include/hl_cpu_scalar.cuh b/paddle/cuda/include/hl_cpu_scalar.cuh index cddf08ce6b615dbb62885daf29c8c378892d40bf..93043cd4bc030ef525d5bcf8d83196f2ce92eec6 100644 --- a/paddle/cuda/include/hl_cpu_scalar.cuh +++ b/paddle/cuda/include/hl_cpu_scalar.cuh @@ -40,30 +40,6 @@ INLINE real hl_vec_set(const real r) { return r; } -INLINE real hl_vec_max(const real a, const real b) { - return a > b ? a : b; -} - -INLINE real hl_vec_min(const real a, const real b) { - return a > b ? b : a; -} - -INLINE real hl_vec_add(const real a, const real b) { - return a + b; -} - -INLINE real hl_vec_sub(const real a, const real b) { - return a - b; -} - -INLINE real hl_vec_mul(const real a, const real b) { - return a * b; -} - -INLINE real hl_vec_div(const real a, const real b) { - return a / b; -} - INLINE real hl_vec_classification_error(const real a, const real b, const real p, diff --git a/paddle/cuda/include/hl_cpu_simd_neon.cuh b/paddle/cuda/include/hl_cpu_simd_neon.cuh index 9ff360c576fe1f948999144e6fcba8c111d29008..0b1cf4abdc4d5ef2a640c75587308f7f082b854b 100644 --- a/paddle/cuda/include/hl_cpu_simd_neon.cuh +++ b/paddle/cuda/include/hl_cpu_simd_neon.cuh @@ -44,31 +44,6 @@ inline float32x4_t hl_vec_set(const real f) { return vdupq_n_f32(f); } -inline float32x4_t hl_vec_max(const float32x4_t a, const float32x4_t b) { - return vmaxq_f32(a, b); -} - -inline float32x4_t hl_vec_min(const float32x4_t a, const float32x4_t b) { - return vminq_f32(a, b); -} - -inline float32x4_t hl_vec_add(const float32x4_t a, const float32x4_t b) { - return vaddq_f32(a, b); -} - -inline float32x4_t hl_vec_sub(const float32x4_t a, const float32x4_t b) { - return vsubq_f32(a, b); -} - -inline float32x4_t hl_vec_mul(const float32x4_t a, const float32x4_t b) { - return vmulq_f32(a, b); -} - -inline float32x4_t hl_vec_div(const float32x4_t a, const float32x4_t b) { - float32x4_t tmp = vrecpeq_f32(b); - return vmulq_f32(a, tmp); -} - inline float32x4_t hl_vec_classification_error(const 
float32x4_t a, const float32x4_t b, const float32x4_t p, diff --git a/paddle/cuda/include/hl_cpu_simd_sse.cuh b/paddle/cuda/include/hl_cpu_simd_sse.cuh index 9a918770b14d0c106b89bfbc82ef5f7feec17b8c..a104b626220f473324fc3c99e7cd305c3e86f3db 100644 --- a/paddle/cuda/include/hl_cpu_simd_sse.cuh +++ b/paddle/cuda/include/hl_cpu_simd_sse.cuh @@ -45,30 +45,6 @@ inline __m128 hl_vec_set(const real f) { return _mm_set_ps1(f); } -inline __m128 hl_vec_max(const __m128 a, const __m128 b) { - return _mm_max_ps(a, b); -} - -inline __m128 hl_vec_min(const __m128 a, const __m128 b) { - return _mm_min_ps(a, b); -} - -inline __m128 hl_vec_add(const __m128 a, const __m128 b) { - return _mm_add_ps(a, b); -} - -inline __m128 hl_vec_sub(const __m128 a, const __m128 b) { - return _mm_sub_ps(a, b); -} - -inline __m128 hl_vec_mul(const __m128 a, const __m128 b) { - return _mm_mul_ps(a, b); -} - -inline __m128 hl_vec_div(const __m128 a, const __m128 b) { - return _mm_div_ps(a, b); -} - inline __m128 hl_vec_classification_error(const __m128 a, const __m128 b, const __m128 p, @@ -103,30 +79,6 @@ inline __m128d hl_vec_set(const real d) { #endif } -inline __m128d hl_vec_max(const __m128d a, const __m128d b) { - return _mm_max_pd(a, b); -} - -inline __m128d hl_vec_min(const __m128d a, const __m128d b) { - return _mm_min_pd(a, b); -} - -inline __m128d hl_vec_add(const __m128d a, const __m128d b) { - return _mm_add_pd(a, b); -} - -inline __m128d hl_vec_sub(const __m128d a, const __m128d b) { - return _mm_sub_pd(a, b); -} - -inline __m128d hl_vec_mul(const __m128d a, const __m128d b) { - return _mm_mul_pd(a, b); -} - -inline __m128d hl_vec_div(const __m128d a, const __m128d b) { - return _mm_div_pd(a, b); -} - inline __m128d hl_vec_classification_error(const __m128d a, const __m128d b, const __m128d p, diff --git a/paddle/cuda/include/hl_matrix_base_detail.cuh b/paddle/cuda/include/hl_matrix_base_detail.cuh index 50079ed53de7a7b6026284afb73b5335096c145b..de1fd17d524a486cc15af721731d2e815f17263e 
100644 --- a/paddle/cuda/include/hl_matrix_base_detail.cuh +++ b/paddle/cuda/include/hl_matrix_base_detail.cuh @@ -16,13 +16,14 @@ limitations under the License. */ #define HL_MATRIX_BASE_DETAIL_CUH_ #include "hl_matrix_type.cuh" +#include "hl_tensor_ops.h" namespace aggregate { class SSESum { public: static const bool sse = VECTOR_SIMD; INLINE vecType vecOp(const vecType a, const vecType b) const { - return hl_vec_add(a, b); + return hppl::binary::add()(a, b); } }; @@ -30,7 +31,7 @@ class SSEMax { public: static const bool sse = VECTOR_SIMD; INLINE vecType vecOp(const vecType a, const vecType b) const { - return hl_vec_max(a, b); + return hppl::binary::max()(a, b); } }; @@ -38,7 +39,7 @@ class SSEMin { public: static const bool sse = VECTOR_SIMD; INLINE vecType vecOp(const vecType a, const vecType b) const { - return hl_vec_min(a, b); + return hppl::binary::min()(a, b); } }; } // namespace aggregate @@ -59,7 +60,7 @@ class SSEAdd { public: static const bool sse = VECTOR_SIMD; INLINE vecType vecOp(const vecType a, const vecType b) const { - return hl_vec_add(a, b); + return hppl::binary::add()(a, b); } }; @@ -77,7 +78,7 @@ public: mp2 = hl_vec_set(p2); } INLINE vecType vecOp(const vecType a, const vecType b) const { - return hl_vec_add(hl_vec_mul(mp1, a), hl_vec_mul(mp2, b)); + return hppl::binary::add_scale(mp1, mp2)(a, b); } }; @@ -85,7 +86,7 @@ class SSESub { public: static const bool sse = VECTOR_SIMD; INLINE vecType vecOp(const vecType a, const vecType b) const { - return hl_vec_sub(a, b); + return hppl::binary::sub()(a, b); } }; @@ -93,7 +94,7 @@ class SSEMul { public: static const bool sse = VECTOR_SIMD; INLINE vecType vecOp(const vecType a, const vecType b) const { - return hl_vec_mul(a, b); + return hppl::binary::mul()(a, b); } }; @@ -101,7 +102,7 @@ class SSEDiv { public: static const bool sse = VECTOR_SIMD; INLINE vecType vecOp(const vecType a, const vecType b) const { - return hl_vec_div(a, b); + return hppl::binary::div()(a, b); } }; @@ -109,7 +110,8 
@@ class SSESquaredDiff { public: static const bool sse = VECTOR_SIMD; INLINE vecType vecOp(const vecType a, const vecType b) const { - return hl_vec_mul(hl_vec_sub(a, b), hl_vec_sub(a, b)); + vecType tmp = hppl::binary::sub()(a, b); + return hppl::binary::mul()(tmp, tmp); } }; diff --git a/paddle/cuda/include/hl_matrix_type.cuh b/paddle/cuda/include/hl_matrix_type.cuh index 12c717b612da907ac7aedf3e7787bdbb96948be2..e18235219bd9f78dd87a92d448cb290d9a4904a1 100644 --- a/paddle/cuda/include/hl_matrix_type.cuh +++ b/paddle/cuda/include/hl_matrix_type.cuh @@ -38,10 +38,12 @@ typedef double2 vecType; #endif #elif defined(__SSE3__) #include "hl_cpu_simd_sse.cuh" +#define PADDLE_USE_SSE3 #elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && !defined(__NVCC__) // Currently nvcc does not support neon intrinsic. // TODO: Extract simd intrinsic implementation from .cu files. #include "hl_cpu_simd_neon.cuh" +#define PADDLE_USE_NEON #else #include "hl_cpu_scalar.cuh" #endif diff --git a/paddle/cuda/include/hl_tensor_ops.h b/paddle/cuda/include/hl_tensor_ops.h index 7945b98201b1812790fb0d53123e9ee007640485..523503f5fec52ad01cff1bf9bbc04f271b0e4867 100644 --- a/paddle/cuda/include/hl_tensor_ops.h +++ b/paddle/cuda/include/hl_tensor_ops.h @@ -328,6 +328,208 @@ public: INLINE T operator()(const T a, const T b) const { return a < b ? 
b : a;
  }
};

+// SIMD specializations of the hppl::binary functors. These replace the
+// former free functions hl_vec_add/sub/mul/div/min/max that used to live in
+// hl_cpu_simd_sse.cuh / hl_cpu_simd_neon.cuh, so that template code can
+// dispatch on the vector type directly.
+#ifdef PADDLE_USE_SSE3
+#ifndef PADDLE_TYPE_DOUBLE
+template <>
+class add<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_add_ps(a, b);
+  }
+};
+
+template <>
+class add_scale<__m128> {
+private:
+  const __m128 p1;
+  const __m128 p2;
+
+public:
+  // p1, p2 are broadcast scale factors; operator() computes p1*a + p2*b.
+  INLINE add_scale(const __m128 s1, const __m128 s2) : p1(s1), p2(s2) {}
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_add_ps(_mm_mul_ps(p1, a), _mm_mul_ps(p2, b));
+  }
+};
+
+template <>
+class sub<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_sub_ps(a, b);
+  }
+};
+
+template <>
+class mul<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_mul_ps(a, b);
+  }
+};
+
+template <>
+class div<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_div_ps(a, b);
+  }
+};
+
+template <>
+class min<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_min_ps(a, b);
+  }
+};
+
+template <>
+class max<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_max_ps(a, b);
+  }
+};
+#else
+template <>
+class add<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_add_pd(a, b);
+  }
+};
+
+template <>
+class add_scale<__m128d> {
+private:
+  const __m128d p1;
+  const __m128d p2;
+
+public:
+  // p1, p2 are broadcast scale factors; operator() computes p1*a + p2*b.
+  INLINE add_scale(const __m128d s1, const __m128d s2) : p1(s1), p2(s2) {}
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_add_pd(_mm_mul_pd(p1, a), _mm_mul_pd(p2, b));
+  }
+};
+
+template <>
+class sub<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_sub_pd(a, b);
+  }
+};
+
+template <>
+class mul<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_mul_pd(a, b);
+  }
+};
+
+template <>
+class div<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_div_pd(a, b);
+  }
+};
+
+template <>
+class min<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_min_pd(a, b);
+  }
+};
+
+template <>
+class max<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_max_pd(a, b);
+  }
+};
+#endif  // PADDLE_TYPE_DOUBLE
+#endif  // PADDLE_USE_SSE3
+
+#ifdef PADDLE_USE_NEON
+#ifndef PADDLE_TYPE_DOUBLE
+// NOTE: explicit specializations must name the specialized type, i.e.
+// `class add<float32x4_t>`, not `class add`.
+template <>
+class add<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    // Bug fix: addition must use vaddq_f32 (previous draft used vmulq_f32,
+    // which multiplies).
+    return vaddq_f32(a, b);
+  }
+};
+
+template <>
+class add_scale<float32x4_t> {
+private:
+  const float32x4_t p1;
+  const float32x4_t p2;
+
+public:
+  // p1, p2 are broadcast scale factors; operator() computes p1*a + p2*b.
+  INLINE add_scale(const float32x4_t s1, const float32x4_t s2)
+      : p1(s1), p2(s2) {}
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vaddq_f32(vmulq_f32(p1, a), vmulq_f32(p2, b));
+  }
+};
+
+template <>
+class sub<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vsubq_f32(a, b);
+  }
+};
+
+template <>
+class mul<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vmulq_f32(a, b);
+  }
+};
+
+template <>
+class div<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    // NOTE(review): vrecpeq_f32 is only a reciprocal *estimate* (~8 bits of
+    // precision). Kept as-is to match the hl_vec_div it replaces; refine with
+    // a vrecpsq_f32 Newton-Raphson step if full precision is required.
+    float32x4_t tmp = vrecpeq_f32(b);
+    return vmulq_f32(a, tmp);
+  }
+};
+
+template <>
+class min<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vminq_f32(a, b);
+  }
+};
+
+template <>
+class max<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vmaxq_f32(a, b);
+  }
+};
+#else
+#error To be implemented
+#endif  // PADDLE_TYPE_DOUBLE
+#endif  // PADDLE_USE_NEON
+
 }  // namespace binary
 }  // namespace hppl