Merge pull request #2299 from Xreki/support_scalar_kernels

Centralize the use of simd intrinsic and implement scalar kernels.

Merge pull request #2299 from Xreki/support_scalar_kernels
Centralize the use of simd intrinsic and implement scalar kernels.
e36e24d4 · Yiqun Liu · GitHub · 4537b7bc · 7fb0684a · e36e24d4
11 changed file
--- a/paddle/cuda/include/hl_cpu_matrix_kernel.cuh
+++ b/paddle/cuda/include/hl_cpu_matrix_kernel.cuh
@@ -17,10 +17,9 @@ limitations under the License. */
 #include <stdio.h>
 #include "hl_base.h"
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-#include "hl_neon_matrix_kernel.cuh"
+#ifndef __CUDA_ARCH__
-#else
+#include "hl_cpu_matrix_kernel_detail.cuh"
-#include "hl_sse_matrix_kernel.cuh"
 #endif
 /**
@@ -114,35 +113,6 @@ void hl_cpu_apply_quaternary_op(Op op,
  }
 }
-template <class Agg, class Op, class Saver>
-void hl_matrix_row_op(Agg agg, Op op, Saver sv,
-                      int dimM, int dimN,
-                      real *dst, int ld,
-                      real *A, int lda) {
-  for (int i = 0; i < dimM; i++) {
-    real tmp = agg.init();
-    for (int j = 0; j < dimN; j++) {
-        tmp = agg(tmp, op(A[i * lda + j]));
-    }
-    dst[i*ld] = sv(dst[i*ld], tmp);
-  }
-}
-template <class Agg, class Op, class Saver>
-void hl_matrix_row_op(Agg agg, Op op, Saver sv,
-                      int dimM, int dimN,
-                      real *dst, int ld,
-                      real *A, int lda,
-                      real *B, int ldb) {
-  for (int i = 0; i < dimM; i++) {
-    real tmp = agg.init();
-    for (int j = 0; j < dimN; j++) {
-        tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
-    }
-    dst[i*ld] = sv(dst[i*ld], tmp);
-  }
-}
 template <class Agg, class Op, class Saver>
 void hl_cpu_matrix_row_op(Agg agg, Op op, Saver sv,
                          int dimM, int dimN,

--- a/paddle/cuda/include/hl_sse_matrix_kernel.cuh
+++ b/paddle/cuda/include/hl_sse_matrix_kernel.cuh
@@ -13,26 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
-#ifndef HL_SSE_MATRIX_KERNEL_CUH_
+#ifndef HL_MATRIX_KERNEL_DETAIL_CUH_
-#define HL_SSE_MATRIX_KERNEL_CUH_
+#define HL_MATRIX_KERNEL_DETAIL_CUH_
 #include "hl_matrix_type.cuh"
-#define VECTOR_SIZE     16
-#ifndef PADDLE_TYPE_DOUBLE
-/* number of float in vector */
-#define     VECTOR_LEN      4
-#define     VECTOR_SET      _mm_set_ps1
-#else
-#if   defined(__APPLE__) || defined(__OSX__)
-#define     _mm_set_pd1     _mm_set1_pd
-#endif
-/* number of double in vector */
-#define     VECTOR_LEN      2
-#define     VECTOR_SET      _mm_set_pd1
-#endif
 inline bool hl_check_align(size_t size) {
  return !(size & (VECTOR_SIZE - 1));
 }
@@ -41,27 +26,63 @@ inline bool hl_check_align(void *ptr) {
  return hl_check_align(reinterpret_cast<size_t>(ptr));
 }
-#ifndef PADDLE_TYPE_DOUBLE
+template <class Agg, class Op, class Saver>
-template <class Agg>
+void hl_matrix_row_op(Agg agg, Op op, Saver sv,
-inline real hl_agg_op(Agg agg, vecType mm) {
+                      int dimM, int dimN,
-  __m128 lo = _mm_unpacklo_ps(mm, mm);
+                      real *dst, int ld,
-  __m128 hi = _mm_unpackhi_ps(mm, mm);
+                      real *A, int lda) {
-  __m128 tmp1 = agg.vecOp(lo, hi);
+  for (int i = 0; i < dimM; i++) {
-  __m128 tmp2 = _mm_movehl_ps(tmp1, tmp1);
+    real tmp = agg.init();
-  __m128 ret = agg.vecOp(tmp1, tmp2);
+    for (int j = 0; j < dimN; j++) {
+        tmp = agg(tmp, op(A[i * lda + j]));
+    }
+    dst[i*ld] = sv(dst[i*ld], tmp);
+  }
+}
-  return _mm_cvtss_f32(ret);
+template <class Agg, class Op, class Saver>
+void hl_matrix_row_op(Agg agg, Op op, Saver sv,
+                      int dimM, int dimN,
+                      real *dst, int ld,
+                      real *A, int lda,
+                      real *B, int ldb) {
+  for (int i = 0; i < dimM; i++) {
+    real tmp = agg.init();
+    for (int j = 0; j < dimN; j++) {
+        tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
+    }
+    dst[i*ld] = sv(dst[i*ld], tmp);
+  }
 }
-#else
-template <class Agg>
+template <class Agg, class Op, class Saver>
-inline real hl_agg_op(Agg agg, vecType mm) {
+void hl_matrix_column_op(Agg agg, Op op, Saver sv,
-  __m128d lo = _mm_unpacklo_pd(mm, mm);
+                         int dimM, int dimN,
-  __m128d hi = _mm_unpackhi_pd(mm, mm);
+                         real *dst,
-  __m128d ret = agg.vecOp(lo, hi);
+                         real *A, int lda) {
+  for (int j = 0; j < dimN; j++) {
-  return _mm_cvtsd_f64(ret);
+    real tmp = agg.init();
+    for (int i = 0; i < dimM; i++) {
+        tmp = agg(tmp, op(A[i * lda + j]));
+    }
+    dst[j] = sv(dst[j], tmp);
+  }
+}
+template <class Agg, class Op, class Saver>
+void hl_matrix_column_op(Agg agg, Op op, Saver sv,
+                         int dimM, int dimN,
+                         real *dst,
+                         real *A, int lda,
+                         real *B, int ldb) {
+  for (int j = 0; j < dimN; j++) {
+    real tmp = agg.init();
+    for (int i = 0; i < dimM; i++) {
+        tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
+    }
+    dst[j] = sv(dst[j], tmp);
+  }
 }
-#endif
 template <class Agg, class Op, class Saver>
 void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
@@ -118,35 +139,6 @@ void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
  }
 }
-template <class Agg, class Op, class Saver>
-void hl_matrix_column_op(Agg agg, Op op, Saver sv,
-                         int dimM, int dimN,
-                         real *dst,
-                         real *A, int lda) {
-  for (int j = 0; j < dimN; j++) {
-    real tmp = agg.init();
-    for (int i = 0; i < dimM; i++) {
-        tmp = agg(tmp, op(A[i * lda + j]));
-    }
-    dst[j] = sv(dst[j], tmp);
-  }
-}
-template <class Agg, class Op, class Saver>
-void hl_matrix_column_op(Agg agg, Op op, Saver sv,
-                         int dimM, int dimN,
-                         real *dst,
-                         real *A, int lda,
-                         real *B, int ldb) {
-  for (int j = 0; j < dimN; j++) {
-    real tmp = agg.init();
-    for (int i = 0; i < dimM; i++) {
-        tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
-    }
-    dst[j] = sv(dst[j], tmp);
-  }
-}
 /*
 * MaxRow greater than or equal dimN
 * dimN is multiples of VECTOR_LEN
@@ -315,4 +307,4 @@ void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
  }
 }
-#endif /* HL_SSE_MATRIX_KERNEL_CUH_ */
+#endif /* HL_MATRIX_KERNEL_DETAIL_CUH_ */
--- a/paddle/cuda/include/hl_cpu_scalar.cuh
+++ b/paddle/cuda/include/hl_cpu_scalar.cuh
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifndef HL_CPU_SCALAR_CUH_
+#define HL_CPU_SCALAR_CUH_
+#define VECTOR_SIMD false
+#define VECTOR_SET  hl_vec_set
+#ifndef PADDLE_TYPE_DOUBLE
+/* size of float */
+#define VECTOR_SIZE 4
+#else
+/* size of double */
+#define VECTOR_SIZE 8
+#endif
+typedef real vecType;
+/* Consider a real as a vector */
+#define VECTOR_LEN  1
+template <class Agg>
+inline real hl_agg_op(Agg agg, vecType mm) {
+  return mm;
+}
+INLINE real hl_vec_set(const real r) {
+  return r;
+}
+INLINE real hl_vec_classification_error(const real a,
+                                        const real b,
+                                        const real p,
+                                        const real r) {
+  return ((a > p) == (b > p)) ? 0.0f : 1.0f;
+}
+#endif  // HL_CPU_SCALAR_CUH_
--- a/paddle/cuda/include/hl_cpu_simd_neon.cuh
+++ b/paddle/cuda/include/hl_cpu_simd_neon.cuh
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifndef HL_CPU_SIMD_NEON_CUH_
+#define HL_CPU_SIMD_NEON_CUH_
+#include <arm_neon.h>
+#define VECTOR_SIMD true
+#define VECTOR_SIZE 16
+#define VECTOR_SET  hl_vec_set
+#ifndef PADDLE_TYPE_DOUBLE
+typedef float32x4_t vecType;
+/* number of float in vector */
+#define VECTOR_LEN  4
+template <class Agg>
+inline real hl_agg_op(Agg agg, vecType mm) {
+  float32x4_t rev = vrev64q_f32(mm);
+  float32x4_t tmp1 = agg.vecOp(rev, rev);
+  float32x2_t lo = vget_high_f32(rev);
+  float32x2_t hi = vget_low_f32(rev);
+  float32x4_t tmp2 = vcombine_f32(hi, lo);
+  float32x4_t ret = agg.vecOp(tmp1, tmp2);
+  return vgetq_lane_f32(ret, 0);
+}
+inline float32x4_t hl_vec_set(const real f) {
+  return vdupq_n_f32(f);
+}
+inline float32x4_t hl_vec_classification_error(const float32x4_t a,
+                                               const float32x4_t b,
+                                               const float32x4_t p,
+                                               const float32x4_t r) {
+  uint32x4_t tmp1 = vcgtq_f32(a, p);
+  uint32x4_t tmp2 = vcgtq_f32(b, p);
+  uint32x4_t tmp3 = veorq_u32(tmp1, tmp2);
+  return vcvtq_f32_u32(vandq_u32(tmp3, vcvtq_u32_f32(r)));
+}
+#else
+#ifdef __aarch64__
+typedef float64x2_t vecType;
+/* number of float in vector */
+#define VECTOR_LEN  2
+#define VECTOR_SET  vdupq_n_f64
+#error To be implemented
+#else
+#error NEON instructions does not support double precision
+#endif  // __aarch64__
+#endif
+#endif  // HL_CPU_SIMD_NEON_CUH_
--- a/paddle/cuda/include/hl_cpu_simd_sse.cuh
+++ b/paddle/cuda/include/hl_cpu_simd_sse.cuh
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifndef HL_CPU_SIMD_SSE_CUH_
+#define HL_CPU_SIMD_SSE_CUH_
+#include <mmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#define VECTOR_SIMD true
+#define VECTOR_SIZE 16
+#define VECTOR_SET  hl_vec_set
+#ifndef PADDLE_TYPE_DOUBLE
+typedef __m128  vecType;
+/* number of float in vector */
+#define VECTOR_LEN  4
+template <class Agg>
+inline real hl_agg_op(Agg agg, vecType mm) {
+  __m128 lo = _mm_unpacklo_ps(mm, mm);
+  __m128 hi = _mm_unpackhi_ps(mm, mm);
+  __m128 tmp1 = agg.vecOp(lo, hi);
+  __m128 tmp2 = _mm_movehl_ps(tmp1, tmp1);
+  __m128 ret = agg.vecOp(tmp1, tmp2);
+  return _mm_cvtss_f32(ret);
+}
+inline __m128 hl_vec_set(const real f) {
+  return _mm_set_ps1(f);
+}
+inline __m128 hl_vec_classification_error(const __m128 a,
+                                          const __m128 b,
+                                          const __m128 p,
+                                          const __m128 r) {
+  __m128 tmp1 = _mm_cmpgt_ps(a, p);
+  __m128 tmp2 = _mm_cmpgt_ps(b, p);
+  __m128 tmp3 = _mm_xor_ps(tmp1, tmp2);
+  return _mm_and_ps(tmp3, r);
+}
+#else
+typedef __m128d vecType;
+/* number of double in vector */
+#define VECTOR_LEN  2
+template <class Agg>
+inline real hl_agg_op(Agg agg, vecType mm) {
+  __m128d lo = _mm_unpacklo_pd(mm, mm);
+  __m128d hi = _mm_unpackhi_pd(mm, mm);
+  __m128d ret = agg.vecOp(lo, hi);
+  return _mm_cvtsd_f64(ret);
+}
+inline __m128d hl_vec_set(const real d) {
+#if defined(__APPLE__) || defined(__OSX__)
+  return _mm_set1_pd(d);
+#else
+  return _mm_set_pd1(d);
+#endif
+}
+inline __m128d hl_vec_classification_error(const __m128d a,
+                                           const __m128d b,
+                                           const __m128d p,
+                                           const __m128d r) {
+  __m128d tmp1 = _mm_cmpgt_pd(a, p);
+  __m128d tmp2 = _mm_cmpgt_pd(b, p);
+  __m128d tmp3 = _mm_xor_pd(tmp1, tmp2);
+  return _mm_and_pd(tmp3, r);
+}
+#endif
+#endif  // HL_CPU_SIMD_SSE_CUH_
--- a/paddle/cuda/include/hl_matrix_base.cuh
+++ b/paddle/cuda/include/hl_matrix_base.cuh
@@ -18,26 +18,6 @@ limitations under the License. */
 #include "hl_matrix_type.cuh"
-#ifdef __CUDA_ARCH__
-/**
- * CUDA kernel inline function
- */
-#define INLINE   __device__ inline
-#else
-/**
- * CPP inline function
- */
-#define INLINE   inline
-#endif
-#ifndef PADDLE_TYPE_DOUBLE
-#define     DEVICE_FMAX     fmaxf
-#define     DEVICE_FMIN     fminf
-#else
-#define     DEVICE_FMAX     fmax
-#define     DEVICE_FMIN     fmin
-#endif
 class BaseOp {
 public:
  static const bool sse = false;
@@ -66,10 +46,8 @@ typedef BaseOp SSESquaredDiff;
 typedef BaseOp SSEFirst;
 typedef BaseOp SSESecond;
 typedef BaseOp SSEClassificationError;
-#elif defined(__ARM__NEON__) || defined(__ARM_NEON)
-#include "hl_matrix_base_neon.cuh"
 #else
-#include "hl_matrix_base_sse.cuh"
+#include "hl_matrix_base_detail.cuh"
 #endif
 namespace aggregate {
@@ -124,7 +102,7 @@ public:
  add2(const real s1, const real s2)
    : SSEAdd2(s1, s2), p1(s1), p2(s2) {}
  INLINE real operator()(const real a, const real b) const {
-     return p1 * a + p2 * b;
+    return p1 * a + p2 * b;
  }
 };

--- a/paddle/cuda/include/hl_matrix_base_neon.cuh
+++ b/paddle/cuda/include/hl_matrix_base_neon.cuh
@@ -12,32 +12,34 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#ifndef HL_MATRIX_BASE_DETAIL_CUH_
+#define HL_MATRIX_BASE_DETAIL_CUH_
-#ifndef HL_MATRIX_BASE_NEON_CUH_
+#include "hl_matrix_type.cuh"
-#define HL_MATRIX_BASE_NEON_CUH_
+#include "hl_tensor_ops.h"
 namespace aggregate {
 class SSESum {
 public:
-  static const bool sse = true;
+  static const bool sse = VECTOR_SIMD;
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return vaddq_f32(a, b);
+    return hppl::binary::add<vecType>()(a, b);
  }
 };
 class SSEMax {
 public:
-  static const bool sse = true;
+  static const bool sse = VECTOR_SIMD;
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return vmaxq_f32(a, b);
+    return hppl::binary::max<vecType>()(a, b);
  }
 };
 class SSEMin {
 public:
-  static const bool sse = true;
+  static const bool sse = VECTOR_SIMD;
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return vminq_f32(a, b);
+    return hppl::binary::min<vecType>()(a, b);
  }
 };
 }  // namespace aggregate
@@ -46,8 +48,8 @@ namespace base {
 namespace unary {
 class SSEIdentity {
 public:
-  static const bool sse = true;
+  static const bool sse = VECTOR_SIMD;
-  INLINE float32x4_t vecOp(const float32x4_t a) const {
+  INLINE vecType vecOp(const vecType a) const {
    return a;
  }
 };
@@ -56,106 +58,96 @@ public:
 namespace binary {
 class SSEAdd {
 public:
-  static const bool sse = true;
+  static const bool sse = VECTOR_SIMD;
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return vaddq_f32(a, b);
+    return hppl::binary::add<vecType>()(a, b);
  }
 };
 class SSEAdd2 {
 public:
-  static const bool sse = true;
+  static const bool sse = VECTOR_SIMD;
  const real p1;
  const real p2;
-  float32x4_t mp1;
+  vecType mp1;
-  float32x4_t mp2;
+  vecType mp2;
 public:
  SSEAdd2(const real s1, const real s2) : p1(s1), p2(s2) {
-    mp1 = vdupq_n_f32(p1);
+    mp1 = hl_vec_set(p1);
-    mp2 = vdupq_n_f32(p2);
+    mp2 = hl_vec_set(p2);
  }
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    float32x4_t tmp1, tmp2;
+    return hppl::binary::add_scale<vecType>(mp1, mp2)(a, b);
-    tmp1 = vmulq_f32(mp1, a);
-    tmp2 = vmulq_f32(mp2, b);
-    return vaddq_f32(tmp1, tmp2);
  }
 };
 class SSESub {
 public:
-  static const bool sse = true;
+  static const bool sse = VECTOR_SIMD;
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return vsubq_f32(a, b);
+    return hppl::binary::sub<vecType>()(a, b);
  }
 };
 class SSEMul {
 public:
-  static const bool sse = true;
+  static const bool sse = VECTOR_SIMD;
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return vmulq_f32(a, b);
+    return hppl::binary::mul<vecType>()(a, b);
  }
 };
 class SSEDiv {
 public:
-  static const bool sse = true;
+  static const bool sse = VECTOR_SIMD;
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    float32x4_t tmp;
+    return hppl::binary::div<vecType>()(a, b);
-    tmp = vrecpeq_f32(b);
-    return vmulq_f32(a, tmp);
  }
 };
 class SSESquaredDiff {
 public:
-  static const bool sse = true;
+  static const bool sse = VECTOR_SIMD;
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    float32x4_t tmp;
+    vecType tmp = hppl::binary::sub<vecType>()(a, b);
-    tmp = vsubq_f32(a, b);
+    return hppl::binary::mul<vecType>()(tmp, tmp);
-    return vmulq_f32(tmp, tmp);
  }
 };
 class SSEFirst {
 public:
-  static const bool sse = true;
+  static const bool sse = VECTOR_SIMD;
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
    return a;
  }
 };
 class SSESecond {
 public:
-  static const bool sse = true;
+  static const bool sse = VECTOR_SIMD;
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
    return b;
  }
 };
 class SSEClassificationError {
 public:
-  static const bool sse = true;
+  static const bool sse = VECTOR_SIMD;
  const real p;
-  float32x4_t mp;
+  vecType mp;
-  uint32x4_t result;
+  vecType result;
 public:
  explicit SSEClassificationError(const real s) : p(s) {
-    mp = vdupq_n_f32(p);
+    mp = hl_vec_set(p);
-    result = vdupq_n_u32(1);
+    result = hl_vec_set(1.0f);
  }
-  // TODO: to be check
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+    return hl_vec_classification_error(a, b, mp, result);
-    uint32x4_t tmp1 = vcgtq_f32(a, mp);
-    uint32x4_t tmp2 = vcgtq_f32(b, mp);
-    uint32x4_t tmp3 = veorq_u32(tmp1, tmp2);
-    return vcvtq_f32_u32(vandq_u32(tmp3, result));
  }
 };
 }  // namespace binary
 }  // namespace base
-#endif /* HL_MATRIX_BASE_NEON_CUH_ */
+#endif /* HL_MATRIX_BASE_DETAIL_CUH_ */
--- a/paddle/cuda/include/hl_matrix_base_sse.cuh
+++ b/paddle/cuda/include/hl_matrix_base_sse.cuh
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifndef HL_MATRIX_BASE_SSE_CUH_
-#define HL_MATRIX_BASE_SSE_CUH_
-namespace aggregate {
-class SSESum {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    return _mm_add_ps(a, b);
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    return _mm_add_pd(a, b);
-  }
-};
-class SSEMax {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    return _mm_max_ps(a, b);
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    return _mm_max_pd(a, b);
-  }
-};
-class SSEMin {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    return _mm_min_ps(a, b);
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    return _mm_min_pd(a, b);
-  }
-};
-}  // namespace aggregate
-namespace base {
-namespace unary {
-class SSEIdentity {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a) const {
-    return a;
-  }
-  INLINE __m128d vecOp(const __m128d a) const {
-    return a;
-  }
-};
-}  // namespace unary
-namespace binary {
-class SSEAdd {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    return _mm_add_ps(a, b);
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    return _mm_add_pd(a, b);
-  }
-};
-class SSEAdd2 {
-public:
-  static const bool sse = true;
-  const real p1;
-  const real p2;
-  union {__m128 f; __m128d d;} mp1;
-  union {__m128 f; __m128d d;} mp2;
-public:
-  SSEAdd2(const real s1, const real s2) : p1(s1), p2(s2) {
-    if (sizeof(real) == sizeof(float)) {
-      mp1.f = _mm_set1_ps(p1);
-      mp2.f = _mm_set1_ps(p2);
-    } else {
-      mp1.d = _mm_set1_pd(p1);
-      mp2.d = _mm_set1_pd(p2);
-    }
-  }
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    __m128 tmp1, tmp2;
-    tmp1 = _mm_mul_ps(mp1.f, a);
-    tmp2 = _mm_mul_ps(mp2.f, b);
-    return _mm_add_ps(tmp1, tmp2);
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    __m128d tmp1, tmp2;
-    tmp1 = _mm_mul_pd(mp1.d, a);
-    tmp2 = _mm_mul_pd(mp2.d, b);
-    return _mm_add_pd(tmp1, tmp2);
-  }
-};
-class SSESub {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    return _mm_sub_ps(a, b);
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    return _mm_sub_pd(a, b);
-  }
-};
-class SSEMul {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    return _mm_mul_ps(a, b);
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    return _mm_mul_pd(a, b);
-  }
-};
-class SSEDiv {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    return _mm_div_ps(a, b);
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    return _mm_div_pd(a, b);
-  }
-};
-class SSESquaredDiff {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    return _mm_mul_ps(_mm_sub_ps(a, b), _mm_sub_ps(a, b));
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    return _mm_mul_pd(_mm_sub_pd(a, b), _mm_sub_pd(a, b));
-  }
-};
-class SSEFirst {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    return a;
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    return a;
-  }
-};
-class SSESecond {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    return b;
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    return b;
-  }
-};
-class SSEClassificationError {
-public:
-  static const bool sse = true;
-  const real p;
-  union {__m128 f; __m128d d;} mp;
-  union {__m128 f; __m128d d;} result;
-public:
-  explicit SSEClassificationError(const real s) : p(s) {
-    if (sizeof(real) == sizeof(float)) {
-      mp.f = _mm_set1_ps(p);
-      result.f = _mm_set1_ps(1.0f);
-    } else {
-      mp.d = _mm_set1_pd(p);
-      result.d = _mm_set1_pd(1.0);
-    }
-  }
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    __m128 tmp1 = _mm_cmpgt_ps(a, mp.f);
-    __m128 tmp2 = _mm_cmpgt_ps(b, mp.f);
-    __m128 tmp3 = _mm_xor_ps(tmp1, tmp2);
-    return _mm_and_ps(tmp3, result.f);
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    __m128d tmp1 = _mm_cmpgt_pd(a, mp.d);
-    __m128d tmp2 = _mm_cmpgt_pd(b, mp.d);
-    __m128d tmp3 = _mm_xor_pd(tmp1, tmp2);
-    return _mm_and_pd(tmp3, result.d);
-  }
-};
-}  // namespace binary
-}  // namespace base
-#endif /* HL_MATRIX_BASE_SSE_CUH_ */
--- a/paddle/cuda/include/hl_matrix_type.cuh
+++ b/paddle/cuda/include/hl_matrix_type.cuh
@@ -17,35 +17,35 @@ limitations under the License. */
 #include "hl_base.h"
-#if defined(__CUDA_ARCH__)
+#ifdef __CUDA_ARCH__
+/**
+ * CUDA kernel inline function
+ */
+#define INLINE   __device__ inline
+#else
+/**
+ * CPP inline function
+ */
+#define INLINE   inline
+#endif
+#ifdef __CUDA_ARCH__
 #include <vector_types.h>
 #ifndef PADDLE_TYPE_DOUBLE
 typedef float4 vecType;
 #else
 typedef double2 vecType;
 #endif
-#elif (defined  __ARM_NEON) || (defined __ARM_NEON__)
+#elif defined(__SSE3__)
-#include <arm_neon.h>
+#include "hl_cpu_simd_sse.cuh"
-#ifndef PADDLE_TYPE_DOUBLE
+#define PADDLE_USE_SSE3
-typedef float32x4_t  vecType;
+#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && !defined(__NVCC__)
-#else
+// Currently nvcc does not support neon intrinsic.
-#error NEON instructions does not support double precision
+// TODO: Extract simd intrinsic implementation from .cu files.
-#endif
+#include "hl_cpu_simd_neon.cuh"
+#define PADDLE_USE_NEON
 #else
-#include <mmintrin.h>
+#include "hl_cpu_scalar.cuh"
-#include <xmmintrin.h>
-#include <emmintrin.h>
-#ifndef PADDLE_TYPE_DOUBLE
-typedef __m128  vecType;
-#else
-typedef __m128d vecType;
-#endif
-#endif
-#ifdef __CUDA_ARCH__
-#define INLINE   __device__ inline
-#else
-#define INLINE   inline
 #endif
 #endif  // HL_MATRIX_TYPE_CUH_
--- a/paddle/cuda/include/hl_neon_matrix_kernel.cuh
+++ b/paddle/cuda/include/hl_neon_matrix_kernel.cuh
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifndef HL_NEON_MATRIX_KERNEL_CUH_
-#define HL_NEON_MATRIX_KERNEL_CUH_
-#include "hl_matrix_type.cuh"
-#define VECTOR_SIZE     16
-/* number of float in vector */
-#define     VECTOR_LEN      4
-#define     VECTOR_SET      vdupq_n_f32
-inline bool hl_check_align(size_t size) {
-  return !(size & (VECTOR_SIZE - 1));
-}
-inline bool hl_check_align(void *ptr) {
-  return hl_check_align(reinterpret_cast<size_t>(ptr));
-}
-template <class Agg>
-inline real hl_agg_op(Agg agg, vecType mm) {
-  float32x4_t rev = vrev64q_f32(mm);
-  float32x4_t tmp1 = agg.vecOp(rev, rev);
-  float32x2_t lo = vget_high_f32(rev);
-  float32x2_t hi = vget_low_f32(rev);
-  float32x4_t tmp2 = vcombine_f32(hi, lo);
-  float32x4_t ret = agg.vecOp(tmp1, tmp2);
-  return vgetq_lane_f32(ret, 0);
-}
-template <class Agg, class Op, class Saver>
-void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
-                          int dimM, int dimN,
-                          real *dst, int ld,
-                          real *A, int lda) {
-  for (int i = 0; i < dimM; i++, A += lda) {
-    vecType mm = VECTOR_SET(agg.init());
-    vecType *a = (vecType*)(A);
-    for (int j = 0; j < dimN / VECTOR_LEN; j++, a++) {
-      mm = agg.vecOp(mm, op.vecOp(*a));
-    }
-    int rem = dimN % VECTOR_LEN;
-    if (rem) {
-      real tmp = hl_agg_op(agg, mm);
-      real *a = A + (dimN / VECTOR_LEN) * VECTOR_LEN;
-      for (int j = 0; j < rem; j++) {
-        tmp = agg(tmp, op(a[j]));
-      }
-      dst[i*ld] = sv(dst[i*ld], tmp);
-    } else {
-      dst[i*ld] = sv(dst[i*ld], hl_agg_op(agg, mm));
-    }
-  }
-}
-template <class Agg, class Op, class Saver>
-void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
-                          int dimM, int dimN,
-                          real *dst, int ld,
-                          real *A, int lda,
-                          real *B, int ldb) {
-  for (int i = 0; i < dimM; i++, A += lda, B += ldb) {
-    vecType mm = VECTOR_SET(agg.init());
-    vecType *a = (vecType*)(A);
-    vecType *b = (vecType*)(B);
-    for (int j = 0; j < dimN / VECTOR_LEN; j++, a++, b++) {
-        mm = agg.vecOp(mm, op.vecOp(*a, *b));
-    }
-    int rem = dimN % VECTOR_LEN;
-    if (rem) {
-      real tmp = hl_agg_op(agg, mm);
-      real *a = A + (dimN / VECTOR_LEN) * VECTOR_LEN;
-      real *b = B + (dimN / VECTOR_LEN) * VECTOR_LEN;
-      for (int j = 0; j < rem; j++) {
-          tmp = agg(tmp, op(a[j], b[j]));
-      }
-      dst[i*ld] = sv(dst[i*ld], tmp);
-    } else {
-        dst[i*ld] = sv(dst[i*ld], hl_agg_op(agg, mm));
-    }
-  }
-}
-template <class Agg, class Op, class Saver>
-void hl_matrix_column_op(Agg agg, Op op, Saver sv,
-                         int dimM, int dimN,
-                         real *dst,
-                         real *A, int lda) {
-  for (int j = 0; j < dimN; j++) {
-    real tmp = agg.init();
-    for (int i = 0; i < dimM; i++) {
-        tmp = agg(tmp, op(A[i * lda + j]));
-    }
-    dst[j] = sv(dst[j], tmp);
-  }
-}
-template <class Agg, class Op, class Saver>
-void hl_matrix_column_op(Agg agg, Op op, Saver sv,
-                         int dimM, int dimN,
-                         real *dst,
-                         real *A, int lda,
-                         real *B, int ldb) {
-  for (int j = 0; j < dimN; j++) {
-    real tmp = agg.init();
-    for (int i = 0; i < dimM; i++) {
-        tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
-    }
-    dst[j] = sv(dst[j], tmp);
-  }
-}
-/*
- * MaxRow greater than or equal dimN
- * dimN is multiples of VECTOR_LEN
- * so rem <= MaxRow / VECTOR_LEN
- */
-template <int MaxRow, class Agg, class Op, class Saver>
-void hl_sse_column_op_with_rem(Agg agg, Op op, Saver sv,
-                               int dimM, int dimN,
-                               real *dst,
-                               real *A, int lda) {
-  vecType mm[MaxRow / VECTOR_LEN];
-  for (int n = 0; n < MaxRow / VECTOR_LEN; n++) {
-    mm[n] = VECTOR_SET(agg.init());
-  }
-  for (int i = 0; i < dimM; i++) {
-    vecType *a = (vecType*)(A + i * lda);
-    for (int n = 0; n < dimN / VECTOR_LEN; n++) {
-      mm[n] = agg.vecOp(mm[n], op.vecOp(a[n]));
-    }
-  }
-  vecType *result = (vecType*)(dst);
-  for (int n = 0; n < dimN / VECTOR_LEN; n++) {
-    result[n] = sv.vecOp(result[n], mm[n]);
-  }
-  int rem = dimN % VECTOR_LEN;
-  if (rem) {
-    A += (dimN / VECTOR_LEN) * VECTOR_LEN;
-    dst += (dimN / VECTOR_LEN) * VECTOR_LEN;
-    hl_matrix_column_op(agg, op, sv, dimM, rem, dst, A, lda);
-  }
-}
-/*
- * dimN is multiples of VECTOR_LEN
- * dimN greater than Step
- */
-template <int Step, class Agg, class Op, class Saver>
-void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda) {
-  for (int j = 0; j < dimN / Step; j++, dst += Step, A += Step) {
-    vecType mm[Step / VECTOR_LEN];
-    for (int n = 0; n < Step / VECTOR_LEN; n++) {
-      mm[n] = VECTOR_SET(agg.init());
-    }
-    for (int i = 0; i < dimM; i++) {
-      vecType *a = (vecType*)(A + i * lda);
-      for (int n = 0; n < Step / VECTOR_LEN; n++) {
-        mm[n] = agg.vecOp(mm[n], op.vecOp(a[n]));
-      }
-    }
-    vecType *result = (vecType*)(dst);
-    for (int n = 0; n < Step / VECTOR_LEN; n++) {
-      result[n] = sv.vecOp(result[n], mm[n]);
-    }
-  }
-  int remRow = dimN % Step;
-  if (remRow) {
-    hl_sse_column_op_with_rem<Step>(agg, op, sv, dimM, remRow, dst, A, lda);
-  }
-}
-template <class Agg, class Op, class Saver>
-void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda) {
-  if (dimN <= 16) {
-    hl_sse_matrix_column_op<16>(agg, op, sv, dimM, dimN, dst, A, lda);
-  } else if (dimN <= 32) {
-    hl_sse_matrix_column_op<32>(agg, op, sv, dimM, dimN, dst, A, lda);
-  } else if (dimN <= 1024 || dimM <= 512) {
-    hl_sse_matrix_column_op<64>(agg, op, sv, dimM, dimN, dst, A, lda);
-  } else {
-    hl_sse_matrix_column_op<1024>(agg, op, sv, dimM, dimN, dst, A, lda);
-  }
-}
-template <int MaxRow, class Agg, class Op, class Saver>
-void hl_sse_column_op_with_rem(Agg agg, Op op, Saver sv,
-                               int dimM, int dimN,
-                               real *dst,
-                               real *A, int lda,
-                               real *B, int ldb) {
-  vecType mm[MaxRow / VECTOR_LEN];
-  for (int n = 0; n < MaxRow / VECTOR_LEN; n++) {
-    mm[n] = VECTOR_SET(agg.init());
-  }
-  for (int i = 0; i < dimM; i++) {
-    vecType *a = (vecType*)(A + i * lda);
-    vecType *b = (vecType*)(B + i * ldb);
-    for (int n = 0; n < dimN / VECTOR_LEN; n++) {
-      mm[n] = agg.vecOp(mm[n], op.vecOp(a[n], b[n]));
-    }
-  }
-  vecType *result = (vecType*)(dst);
-  for (int n = 0; n < dimN / VECTOR_LEN; n++) {
-    result[n] = sv.vecOp(result[n], mm[n]);
-  }
-  int rem = dimN % VECTOR_LEN;
-  if (rem) {
-    A += (dimN / VECTOR_LEN) * VECTOR_LEN;
-    B += (dimN / VECTOR_LEN) * VECTOR_LEN;
-    dst += (dimN / VECTOR_LEN) * VECTOR_LEN;
-    hl_matrix_column_op(agg, op, sv, dimM, rem, dst, A, lda, B, ldb);
-  }
-}
-template <int Step, class Agg, class Op, class Saver>
-void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda,
-                             real *B, int ldb) {
-  for (int j = 0; j < dimN / Step; j++, dst += Step, A += Step, B += Step) {
-    vecType mm[Step / VECTOR_LEN];
-    for (int n = 0; n < Step / VECTOR_LEN; n++) {
-      mm[n] = VECTOR_SET(agg.init());
-    }
-    for (int i = 0; i < dimM; i++) {
-      vecType *a = (vecType*)(A + i * lda);
-      vecType *b = (vecType*)(B + i * ldb);
-      for (int n = 0; n < Step / VECTOR_LEN; n++) {
-        mm[n] = agg.vecOp(mm[n], op.vecOp(a[n], b[n]));
-      }
-    }
-    vecType *result = (vecType*)(dst);
-    for (int n = 0; n < Step / VECTOR_LEN; n++) {
-      result[n] = sv.vecOp(result[n], mm[n]);
-    }
-  }
-  int remRow = dimN % Step;
-  if (remRow) {
-    hl_sse_column_op_with_rem<Step>(
-        agg, op, sv, dimM, remRow, dst, A, lda, B, ldb);
-  }
-}
-template <class Agg, class Op, class Saver>
-void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda,
-                             real *B, int ldb) {
-  if (dimN <= 16) {
-    hl_sse_matrix_column_op<16>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  } else if (dimN <= 32) {
-    hl_sse_matrix_column_op<32>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  } else if (dimN <= 1024 || dimM <= 512) {
-    hl_sse_matrix_column_op<64>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  } else {
-    hl_sse_matrix_column_op<1024>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  }
-}
-#endif /* HL_NEON_MATRIX_KERNEL_CUH_ */
--- a/paddle/cuda/include/hl_tensor_ops.h
+++ b/paddle/cuda/include/hl_tensor_ops.h
@@ -328,6 +328,208 @@ public:
  INLINE T operator()(const T a, const T b) const { return a < b ? b : a; }
 };
+#ifdef PADDLE_USE_SSE3
+#ifndef PADDLE_TYPE_DOUBLE
+template <>
+class add<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_add_ps(a, b);
+  }
+};
+template <>
+class add_scale<__m128> {
+private:
+  const __m128 p1;
+  const __m128 p2;
+public:
+  INLINE add_scale(const __m128 s1, const __m128 s2) : p1(s1), p2(s2) {}
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_add_ps(_mm_mul_ps(p1, a), _mm_mul_ps(p2, b));
+  }
+};
+template <>
+class sub<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_sub_ps(a, b);
+  }
+};
+template <>
+class mul<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_mul_ps(a, b);
+  }
+};
+template <>
+class div<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_div_ps(a, b);
+  }
+};
+template <>
+class min<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_min_ps(a, b);
+  }
+};
+template <>
+class max<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_max_ps(a, b);
+  }
+};
+#else
+template <>
+class add<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_add_pd(a, b);
+  }
+};
+template <>
+class add_scale<__m128d> {
+private:
+  const __m128d p1;
+  const __m128d p2;
+public:
+  INLINE add_scale(const __m128d s1, const __m128d s2) : p1(s1), p2(s2) {}
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_add_pd(_mm_mul_pd(p1, a), _mm_mul_pd(p2, b));
+  }
+};
+template <>
+class sub<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_sub_pd(a, b);
+  }
+};
+template <>
+class mul<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_mul_pd(a, b);
+  }
+};
+template <>
+class div<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_div_pd(a, b);
+  }
+};
+template <>
+class min<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_min_pd(a, b);
+  }
+};
+template <>
+class max<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_max_pd(a, b);
+  }
+};
+#endif  // PADDLE_TYPE_DOUBLE
+#endif  // PADDLE_USE_SSE3
+#ifdef PADDLE_USE_NEON
+#ifndef PADDLE_TYPE_DOUBLE
+template <>
+class add<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vmulq_f32(a, b);
+  }
+};
+template <>
+class add_scale<float32x4_t> {
+private:
+  const float32x4_t p1;
+  const float32x4_t p2;
+public:
+  INLINE add_scale(const float32x4_t s1, const float32x4_t s2)
+      : p1(s1), p2(s2) {}
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vaddq_f32(vmulq_f32(p1, a), vmulq_f32(p2, b));
+  }
+};
+template <>
+class sub<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vsubq_f32(a, b);
+  }
+};
+template <>
+class mul<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vmulq_f32(a, b);
+  }
+};
+template <>
+class div<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    float32x4_t tmp = vrecpeq_f32(b);
+    return vmulq_f32(a, tmp);
+  }
+};
+template <>
+class min<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vminq_f32(a, b);
+  }
+};
+template <>
+class max<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vmaxq_f32(a, b);
+  }
+};
+#else
+#error To be implemented
+#endif  // PADDLE_TYPE_DOUBLE
+#endif  // PADDLE_USE_NEON
 }  // namespace binary
 }  // namespace hppl