diff --git a/paddle/cuda/include/hl_cpu_scalar.cuh b/paddle/cuda/include/hl_cpu_scalar.cuh index cddf08ce6b615dbb62885daf29c8c378892d40bf..93043cd4bc030ef525d5bcf8d83196f2ce92eec6 100644 --- a/paddle/cuda/include/hl_cpu_scalar.cuh +++ b/paddle/cuda/include/hl_cpu_scalar.cuh @@ -40,30 +40,6 @@ INLINE real hl_vec_set(const real r) { return r; } -INLINE real hl_vec_max(const real a, const real b) { - return a > b ? a : b; -} - -INLINE real hl_vec_min(const real a, const real b) { - return a > b ? b : a; -} - -INLINE real hl_vec_add(const real a, const real b) { - return a + b; -} - -INLINE real hl_vec_sub(const real a, const real b) { - return a - b; -} - -INLINE real hl_vec_mul(const real a, const real b) { - return a * b; -} - -INLINE real hl_vec_div(const real a, const real b) { - return a / b; -} - INLINE real hl_vec_classification_error(const real a, const real b, const real p, diff --git a/paddle/cuda/include/hl_cpu_simd_neon.cuh b/paddle/cuda/include/hl_cpu_simd_neon.cuh index 9ff360c576fe1f948999144e6fcba8c111d29008..0b1cf4abdc4d5ef2a640c75587308f7f082b854b 100644 --- a/paddle/cuda/include/hl_cpu_simd_neon.cuh +++ b/paddle/cuda/include/hl_cpu_simd_neon.cuh @@ -44,31 +44,6 @@ inline float32x4_t hl_vec_set(const real f) { return vdupq_n_f32(f); } -inline float32x4_t hl_vec_max(const float32x4_t a, const float32x4_t b) { - return vmaxq_f32(a, b); -} - -inline float32x4_t hl_vec_min(const float32x4_t a, const float32x4_t b) { - return vminq_f32(a, b); -} - -inline float32x4_t hl_vec_add(const float32x4_t a, const float32x4_t b) { - return vaddq_f32(a, b); -} - -inline float32x4_t hl_vec_sub(const float32x4_t a, const float32x4_t b) { - return vsubq_f32(a, b); -} - -inline float32x4_t hl_vec_mul(const float32x4_t a, const float32x4_t b) { - return vmulq_f32(a, b); -} - -inline float32x4_t hl_vec_div(const float32x4_t a, const float32x4_t b) { - float32x4_t tmp = vrecpeq_f32(b); - return vmulq_f32(a, tmp); -} - inline float32x4_t hl_vec_classification_error(const 
float32x4_t a, const float32x4_t b, const float32x4_t p, diff --git a/paddle/cuda/include/hl_cpu_simd_sse.cuh b/paddle/cuda/include/hl_cpu_simd_sse.cuh index 9a918770b14d0c106b89bfbc82ef5f7feec17b8c..a104b626220f473324fc3c99e7cd305c3e86f3db 100644 --- a/paddle/cuda/include/hl_cpu_simd_sse.cuh +++ b/paddle/cuda/include/hl_cpu_simd_sse.cuh @@ -45,30 +45,6 @@ inline __m128 hl_vec_set(const real f) { return _mm_set_ps1(f); } -inline __m128 hl_vec_max(const __m128 a, const __m128 b) { - return _mm_max_ps(a, b); -} - -inline __m128 hl_vec_min(const __m128 a, const __m128 b) { - return _mm_min_ps(a, b); -} - -inline __m128 hl_vec_add(const __m128 a, const __m128 b) { - return _mm_add_ps(a, b); -} - -inline __m128 hl_vec_sub(const __m128 a, const __m128 b) { - return _mm_sub_ps(a, b); -} - -inline __m128 hl_vec_mul(const __m128 a, const __m128 b) { - return _mm_mul_ps(a, b); -} - -inline __m128 hl_vec_div(const __m128 a, const __m128 b) { - return _mm_div_ps(a, b); -} - inline __m128 hl_vec_classification_error(const __m128 a, const __m128 b, const __m128 p, @@ -103,30 +79,6 @@ inline __m128d hl_vec_set(const real d) { #endif } -inline __m128d hl_vec_max(const __m128d a, const __m128d b) { - return _mm_max_pd(a, b); -} - -inline __m128d hl_vec_min(const __m128d a, const __m128d b) { - return _mm_min_pd(a, b); -} - -inline __m128d hl_vec_add(const __m128d a, const __m128d b) { - return _mm_add_pd(a, b); -} - -inline __m128d hl_vec_sub(const __m128d a, const __m128d b) { - return _mm_sub_pd(a, b); -} - -inline __m128d hl_vec_mul(const __m128d a, const __m128d b) { - return _mm_mul_pd(a, b); -} - -inline __m128d hl_vec_div(const __m128d a, const __m128d b) { - return _mm_div_pd(a, b); -} - inline __m128d hl_vec_classification_error(const __m128d a, const __m128d b, const __m128d p, diff --git a/paddle/cuda/include/hl_matrix_base_detail.cuh b/paddle/cuda/include/hl_matrix_base_detail.cuh index 50079ed53de7a7b6026284afb73b5335096c145b..de1fd17d524a486cc15af721731d2e815f17263e 
100644 --- a/paddle/cuda/include/hl_matrix_base_detail.cuh +++ b/paddle/cuda/include/hl_matrix_base_detail.cuh @@ -16,13 +16,14 @@ limitations under the License. */ #define HL_MATRIX_BASE_DETAIL_CUH_ #include "hl_matrix_type.cuh" +#include "hl_tensor_ops.h" namespace aggregate { class SSESum { public: static const bool sse = VECTOR_SIMD; INLINE vecType vecOp(const vecType a, const vecType b) const { - return hl_vec_add(a, b); + return hppl::binary::add()(a, b); } }; @@ -30,7 +31,7 @@ class SSEMax { public: static const bool sse = VECTOR_SIMD; INLINE vecType vecOp(const vecType a, const vecType b) const { - return hl_vec_max(a, b); + return hppl::binary::max()(a, b); } }; @@ -38,7 +39,7 @@ class SSEMin { public: static const bool sse = VECTOR_SIMD; INLINE vecType vecOp(const vecType a, const vecType b) const { - return hl_vec_min(a, b); + return hppl::binary::min()(a, b); } }; } // namespace aggregate @@ -59,7 +60,7 @@ class SSEAdd { public: static const bool sse = VECTOR_SIMD; INLINE vecType vecOp(const vecType a, const vecType b) const { - return hl_vec_add(a, b); + return hppl::binary::add()(a, b); } }; @@ -77,7 +78,7 @@ public: mp2 = hl_vec_set(p2); } INLINE vecType vecOp(const vecType a, const vecType b) const { - return hl_vec_add(hl_vec_mul(mp1, a), hl_vec_mul(mp2, b)); + return hppl::binary::add_scale(mp1, mp2)(a, b); } }; @@ -85,7 +86,7 @@ class SSESub { public: static const bool sse = VECTOR_SIMD; INLINE vecType vecOp(const vecType a, const vecType b) const { - return hl_vec_sub(a, b); + return hppl::binary::sub()(a, b); } }; @@ -93,7 +94,7 @@ class SSEMul { public: static const bool sse = VECTOR_SIMD; INLINE vecType vecOp(const vecType a, const vecType b) const { - return hl_vec_mul(a, b); + return hppl::binary::mul()(a, b); } }; @@ -101,7 +102,7 @@ class SSEDiv { public: static const bool sse = VECTOR_SIMD; INLINE vecType vecOp(const vecType a, const vecType b) const { - return hl_vec_div(a, b); + return hppl::binary::div()(a, b); } }; @@ -109,7 +110,8 
@@ class SSESquaredDiff { public: static const bool sse = VECTOR_SIMD; INLINE vecType vecOp(const vecType a, const vecType b) const { - return hl_vec_mul(hl_vec_sub(a, b), hl_vec_sub(a, b)); + vecType tmp = hppl::binary::sub()(a, b); + return hppl::binary::mul()(tmp, tmp); } }; diff --git a/paddle/cuda/include/hl_matrix_type.cuh b/paddle/cuda/include/hl_matrix_type.cuh index 12c717b612da907ac7aedf3e7787bdbb96948be2..e18235219bd9f78dd87a92d448cb290d9a4904a1 100644 --- a/paddle/cuda/include/hl_matrix_type.cuh +++ b/paddle/cuda/include/hl_matrix_type.cuh @@ -38,10 +38,12 @@ typedef double2 vecType; #endif #elif defined(__SSE3__) #include "hl_cpu_simd_sse.cuh" +#define PADDLE_USE_SSE3 #elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && !defined(__NVCC__) // Currently nvcc does not support neon intrinsic. // TODO: Extract simd intrinsic implementation from .cu files. #include "hl_cpu_simd_neon.cuh" +#define PADDLE_USE_NEON #else #include "hl_cpu_scalar.cuh" #endif diff --git a/paddle/cuda/include/hl_tensor_ops.h b/paddle/cuda/include/hl_tensor_ops.h index 7945b98201b1812790fb0d53123e9ee007640485..523503f5fec52ad01cff1bf9bbc04f271b0e4867 100644 --- a/paddle/cuda/include/hl_tensor_ops.h +++ b/paddle/cuda/include/hl_tensor_ops.h @@ -328,6 +328,208 @@ public: INLINE T operator()(const T a, const T b) const { return a < b ? 
b : a;
  }
};

+// SIMD specializations of the hppl::binary functors. These replace the
+// former free functions hl_vec_add/sub/mul/div/min/max that used to live in
+// hl_cpu_simd_sse.cuh / hl_cpu_simd_neon.cuh, so that template code can
+// dispatch on the vector type directly.
+#ifdef PADDLE_USE_SSE3
+#ifndef PADDLE_TYPE_DOUBLE
+template <>
+class add<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_add_ps(a, b);
+  }
+};
+
+template <>
+class add_scale<__m128> {
+private:
+  const __m128 p1;
+  const __m128 p2;
+
+public:
+  // p1, p2 are broadcast scale factors; operator() computes p1*a + p2*b.
+  INLINE add_scale(const __m128 s1, const __m128 s2) : p1(s1), p2(s2) {}
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_add_ps(_mm_mul_ps(p1, a), _mm_mul_ps(p2, b));
+  }
+};
+
+template <>
+class sub<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_sub_ps(a, b);
+  }
+};
+
+template <>
+class mul<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_mul_ps(a, b);
+  }
+};
+
+template <>
+class div<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_div_ps(a, b);
+  }
+};
+
+template <>
+class min<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_min_ps(a, b);
+  }
+};
+
+template <>
+class max<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_max_ps(a, b);
+  }
+};
+#else
+template <>
+class add<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_add_pd(a, b);
+  }
+};
+
+template <>
+class add_scale<__m128d> {
+private:
+  const __m128d p1;
+  const __m128d p2;
+
+public:
+  // p1, p2 are broadcast scale factors; operator() computes p1*a + p2*b.
+  INLINE add_scale(const __m128d s1, const __m128d s2) : p1(s1), p2(s2) {}
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_add_pd(_mm_mul_pd(p1, a), _mm_mul_pd(p2, b));
+  }
+};
+
+template <>
+class sub<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_sub_pd(a, b);
+  }
+};
+
+template <>
+class mul<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_mul_pd(a, b);
+  }
+};
+
+template <>
+class div<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_div_pd(a, b);
+  }
+};
+
+template <>
+class min<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_min_pd(a, b);
+  }
+};
+
+template <>
+class max<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_max_pd(a, b);
+  }
+};
+#endif  // PADDLE_TYPE_DOUBLE
+#endif  // PADDLE_USE_SSE3
+
+#ifdef PADDLE_USE_NEON
+#ifndef PADDLE_TYPE_DOUBLE
+// NOTE: explicit specializations must name the specialized type, i.e.
+// `class add<float32x4_t>`, not `class add`.
+template <>
+class add<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    // Bug fix: addition must use vaddq_f32 (previous draft used vmulq_f32,
+    // which multiplies).
+    return vaddq_f32(a, b);
+  }
+};
+
+template <>
+class add_scale<float32x4_t> {
+private:
+  const float32x4_t p1;
+  const float32x4_t p2;
+
+public:
+  // p1, p2 are broadcast scale factors; operator() computes p1*a + p2*b.
+  INLINE add_scale(const float32x4_t s1, const float32x4_t s2)
+      : p1(s1), p2(s2) {}
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vaddq_f32(vmulq_f32(p1, a), vmulq_f32(p2, b));
+  }
+};
+
+template <>
+class sub<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vsubq_f32(a, b);
+  }
+};
+
+template <>
+class mul<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vmulq_f32(a, b);
+  }
+};
+
+template <>
+class div<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    // NOTE(review): vrecpeq_f32 is only a reciprocal *estimate* (~8 bits of
+    // precision). Kept as-is to match the hl_vec_div it replaces; refine with
+    // a vrecpsq_f32 Newton-Raphson step if full precision is required.
+    float32x4_t tmp = vrecpeq_f32(b);
+    return vmulq_f32(a, tmp);
+  }
+};
+
+template <>
+class min<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vminq_f32(a, b);
+  }
+};
+
+template <>
+class max<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vmaxq_f32(a, b);
+  }
+};
+#else
+#error To be implemented
+#endif  // PADDLE_TYPE_DOUBLE
+#endif  // PADDLE_USE_NEON
+
 }  // namespace binary
 }  // namespace hppl