Support scalar computing.

32a85081 · Jiangtao Hu · 21be601b · 32a85081 · 32a85081 · 32a85081
8 changed file
--- a/paddle/cuda/include/hl_cpu_matrix_kernel.cuh
+++ b/paddle/cuda/include/hl_cpu_matrix_kernel.cuh
@@ -17,10 +17,9 @@ limitations under the License. */

 #include <stdio.h>
 #include "hl_base.h"
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-#include "hl_neon_matrix_kernel.cuh"
-#else
-#include "hl_sse_matrix_kernel.cuh"
+
+#ifndef __CUDA_ARCH__
+#include "hl_cpu_matrix_kernel_detail.cuh"
 #endif

 /**
@@ -114,35 +113,6 @@ void hl_cpu_apply_quaternary_op(Op op,
  }
 }

-template <class Agg, class Op, class Saver>
-void hl_matrix_row_op(Agg agg, Op op, Saver sv,
-                      int dimM, int dimN,
-                      real *dst, int ld,
-                      real *A, int lda) {
-  for (int i = 0; i < dimM; i++) {
-    real tmp = agg.init();
-    for (int j = 0; j < dimN; j++) {
-        tmp = agg(tmp, op(A[i * lda + j]));
-    }
-    dst[i*ld] = sv(dst[i*ld], tmp);
-  }
-}
-
-template <class Agg, class Op, class Saver>
-void hl_matrix_row_op(Agg agg, Op op, Saver sv,
-                      int dimM, int dimN,
-                      real *dst, int ld,
-                      real *A, int lda,
-                      real *B, int ldb) {
-  for (int i = 0; i < dimM; i++) {
-    real tmp = agg.init();
-    for (int j = 0; j < dimN; j++) {
-        tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
-    }
-    dst[i*ld] = sv(dst[i*ld], tmp);
-  }
-}
-
 template <class Agg, class Op, class Saver>
 void hl_cpu_matrix_row_op(Agg agg, Op op, Saver sv,
                          int dimM, int dimN,

--- a/paddle/cuda/include/hl_sse_matrix_kernel.cuh
+++ b/paddle/cuda/include/hl_sse_matrix_kernel.cuh
@@ -13,26 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */


-#ifndef HL_SSE_MATRIX_KERNEL_CUH_
-#define HL_SSE_MATRIX_KERNEL_CUH_
+#ifndef HL_MATRIX_KERNEL_DETAIL_CUH_
+#define HL_MATRIX_KERNEL_DETAIL_CUH_

 #include "hl_matrix_type.cuh"

-#define VECTOR_SIZE     16
-
-#ifndef PADDLE_TYPE_DOUBLE
-/* number of float in vector */
-#define     VECTOR_LEN      4
-#define     VECTOR_SET      _mm_set_ps1
-#else
-#if   defined(__APPLE__) || defined(__OSX__)
-#define     _mm_set_pd1     _mm_set1_pd
-#endif
-/* number of double in vector */
-#define     VECTOR_LEN      2
-#define     VECTOR_SET      _mm_set_pd1
-#endif
-
 inline bool hl_check_align(size_t size) {
  return !(size & (VECTOR_SIZE - 1));
 }
@@ -41,27 +26,63 @@ inline bool hl_check_align(void *ptr) {
  return hl_check_align(reinterpret_cast<size_t>(ptr));
 }

-#ifndef PADDLE_TYPE_DOUBLE
-template <class Agg>
-inline real hl_agg_op(Agg agg, vecType mm) {
-  __m128 lo = _mm_unpacklo_ps(mm, mm);
-  __m128 hi = _mm_unpackhi_ps(mm, mm);
-  __m128 tmp1 = agg.vecOp(lo, hi);
-  __m128 tmp2 = _mm_movehl_ps(tmp1, tmp1);
-  __m128 ret = agg.vecOp(tmp1, tmp2);
+template <class Agg, class Op, class Saver>
+void hl_matrix_row_op(Agg agg, Op op, Saver sv,
+                      int dimM, int dimN,
+                      real *dst, int ld,
+                      real *A, int lda) {
+  for (int i = 0; i < dimM; i++) {
+    real tmp = agg.init();
+    for (int j = 0; j < dimN; j++) {
+        tmp = agg(tmp, op(A[i * lda + j]));
+    }
+    dst[i*ld] = sv(dst[i*ld], tmp);
+  }
+}

-  return _mm_cvtss_f32(ret);
+template <class Agg, class Op, class Saver>
+void hl_matrix_row_op(Agg agg, Op op, Saver sv,
+                      int dimM, int dimN,
+                      real *dst, int ld,
+                      real *A, int lda,
+                      real *B, int ldb) {
+  for (int i = 0; i < dimM; i++) {
+    real tmp = agg.init();
+    for (int j = 0; j < dimN; j++) {
+        tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
+    }
+    dst[i*ld] = sv(dst[i*ld], tmp);
+  }
 }
-#else
-template <class Agg>
-inline real hl_agg_op(Agg agg, vecType mm) {
-  __m128d lo = _mm_unpacklo_pd(mm, mm);
-  __m128d hi = _mm_unpackhi_pd(mm, mm);
-  __m128d ret = agg.vecOp(lo, hi);
-
-  return _mm_cvtsd_f64(ret);
+
+template <class Agg, class Op, class Saver>
+void hl_matrix_column_op(Agg agg, Op op, Saver sv,
+                         int dimM, int dimN,
+                         real *dst,
+                         real *A, int lda) {
+  for (int j = 0; j < dimN; j++) {
+    real tmp = agg.init();
+    for (int i = 0; i < dimM; i++) {
+        tmp = agg(tmp, op(A[i * lda + j]));
+    }
+    dst[j] = sv(dst[j], tmp);
+  }
+}
+
+template <class Agg, class Op, class Saver>
+void hl_matrix_column_op(Agg agg, Op op, Saver sv,
+                         int dimM, int dimN,
+                         real *dst,
+                         real *A, int lda,
+                         real *B, int ldb) {
+  for (int j = 0; j < dimN; j++) {
+    real tmp = agg.init();
+    for (int i = 0; i < dimM; i++) {
+        tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
+    }
+    dst[j] = sv(dst[j], tmp);
+  }
 }
-#endif

 template <class Agg, class Op, class Saver>
 void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
@@ -118,35 +139,6 @@ void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
  }
 }

-template <class Agg, class Op, class Saver>
-void hl_matrix_column_op(Agg agg, Op op, Saver sv,
-                         int dimM, int dimN,
-                         real *dst,
-                         real *A, int lda) {
-  for (int j = 0; j < dimN; j++) {
-    real tmp = agg.init();
-    for (int i = 0; i < dimM; i++) {
-        tmp = agg(tmp, op(A[i * lda + j]));
-    }
-    dst[j] = sv(dst[j], tmp);
-  }
-}
-
-template <class Agg, class Op, class Saver>
-void hl_matrix_column_op(Agg agg, Op op, Saver sv,
-                         int dimM, int dimN,
-                         real *dst,
-                         real *A, int lda,
-                         real *B, int ldb) {
-  for (int j = 0; j < dimN; j++) {
-    real tmp = agg.init();
-    for (int i = 0; i < dimM; i++) {
-        tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
-    }
-    dst[j] = sv(dst[j], tmp);
-  }
-}
-
 /*
 * MaxRow greater than or equal dimN
 * dimN is multiples of VECTOR_LEN
@@ -315,4 +307,4 @@ void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
  }
 }

-#endif /* HL_SSE_MATRIX_KERNEL_CUH_ */
+#endif /* HL_MATRIX_KERNEL_DETAIL_CUH_ */
--- a/paddle/cuda/include/hl_cpu_scalar.cuh
+++ b/paddle/cuda/include/hl_cpu_scalar.cuh
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef HL_CPU_SCALAR_CUH_
+#define HL_CPU_SCALAR_CUH_
+
+#ifndef PADDLE_TYPE_DOUBLE
+/* size of float */
+#define VECTOR_SIZE     4
+#else
+/* size of double */
+#define VECTOR_SIZE     8
+#endif
+
+typedef real vecType;
+
+inline void set_zero(vecType &mm) { mm = (vecType) 0.0f; }
+
+/* Consider a real as a vector */
+#define VECTOR_LEN      1
+#define VECTOR_SET      set_zero
+
+template <class Agg>
+inline real hl_agg_op(Agg agg, vecType mm) {
+  return mm;
+}
+
+#endif  // HL_CPU_SCALAR_CUH_
--- a/paddle/cuda/include/hl_cpu_simd_neon.cuh
+++ b/paddle/cuda/include/hl_cpu_simd_neon.cuh
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef HL_CPU_SIMD_NEON_CUH_
+#define HL_CPU_SIMD_NEON_CUH_
+
+#include <arm_neon.h>
+
+#define VECTOR_SIZE     16
+
+#ifndef PADDLE_TYPE_DOUBLE
+
+typedef float32x4_t vecType;
+
+/* number of float in vector */
+#define VECTOR_LEN      4
+#define VECTOR_SET      vdupq_n_f32
+
+template <class Agg>
+inline real hl_agg_op(Agg agg, vecType mm) {
+  float32x4_t rev = vrev64q_f32(mm);
+  float32x4_t tmp1 = agg.vecOp(rev, rev);
+  float32x2_t lo = vget_high_f32(rev);
+  float32x2_t hi = vget_low_f32(rev);
+  float32x4_t tmp2 = vcombine_f32(hi, lo);
+  float32x4_t ret = agg.vecOp(tmp1, tmp2);
+
+  return vgetq_lane_f32(ret, 0);
+}
+
+#else
+
+#ifdef __aarch64__
+typedef float64x2_t vecType;
+
+/* number of float in vector */
+#define VECTOR_LEN      2
+#define VECTOR_SET      vdupq_n_f64
+
+#error To be implemented
+#else
+#error NEON instructions does not support double precision
+#endif
+
+#endif
+
+#endif  // HL_CPU_SIMD_NEON_CUH_
--- a/paddle/cuda/include/hl_cpu_simd_sse.cuh
+++ b/paddle/cuda/include/hl_cpu_simd_sse.cuh
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef HL_SIMD_SSE_CUH_
+#define HL_SIMD_SSE_CUH_
+
+#include <mmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#define VECTOR_SIZE     16
+
+#ifndef PADDLE_TYPE_DOUBLE
+
+typedef __m128  vecType;
+
+/* number of float in vector */
+#define VECTOR_LEN      4
+#define VECTOR_SET      _mm_set_ps1
+
+template <class Agg>
+inline real hl_agg_op(Agg agg, vecType mm) {
+  __m128 lo = _mm_unpacklo_ps(mm, mm);
+  __m128 hi = _mm_unpackhi_ps(mm, mm);
+  __m128 tmp1 = agg.vecOp(lo, hi);
+  __m128 tmp2 = _mm_movehl_ps(tmp1, tmp1);
+  __m128 ret = agg.vecOp(tmp1, tmp2);
+
+  return _mm_cvtss_f32(ret);
+}
+
+#else
+
+typedef __m128d vecType;
+
+/* number of double in vector */
+#define VECTOR_LEN      2
+#if defined(__APPLE__) || defined(__OSX__)
+#define _mm_set_pd1     _mm_set1_pd
+#endif
+#define VECTOR_SET      _mm_set_pd1
+
+template <class Agg>
+inline real hl_agg_op(Agg agg, vecType mm) {
+  __m128d lo = _mm_unpacklo_pd(mm, mm);
+  __m128d hi = _mm_unpackhi_pd(mm, mm);
+  __m128d ret = agg.vecOp(lo, hi);
+
+  return _mm_cvtsd_f64(ret);
+}
+
+#endif
+
+#endif  // HL_SIMD_SSE_CUH_
--- a/paddle/cuda/include/hl_matrix_base.cuh
+++ b/paddle/cuda/include/hl_matrix_base.cuh
@@ -52,7 +52,11 @@ public:
  }
 };

-#ifdef __CUDA_ARCH__
+#if defined(__SSE3__)
+#include "hl_matrix_base_sse.cuh"
+#elif (defined(__ARM__NEON__) || defined(__ARM_NEON))
+#include "hl_matrix_base_neon.cuh"
+#else
 typedef BaseOp SSESum;
 typedef BaseOp SSEMax;
 typedef BaseOp SSEMin;
@@ -66,10 +70,6 @@ typedef BaseOp SSESquaredDiff;
 typedef BaseOp SSEFirst;
 typedef BaseOp SSESecond;
 typedef BaseOp SSEClassificationError;
-#elif defined(__ARM__NEON__) || defined(__ARM_NEON)
-#include "hl_matrix_base_neon.cuh"
-#else
-#include "hl_matrix_base_sse.cuh"
 #endif

 namespace aggregate {

--- a/paddle/cuda/include/hl_matrix_type.cuh
+++ b/paddle/cuda/include/hl_matrix_type.cuh
@@ -17,29 +17,19 @@ limitations under the License. */

 #include "hl_base.h"

-#if defined(__CUDA_ARCH__)
+#ifdef __CUDA_ARCH__
 #include <vector_types.h>
 #ifndef PADDLE_TYPE_DOUBLE
 typedef float4 vecType;
 #else
 typedef double2 vecType;
 #endif
-#elif (defined  __ARM_NEON) || (defined __ARM_NEON__)
-#include <arm_neon.h>
-#ifndef PADDLE_TYPE_DOUBLE
-typedef float32x4_t  vecType;
-#else
-#error NEON instructions does not support double precision
-#endif
+#elif defined(__SSE3__)
+#include "hl_cpu_simd_sse.cuh"
+#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
+#include "hl_cpu_simd_neon.cuh"
 #else
-#include <mmintrin.h>
-#include <xmmintrin.h>
-#include <emmintrin.h>
-#ifndef PADDLE_TYPE_DOUBLE
-typedef __m128  vecType;
-#else
-typedef __m128d vecType;
-#endif
+#include "hl_cpu_scalar.cuh"
 #endif

 #ifdef __CUDA_ARCH__

--- a/paddle/cuda/include/hl_neon_matrix_kernel.cuh
+++ b/paddle/cuda/include/hl_neon_matrix_kernel.cuh
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_NEON_MATRIX_KERNEL_CUH_
-#define HL_NEON_MATRIX_KERNEL_CUH_
-
-#include "hl_matrix_type.cuh"
-
-#define VECTOR_SIZE     16
-
-/* number of float in vector */
-#define     VECTOR_LEN      4
-#define     VECTOR_SET      vdupq_n_f32
-
-inline bool hl_check_align(size_t size) {
-  return !(size & (VECTOR_SIZE - 1));
-}
-
-inline bool hl_check_align(void *ptr) {
-  return hl_check_align(reinterpret_cast<size_t>(ptr));
-}
-
-template <class Agg>
-inline real hl_agg_op(Agg agg, vecType mm) {
-  float32x4_t rev = vrev64q_f32(mm);
-  float32x4_t tmp1 = agg.vecOp(rev, rev);
-  float32x2_t lo = vget_high_f32(rev);
-  float32x2_t hi = vget_low_f32(rev);
-  float32x4_t tmp2 = vcombine_f32(hi, lo);
-  float32x4_t ret = agg.vecOp(tmp1, tmp2);
-
-  return vgetq_lane_f32(ret, 0);
-}
-
-template <class Agg, class Op, class Saver>
-void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
-                          int dimM, int dimN,
-                          real *dst, int ld,
-                          real *A, int lda) {
-  for (int i = 0; i < dimM; i++, A += lda) {
-    vecType mm = VECTOR_SET(agg.init());
-    vecType *a = (vecType*)(A);
-    for (int j = 0; j < dimN / VECTOR_LEN; j++, a++) {
-      mm = agg.vecOp(mm, op.vecOp(*a));
-    }
-
-    int rem = dimN % VECTOR_LEN;
-    if (rem) {
-      real tmp = hl_agg_op(agg, mm);
-      real *a = A + (dimN / VECTOR_LEN) * VECTOR_LEN;
-      for (int j = 0; j < rem; j++) {
-        tmp = agg(tmp, op(a[j]));
-      }
-      dst[i*ld] = sv(dst[i*ld], tmp);
-    } else {
-      dst[i*ld] = sv(dst[i*ld], hl_agg_op(agg, mm));
-    }
-  }
-}
-
-template <class Agg, class Op, class Saver>
-void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
-                          int dimM, int dimN,
-                          real *dst, int ld,
-                          real *A, int lda,
-                          real *B, int ldb) {
-  for (int i = 0; i < dimM; i++, A += lda, B += ldb) {
-    vecType mm = VECTOR_SET(agg.init());
-    vecType *a = (vecType*)(A);
-    vecType *b = (vecType*)(B);
-    for (int j = 0; j < dimN / VECTOR_LEN; j++, a++, b++) {
-        mm = agg.vecOp(mm, op.vecOp(*a, *b));
-    }
-
-    int rem = dimN % VECTOR_LEN;
-    if (rem) {
-      real tmp = hl_agg_op(agg, mm);
-      real *a = A + (dimN / VECTOR_LEN) * VECTOR_LEN;
-      real *b = B + (dimN / VECTOR_LEN) * VECTOR_LEN;
-      for (int j = 0; j < rem; j++) {
-          tmp = agg(tmp, op(a[j], b[j]));
-      }
-      dst[i*ld] = sv(dst[i*ld], tmp);
-    } else {
-        dst[i*ld] = sv(dst[i*ld], hl_agg_op(agg, mm));
-    }
-  }
-}
-
-template <class Agg, class Op, class Saver>
-void hl_matrix_column_op(Agg agg, Op op, Saver sv,
-                         int dimM, int dimN,
-                         real *dst,
-                         real *A, int lda) {
-  for (int j = 0; j < dimN; j++) {
-    real tmp = agg.init();
-    for (int i = 0; i < dimM; i++) {
-        tmp = agg(tmp, op(A[i * lda + j]));
-    }
-    dst[j] = sv(dst[j], tmp);
-  }
-}
-
-template <class Agg, class Op, class Saver>
-void hl_matrix_column_op(Agg agg, Op op, Saver sv,
-                         int dimM, int dimN,
-                         real *dst,
-                         real *A, int lda,
-                         real *B, int ldb) {
-  for (int j = 0; j < dimN; j++) {
-    real tmp = agg.init();
-    for (int i = 0; i < dimM; i++) {
-        tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
-    }
-    dst[j] = sv(dst[j], tmp);
-  }
-}
-
-/*
- * MaxRow greater than or equal dimN
- * dimN is multiples of VECTOR_LEN
- * so rem <= MaxRow / VECTOR_LEN
- */
-template <int MaxRow, class Agg, class Op, class Saver>
-void hl_sse_column_op_with_rem(Agg agg, Op op, Saver sv,
-                               int dimM, int dimN,
-                               real *dst,
-                               real *A, int lda) {
-  vecType mm[MaxRow / VECTOR_LEN];
-  for (int n = 0; n < MaxRow / VECTOR_LEN; n++) {
-    mm[n] = VECTOR_SET(agg.init());
-  }
-
-  for (int i = 0; i < dimM; i++) {
-    vecType *a = (vecType*)(A + i * lda);
-    for (int n = 0; n < dimN / VECTOR_LEN; n++) {
-      mm[n] = agg.vecOp(mm[n], op.vecOp(a[n]));
-    }
-  }
-
-  vecType *result = (vecType*)(dst);
-  for (int n = 0; n < dimN / VECTOR_LEN; n++) {
-    result[n] = sv.vecOp(result[n], mm[n]);
-  }
-
-  int rem = dimN % VECTOR_LEN;
-  if (rem) {
-    A += (dimN / VECTOR_LEN) * VECTOR_LEN;
-    dst += (dimN / VECTOR_LEN) * VECTOR_LEN;
-    hl_matrix_column_op(agg, op, sv, dimM, rem, dst, A, lda);
-  }
-}
-
-/*
- * dimN is multiples of VECTOR_LEN
- * dimN greater than Step
- */
-template <int Step, class Agg, class Op, class Saver>
-void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda) {
-  for (int j = 0; j < dimN / Step; j++, dst += Step, A += Step) {
-    vecType mm[Step / VECTOR_LEN];
-    for (int n = 0; n < Step / VECTOR_LEN; n++) {
-      mm[n] = VECTOR_SET(agg.init());
-    }
-
-    for (int i = 0; i < dimM; i++) {
-      vecType *a = (vecType*)(A + i * lda);
-      for (int n = 0; n < Step / VECTOR_LEN; n++) {
-        mm[n] = agg.vecOp(mm[n], op.vecOp(a[n]));
-      }
-    }
-
-    vecType *result = (vecType*)(dst);
-    for (int n = 0; n < Step / VECTOR_LEN; n++) {
-      result[n] = sv.vecOp(result[n], mm[n]);
-    }
-  }
-
-  int remRow = dimN % Step;
-  if (remRow) {
-    hl_sse_column_op_with_rem<Step>(agg, op, sv, dimM, remRow, dst, A, lda);
-  }
-}
-
-template <class Agg, class Op, class Saver>
-void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda) {
-  if (dimN <= 16) {
-    hl_sse_matrix_column_op<16>(agg, op, sv, dimM, dimN, dst, A, lda);
-  } else if (dimN <= 32) {
-    hl_sse_matrix_column_op<32>(agg, op, sv, dimM, dimN, dst, A, lda);
-  } else if (dimN <= 1024 || dimM <= 512) {
-    hl_sse_matrix_column_op<64>(agg, op, sv, dimM, dimN, dst, A, lda);
-  } else {
-    hl_sse_matrix_column_op<1024>(agg, op, sv, dimM, dimN, dst, A, lda);
-  }
-}
-
-template <int MaxRow, class Agg, class Op, class Saver>
-void hl_sse_column_op_with_rem(Agg agg, Op op, Saver sv,
-                               int dimM, int dimN,
-                               real *dst,
-                               real *A, int lda,
-                               real *B, int ldb) {
-  vecType mm[MaxRow / VECTOR_LEN];
-  for (int n = 0; n < MaxRow / VECTOR_LEN; n++) {
-    mm[n] = VECTOR_SET(agg.init());
-  }
-
-  for (int i = 0; i < dimM; i++) {
-    vecType *a = (vecType*)(A + i * lda);
-    vecType *b = (vecType*)(B + i * ldb);
-    for (int n = 0; n < dimN / VECTOR_LEN; n++) {
-      mm[n] = agg.vecOp(mm[n], op.vecOp(a[n], b[n]));
-    }
-  }
-
-  vecType *result = (vecType*)(dst);
-  for (int n = 0; n < dimN / VECTOR_LEN; n++) {
-    result[n] = sv.vecOp(result[n], mm[n]);
-  }
-
-  int rem = dimN % VECTOR_LEN;
-  if (rem) {
-    A += (dimN / VECTOR_LEN) * VECTOR_LEN;
-    B += (dimN / VECTOR_LEN) * VECTOR_LEN;
-    dst += (dimN / VECTOR_LEN) * VECTOR_LEN;
-    hl_matrix_column_op(agg, op, sv, dimM, rem, dst, A, lda, B, ldb);
-  }
-}
-
-template <int Step, class Agg, class Op, class Saver>
-void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda,
-                             real *B, int ldb) {
-  for (int j = 0; j < dimN / Step; j++, dst += Step, A += Step, B += Step) {
-    vecType mm[Step / VECTOR_LEN];
-    for (int n = 0; n < Step / VECTOR_LEN; n++) {
-      mm[n] = VECTOR_SET(agg.init());
-    }
-
-    for (int i = 0; i < dimM; i++) {
-      vecType *a = (vecType*)(A + i * lda);
-      vecType *b = (vecType*)(B + i * ldb);
-      for (int n = 0; n < Step / VECTOR_LEN; n++) {
-        mm[n] = agg.vecOp(mm[n], op.vecOp(a[n], b[n]));
-      }
-    }
-
-    vecType *result = (vecType*)(dst);
-    for (int n = 0; n < Step / VECTOR_LEN; n++) {
-      result[n] = sv.vecOp(result[n], mm[n]);
-    }
-  }
-
-  int remRow = dimN % Step;
-  if (remRow) {
-    hl_sse_column_op_with_rem<Step>(
-        agg, op, sv, dimM, remRow, dst, A, lda, B, ldb);
-  }
-}
-
-template <class Agg, class Op, class Saver>
-void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda,
-                             real *B, int ldb) {
-  if (dimN <= 16) {
-    hl_sse_matrix_column_op<16>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  } else if (dimN <= 32) {
-    hl_sse_matrix_column_op<32>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  } else if (dimN <= 1024 || dimM <= 512) {
-    hl_sse_matrix_column_op<64>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  } else {
-    hl_sse_matrix_column_op<1024>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  }
-}
-
-#endif /* HL_NEON_MATRIX_KERNEL_CUH_ */