Commit 260f93fe authored by Z zhaojiaying01

optimize gemm

Parent 6db73b7a
@@ -28,7 +28,6 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
Tensor filter = *param.Filter();
Tensor *output = param.Output();
output->mutable_data<float>();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
@@ -40,7 +39,6 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
@@ -61,18 +59,13 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
// DLOG << " col_shape = " << col_shape;
// DLOG << " col_matrix_shape = " << col_matrix_shape;
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
// DLOG << " input_shape = " << input_shape;
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
// DLOG << " filter.dims() = " << filter.dims();
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
@@ -87,8 +80,6 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
// DLOG << " in_batch.dims() = " << in_batch.dims();
// DLOG << " out_batch.dims() = " << out_batch.dims();
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
@@ -111,13 +102,9 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
// DLOG << " out_slice " << out_slice.dims();
// DLOG << " filter_slice " << filter_slice.dims();
// DLOG << " col_matrix " << col_matrix.dims();
math::matmul<float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(0));
- auto filter_ptr = filter_slice.data<float>();
}
}
}
......
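The kernel above lowers each group's convolution to an im2col transform followed by a GEMM (math::matmul) on the filter, column, and output slices. A minimal self-contained sketch of that lowering, with illustrative names and a stride-1, no-padding configuration that are assumptions rather than paddle-mobile's exact API:

#include <vector>

// Sketch of im2col for one input slice (assumed layout C x H x W, kernel
// KH x KW, stride 1, no padding): expands (C, H, W) into a matrix of shape
// (C * KH * KW) x (OH * OW), mirroring the col_matrix built above.
static void Im2Col(const float* in, int C, int H, int W, int KH, int KW,
                   std::vector<float>* col) {
  const int OH = H - KH + 1, OW = W - KW + 1;
  col->resize(static_cast<size_t>(C) * KH * KW * OH * OW);
  for (int c = 0; c < C; ++c)
    for (int kh = 0; kh < KH; ++kh)
      for (int kw = 0; kw < KW; ++kw) {
        const int row = (c * KH + kh) * KW + kw;  // row in the col matrix
        for (int oh = 0; oh < OH; ++oh)
          for (int ow = 0; ow < OW; ++ow)
            (*col)[static_cast<size_t>(row) * OH * OW + oh * OW + ow] =
                in[(c * H + oh + kh) * W + ow + kw];
      }
}

// Naive stand-in for math::matmul: out(m x n) = filter(m x k) * col(k x n),
// i.e. filter_slice * col_matrix -> out_slice in the per-group loop above.
static void MatMul(int m, int n, int k, const float* A, const float* B,
                   float* C) {
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j) {
      float acc = 0.f;
      for (int p = 0; p < k; ++p) acc += A[i * k + p] * B[p * n + j];
      C[i * n + j] = acc;
    }
}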
@@ -114,10 +114,12 @@ void PackMatrixB_(int k, int n, int paddingN, const float *B, int ldb,
for (j = 0; j < n - paddingN; j += NR) {
for (i = 0; i < k; ++i) {
Bij = &B(i, j);
- *buffer++ = *Bij;
- *buffer++ = *(Bij + 1);
- *buffer++ = *(Bij + 2);
- *buffer++ = *(Bij + 3);
+ asm volatile(
+     "vld1.32 {q0}, [%[Bij]] \n\t"
+     "vst1.32 {q0}, [%[buffer]]! \n\t"
+     : [buffer] "+r"(buffer)
+     : [Bij] "r"(Bij)
+     : "memory", "q0");
}
}
if (paddingN != 0) {
......
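The four scalar copies removed above move one NR = 4 panel of B element by element; the new vld1.32/vst1.32 pair moves the same 16 bytes through a single q register. An equivalent formulation with NEON intrinsics (a sketch assuming NR == 4; the commit itself uses inline assembly) would be:

#include <arm_neon.h>

// Copies B(i, j..j+3) into the packed buffer in one shot, matching the
// vld1.32/vst1.32 pair above. Returns the advanced buffer pointer, like
// the post-increment addressing in "[%[buffer]]!".
static inline float* PackPanel4(const float* Bij, float* buffer) {
  float32x4_t v = vld1q_f32(Bij);  // load 4 consecutive floats of B
  vst1q_f32(buffer, v);            // store them contiguously into the pack
  return buffer + 4;
}

Packing B into a contiguous buffer is what makes the unit-stride vector load/store profitable: the inner GEMM kernel then streams through it sequentially instead of striding across ldb.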
@@ -20,9 +20,9 @@ limitations under the License. */
#define C(i, j) C[(i)*ldc + (j)]
// Block sizes for the tiled computation; MC and KC correspond to the m and k of each block
- #define MC 384
- #define KC 384
- #define NC 4096
+ #define MC 128
+ #define KC 128
+ #define NC 1024
#define MR 4
#define NR 4
......
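MC, KC, and NC bound the sub-matrices of A, B, and C that are packed and kept resident in cache during the inner loops, while MR x NR is the register tile. Lowering the block sizes from 384/384/4096 to 128/128/1024 shrinks the packed working set to better fit the small caches of mobile cores: the A block drops from roughly 576 KB (384*384 floats) to 64 KB, and the B panel from 6 MB (384*4096 floats) to 512 KB. A generic sketch of where these block sizes sit in a blocked GEMM loop nest (illustrative only; PackA/PackB and the micro-kernel are placeholders, not this repo's exact routines):

#include <algorithm>

// Assumed to mirror the macros above: MC/KC/NC tile m/k/n respectively.
#ifndef MC
#define MC 128
#define KC 128
#define NC 1024
#endif

// C(m x n) += A(m x k) * B(k x n), processed in MC x KC x NC blocks.
// Packing and the micro-kernel bodies are elided; only the loop
// structure that the block sizes control is shown.
void BlockedGemm(int m, int n, int k, const float* A, int lda,
                 const float* B, int ldb, float* C, int ldc) {
  for (int jc = 0; jc < n; jc += NC) {      // NC-wide panel of B and C
    const int nc = std::min(NC, n - jc);
    for (int pc = 0; pc < k; pc += KC) {    // KC-deep slice; pack B here
      const int kc = std::min(KC, k - pc);
      // PackB(kc, nc, &B[pc * ldb + jc], ldb, packedB);  // cf. PackMatrixB_
      for (int ic = 0; ic < m; ic += MC) {  // MC-tall slice; pack A here
        const int mc = std::min(MC, m - ic);
        // PackA(mc, kc, &A[ic * lda + pc], lda, packedA);
        // Micro-kernel: MR x NR register tiles over packedA x packedB,
        // accumulating into C[ic..ic+mc, jc..jc+nc].
        (void)nc; (void)kc; (void)mc;       // placeholders keep this compilable
      }
    }
  }
}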