diff --git a/src/operators/kernel/arm/depthwise_conv_kernel.cpp b/src/operators/kernel/arm/depthwise_conv_kernel.cpp
index 6cd4538c4540ff11d91a6f49d088ad38f6d992e7..e6f27b772562789e07807b2b56c1f9d73bf373a9 100644
--- a/src/operators/kernel/arm/depthwise_conv_kernel.cpp
+++ b/src/operators/kernel/arm/depthwise_conv_kernel.cpp
@@ -28,7 +28,6 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
   Tensor filter = *param.Filter();
   Tensor *output = param.Output();
   output->mutable_data<float>();
-
   int groups = param.Groups();
   std::vector<int> strides = param.Strides();
   std::vector<int> paddings = param.Paddings();
@@ -40,7 +39,6 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
 
   std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
   std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-
   size_t data_dim = filter_shape_vec.size() - 2;
   std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
   col_shape_vec[0] = input->dims()[1] / groups;
@@ -61,18 +59,13 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
     col_matrix.ShareDataWith(col);
     col_matrix.Resize(col_matrix_shape);
   }
-  //  DLOG << " col_shape = " << col_shape;
-  //  DLOG << " col_matrix_shape = " << col_matrix_shape;
 
   framework::DDim input_shape = framework::slice_ddim(
       input->dims(), 1, static_cast<int>(input->dims().size()));
-  //  DLOG << " input_shape = " << input_shape;
 
   framework::DDim filter_matrix_shape = {filter.dims()[0],
                                          filter.numel() / filter.dims()[0]};
   filter.Resize(filter_matrix_shape);
-  //  DLOG << " filter.dims() = " << filter.dims();
-
   framework::DDim output_matrix_shape = {
       output->dims()[1],
       output->numel() / (output->dims()[0] * output->dims()[1])};
@@ -87,8 +80,6 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
   for (int i = 0; i < batch_size; i++) {
     Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
     Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-    //    DLOG << " in_batch.dims() = " << in_batch.dims();
-    //    DLOG << " out_batch.dims() = " << out_batch.dims();
 
     for (int g = 0; g < groups; g++) {
       Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
@@ -111,13 +102,9 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
       // gemm
       Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
       Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      //      DLOG << " out_slice " << out_slice.dims();
-      //      DLOG << " filter_slice " << filter_slice.dims();
-      //      DLOG << " col_matrix " << col_matrix.dims();
       math::matmul<float>(filter_slice, false, col_matrix, false,
                           static_cast<float>(1), &out_slice,
                           static_cast<float>(0));
-      auto filter_ptr = filter_slice.data<float>();
     }
   }
 }
diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp
index fc243766bf9f8760178ac4efb0dfdd11a5742fa9..d69ae00d4aed04e42736afd10f51c88022387e29 100644
--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
@@ -114,10 +114,23 @@ void PackMatrixB_(int k, int n, int paddingN, const float *B, int ldb,
   for (j = 0; j < n - paddingN; j += NR) {
     for (i = 0; i < k; ++i) {
       Bij = &B(i, j);
+#if (defined(__ARM_NEON) || defined(__ARM_NEON__)) && defined(__arm__)
+      // 32-bit ARM with NEON: copy the four floats starting at Bij with one
+      // 128-bit load, then store them and advance buffer via writeback.
+      asm volatile(
+          "vld1.32    {q0}, [%[Bij]]        \n\t"
+          "vst1.32    {q0}, [%[buffer]]!    \n\t"
+          : [buffer] "+r"(buffer)
+          : [Bij] "r"(Bij)
+          : "memory", "q0");
+#else
+      // Portable fallback: the A32 mnemonics above do not assemble on
+      // AArch64 or non-ARM hosts, so copy the four floats one at a time.
       *buffer++ = *Bij;
       *buffer++ = *(Bij + 1);
       *buffer++ = *(Bij + 2);
       *buffer++ = *(Bij + 3);
+#endif
     }
   }
   if (paddingN != 0) {
diff --git a/src/operators/math/gemm.h b/src/operators/math/gemm.h
index 87d65bdd28a42c4510668345ad7ce7058eb2cdf8..e510f4cdc9c6a1163914f72f73f1722529df9e16 100644
--- a/src/operators/math/gemm.h
+++ b/src/operators/math/gemm.h
@@ -20,9 +20,9 @@ limitations under the License. */
 #define C(i, j) C[(i)*ldc + (j)]
 
-// 分块计算的块大小，mc 与 kc 分别对应分块计算时的 m 与 k
+// Block sizes for blocked computation; mc and kc correspond to its m and k
-#define MC 384
-#define KC 384
-#define NC 4096
+#define MC 128
+#define KC 128
+#define NC 1024
 #define MR 4
 #define NR 4