diff --git a/src/operators/kernel/arm/depthwise_conv_kernel.cpp b/src/operators/kernel/arm/depthwise_conv_kernel.cpp index 6cd4538c4540ff11d91a6f49d088ad38f6d992e7..e6f27b772562789e07807b2b56c1f9d73bf373a9 100644 --- a/src/operators/kernel/arm/depthwise_conv_kernel.cpp +++ b/src/operators/kernel/arm/depthwise_conv_kernel.cpp @@ -28,7 +28,6 @@ void DepthwiseConvKernel::Compute(const ConvParam ¶m) const { Tensor filter = *param.Filter(); Tensor *output = param.Output(); output->mutable_data(); - int groups = param.Groups(); std::vector strides = param.Strides(); std::vector paddings = param.Paddings(); @@ -40,7 +39,6 @@ void DepthwiseConvKernel::Compute(const ConvParam ¶m) const { std::vector filter_shape_vec(framework::vectorize(filter.dims())); std::vector output_shape_vec(framework::vectorize(output->dims())); - size_t data_dim = filter_shape_vec.size() - 2; std::vector col_shape_vec(1 + 2 * data_dim); col_shape_vec[0] = input->dims()[1] / groups; @@ -61,18 +59,13 @@ void DepthwiseConvKernel::Compute(const ConvParam ¶m) const { col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); } - // DLOG << " col_shape = " << col_shape; - // DLOG << " col_matrix_shape = " << col_matrix_shape; framework::DDim input_shape = framework::slice_ddim( input->dims(), 1, static_cast(input->dims().size())); - // DLOG << " input_shape = " << input_shape; framework::DDim filter_matrix_shape = {filter.dims()[0], filter.numel() / filter.dims()[0]}; filter.Resize(filter_matrix_shape); - // DLOG << " filter.dims() = " << filter.dims(); - framework::DDim output_matrix_shape = { output->dims()[1], output->numel() / (output->dims()[0] * output->dims()[1])}; @@ -87,8 +80,6 @@ void DepthwiseConvKernel::Compute(const ConvParam ¶m) const { for (int i = 0; i < batch_size; i++) { Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); - // DLOG << " in_batch.dims() = " << in_batch.dims(); - // DLOG << " out_batch.dims() = " << out_batch.dims(); for (int g = 0; g < groups; g++) { Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); @@ -111,13 +102,9 @@ void DepthwiseConvKernel::Compute(const ConvParam ¶m) const { // gemm Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - // DLOG << " out_slice " << out_slice.dims(); - // DLOG << " filter_slice " << filter_slice.dims(); - // DLOG << " col_matrix " << col_matrix.dims(); math::matmul(filter_slice, false, col_matrix, false, static_cast(1), &out_slice, static_cast(0)); - auto filter_ptr = filter_slice.data(); } } } diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp index fc243766bf9f8760178ac4efb0dfdd11a5742fa9..d69ae00d4aed04e42736afd10f51c88022387e29 100644 --- a/src/operators/math/gemm.cpp +++ b/src/operators/math/gemm.cpp @@ -114,10 +114,12 @@ void PackMatrixB_(int k, int n, int paddingN, const float *B, int ldb, for (j = 0; j < n - paddingN; j += NR) { for (i = 0; i < k; ++i) { Bij = &B(i, j); - *buffer++ = *Bij; - *buffer++ = *(Bij + 1); - *buffer++ = *(Bij + 2); - *buffer++ = *(Bij + 3); + asm volatile( + "vld1.32 {q0}, [%[Bij]] \n\t" + "vst1.32 {q0}, [%[buffer]]! \n\t" + : [buffer] "+r"(buffer) + : [Bij] "r"(Bij) + : "memory", "q0"); } } if (paddingN != 0) { diff --git a/src/operators/math/gemm.h b/src/operators/math/gemm.h index 87d65bdd28a42c4510668345ad7ce7058eb2cdf8..e510f4cdc9c6a1163914f72f73f1722529df9e16 100644 --- a/src/operators/math/gemm.h +++ b/src/operators/math/gemm.h @@ -20,9 +20,9 @@ limitations under the License. */ #define C(i, j) C[(i)*ldc + (j)] // 分块计算的块大小,mc 与 kc 分别对应分块计算时的 m 与 k -#define MC 384 -#define KC 384 -#define NC 4096 +#define MC 128 +#define KC 128 +#define NC 1024 #define MR 4 #define NR 4