// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//     http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/backends/arm/math/packed_sgemm.h"
#include <arm_neon.h>
#include "lite/backends/arm/math/conv_block_utils.h"

namespace paddle {
namespace lite {
namespace arm {
namespace math {

#ifdef __aarch64__
void prepackA_8x12(float *out, const float *in, float alpha, int ldin,
                   int m0, int mmax, int k0, int kmax);
void prepackA_trans_8x12(float *out, const float *in, float alpha, int ldin,
                         int m0, int mmax, int k0, int kmax);
void sgemm_prepacked_8x12(bool is_transB, int M, int N, int K,
                          const float *A_packed, const float *B, int ldb,
                          float beta, float *C, int ldc, const float *bias,
                          bool has_bias,
                          const operators::ActivationParam act_param,
                          ARMContext *ctx);
void prepackA_4x8(float *out, const float *in, float alpha, int ldin,
                  int m0, int mmax, int k0, int kmax);
void prepackA_trans_4x8(float *out, const float *in, float alpha, int ldin,
                        int m0, int mmax, int k0, int kmax);
void sgemm_prepacked_4x8(bool is_transB, int M, int N, int K,
                         const float *A_packed, const float *B, int ldb,
                         float beta, float *C, int ldc, const float *bias,
                         bool has_bias,
                         const operators::ActivationParam act_param,
                         ARMContext *ctx);
void pack_m4(float *out, const float *in, float alpha, int ldin,
             int m0, int mmax, int k0, int kmax);
void pack_trans_m4(float *out, const float *in, float alpha, int ldin,
                   int m0, int mmax, int k0, int kmax);
void sgemm_prepacked_4x4(bool is_transB, int M, int N, int K,
                         const float *A_packed, const float *B, int ldb,
                         float beta, float *C, int ldc, const float *bias,
                         bool has_bias,
                         const operators::ActivationParam act_param,
                         ARMContext *ctx);
#else
// for kA72
void prepackA_6x8(float *out, const float *in, float alpha, int ldin,
                  int m0, int mmax, int k0, int kmax);
void prepackA_trans_6x8(float *out, const float *in, float alpha, int ldin,
                        int m0, int mmax, int k0, int kmax);
// for kA73
void prepackA_4x8(float *out, const float *in, float alpha, int ldin,
                  int m0, int mmax, int k0, int kmax);
void prepackA_trans_4x8(float *out, const float *in, float alpha, int ldin,
                        int m0, int mmax, int k0, int kmax);
// for kA72, 6x8
void sgemm_prepacked_6x8(bool is_transB, int M, int N, int K,
                         const float *A_packed, const float *B, int ldb,
                         float beta, float *C, int ldc, const float *bias,
                         bool has_bias,
                         const operators::ActivationParam act_param,
                         ARMContext *ctx);
// for kA73, 4x8
void sgemm_prepacked_4x8(bool is_transB, int M, int N, int K,
                         const float *A_packed, const float *B, int ldb,
                         float beta, float *C, int ldc, const float *bias,
                         bool has_bias,
                         const operators::ActivationParam act_param,
                         ARMContext *ctx);
// for kA53
void sgemm_prepacked_6x8_a53(bool is_transB, int M, int N, int K,
                             const float *A_packed, const float *B, int ldb,
                             float *C, int ldc, const float *bias,
                             bool has_bias, int is_relu, ARMContext *ctx);
#endif  // __aarch64__
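// Usage sketch (illustrative only, not part of the library): one way a caller
// can combine prepackA() and sgemm_prepack() defined below. Assumes row-major
// A (m x k, lda == k), B (k x n, ldb == n), C (m x n, ldc == n), a valid
// ARMContext *ctx, and that a default-constructed ActivationParam means
// "no activation"; the packed-buffer size follows the rounding used by the
// prepackA(TensorLite...) overload in this file.
//
//   int hblock = get_hblock(ctx);
//   int m_round = hblock * ((m + hblock - 1) / hblock);
//   std::vector<float> a_packed(((m_round * k + 15) / 16) * 16);
//   prepackA(a_packed.data(), A, 1.f, k, 0, m, 0, k, false, ctx);
//   operators::ActivationParam act_param;
//   sgemm_prepack(false, m, n, k, a_packed.data(), B, n, 0.f,
//                 C, n, nullptr, false, act_param, ctx);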
/**
 * \brief input data is not transposed
 * for arm-v7a, transform data to block x k x 6 layout
 * for arm-v8a, transform data to block x k x 8 layout
 */
void prepackA(float *out,
              const float *in,
              float alpha,
              int ldin,
              int m0,
              int mmax,
              int k0,
              int kmax,
              bool is_trans,
              ARMContext *ctx) {
#ifdef __aarch64__
  if (mmax <= 4) {
    if (is_trans) {
      prepackA_trans_4x8(out, in, alpha, ldin, m0, mmax, k0, kmax);
    } else {
      prepackA_4x8(out, in, alpha, ldin, m0, mmax, k0, kmax);
    }
  } else {
    if (is_trans) {
      prepackA_trans_8x12(out, in, alpha, ldin, m0, mmax, k0, kmax);
    } else {
      prepackA_8x12(out, in, alpha, ldin, m0, mmax, k0, kmax);
    }
  }
#else
  if (ctx->arch() == kA73 || mmax <= 4) {
    if (is_trans) {
      prepackA_trans_4x8(out, in, alpha, ldin, m0, mmax, k0, kmax);
    } else {
      prepackA_4x8(out, in, alpha, ldin, m0, mmax, k0, kmax);
    }
  } else {
    if (is_trans) {
      prepackA_trans_6x8(out, in, alpha, ldin, m0, mmax, k0, kmax);
    } else {
      prepackA_6x8(out, in, alpha, ldin, m0, mmax, k0, kmax);
    }
  }
#endif
}

void prepackA(TensorLite *tout,
              const TensorLite &tin,
              float alpha,
              int m,
              int k,
              int group,
              bool is_trans,
              ARMContext *ctx) {
  int hblock = get_hblock(ctx);
  int m_roundup = hblock * ((m + hblock - 1) / hblock);
  int group_size_round_up = ((m_roundup * k + 15) / 16) * 16;
  if (tout->numel() < group_size_round_up * group) {
    tout->Resize({group_size_round_up * group});
  }
  int lda = k;
  if (is_trans) {
    lda = m;
  }
  for (int g = 0; g < group; ++g) {
    const float *weights_group = tin.data<float>() + g * m * k;
    float *weights_trans_ptr =
        tout->mutable_data<float>() + g * group_size_round_up;
    prepackA(weights_trans_ptr,
             weights_group,
             alpha,
             lda,
             0,
             m,
             0,
             k,
             is_trans,
             ctx);
  }
}

/// a: m*k b: k*n c: m*n
void sgemm_prepack(bool is_transB,
                   int M,
                   int N,
                   int K,
                   const float *A_packed,
                   const float *B,
                   int ldb,
                   float beta,
                   float *C,
                   int ldc,
                   const float *bias,
                   bool has_bias,
                   const operators::ActivationParam act_param,
                   ARMContext *ctx) {
#ifdef __aarch64__
  if (M <= 4) {
    sgemm_prepacked_4x8(is_transB, M, N, K, A_packed, B, ldb, beta, C, ldc,
                        bias, has_bias, act_param, ctx);
  } else {
    sgemm_prepacked_8x12(is_transB, M, N, K, A_packed, B, ldb, beta, C, ldc,
                         bias, has_bias, act_param, ctx);
  }
#else   // armv7
  if (ctx->arch() == kA73 || M <= 4) {
    sgemm_prepacked_4x8(is_transB, M, N, K, A_packed, B, ldb, beta, C, ldc,
                        bias, has_bias, act_param, ctx);
  } else if (ctx->arch() == kA53) {
    // The dedicated kA53 6x8 kernel is used only when beta is zero and the
    // activation is either absent or a plain ReLU; otherwise fall back to
    // the generic 6x8 kernel.
    auto act_type = act_param.active_type;
    bool has_act = act_param.has_active;
    bool act_flag = (has_act == false) ||
                    (has_act == true &&
                     act_type == lite_api::ActivationType::kRelu);
    bool has_beta = fabsf(beta) > 1e-8f ? true : false;
    bool a53_sgemm = act_flag && !has_beta && ctx->has_a53_valid();
    if (a53_sgemm) {
      sgemm_prepacked_6x8_a53(is_transB, M, N, K, A_packed, B, ldb, C, ldc,
                              bias, has_bias, static_cast<int>(has_act), ctx);
    } else {
      sgemm_prepacked_6x8(is_transB, M, N, K, A_packed, B, ldb, beta, C, ldc,
                          bias, has_bias, act_param, ctx);
    }
  } else {
    sgemm_prepacked_6x8(is_transB, M, N, K, A_packed, B, ldb, beta, C, ldc,
                        bias, has_bias, act_param, ctx);
  }
#endif  // arm64
}

#ifdef __aarch64__
/*
 * The following function prepackA_8x12 is based on
 * https://github.com/ARM-software/ComputeLibrary/
 *
 * Copyright (c) 2017-2019 ARM Limited.
* * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ void prepackA_8x12(float *dout, const float *inptr, float alpha, int ldin, int m0, int mmax, int k0, int kmax) { int x_len = kmax - k0; int stride = x_len * 8; float zerobuff[x_len]; // NOLINT memset(zerobuff, 0, sizeof(float) * x_len); bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; #pragma omp parallel for for (int y = m0; y < mmax; y += 8) { float *outptr = dout + stride * (y - m0) / 8; const float *inptr0 = inptr + y * ldin + k0; const float *inptr1 = inptr0 + ldin; const float *inptr2 = inptr1 + ldin; const float *inptr3 = inptr2 + ldin; const float *inptr4 = inptr3 + ldin; const float *inptr5 = inptr4 + ldin; const float *inptr6 = inptr5 + ldin; const float *inptr7 = inptr6 + ldin; asm volatile( "prfm pldl1keep, [%[ptr0]] \n" "prfm pldl1keep, [%[ptr0], #64] \n" "prfm pldl1keep, [%[ptr1]] \n" "prfm pldl1keep, [%[ptr1], #64] \n" "prfm pldl1keep, [%[ptr2]] \n" "prfm pldl1keep, [%[ptr2], #64] \n" "prfm pldl1keep, [%[ptr3]] \n" "prfm pldl1keep, [%[ptr3], #64] \n" "prfm pldl1keep, [%[ptr4]] \n" "prfm pldl1keep, [%[ptr4], #64] \n" "prfm pldl1keep, [%[ptr5]] \n" "prfm pldl1keep, [%[ptr5], #64] \n" "prfm pldl1keep, [%[ptr6]] \n" "prfm pldl1keep, [%[ptr6], #64] \n" "prfm pldl1keep, [%[ptr7]] \n" "prfm pldl1keep, [%[ptr7], #64] \n" : : [ptr0] "r"(inptr0), [ptr1] "r"(inptr1), [ptr2] "r"(inptr2), [ptr3] "r"(inptr3), [ptr4] "r"(inptr4), [ptr5] "r"(inptr5), [ptr6] "r"(inptr6), [ptr7] "r"(inptr7) : "memory"); int x = x_len; //! cope with row index exceed real size, set to zero buffer if ((y + 7) >= mmax) { switch ((y + 7) - mmax) { case 6: inptr1 = zerobuff; case 5: inptr2 = zerobuff; case 4: inptr3 = zerobuff; case 3: inptr4 = zerobuff; case 2: inptr5 = zerobuff; case 1: inptr6 = zerobuff; case 0: inptr7 = zerobuff; default: break; } } for (; x > 7; x -= 8) { asm volatile( "cbz %w[has_alpha], 0f\n" /* check alpha == 1.f? 
*/ "dup v31.4s, %w[alpha]\n" /* alpha to vector */ "ldp q0, q1, [%[inptr0]], #32\n" /* load r0, a0~a7 */ "ldp q2, q3, [%[inptr1]], #32\n" /* load r1, b0~b7 */ "fmul v0.4s, v31.4s, v0.4s\n" /* mul alpha */ "fmul v1.4s, v31.4s, v1.4s\n" /* mul alpha */ "ldp q4, q5, [%[inptr2]], #32\n" /* load r2, c0~c7 */ "fmul v2.4s, v31.4s, v2.4s\n" /* mul alpha */ "fmul v3.4s, v31.4s, v3.4s\n" /* mul alpha */ "ldp q6, q7, [%[inptr3]], #32\n" /* load r3, d0~d7 */ "fmul v4.4s, v31.4s, v4.4s\n" /* mul alpha */ "fmul v5.4s, v31.4s, v5.4s\n" /* mul alpha */ "ldp q8, q9, [%[inptr4]], #32\n" /* load r4, e0~e7 */ "fmul v6.4s, v31.4s, v6.4s\n" /* mul alpha */ "fmul v7.4s, v31.4s, v7.4s\n" /* mul alpha */ "ldp q10, q11, [%[inptr5]], #32\n" /* load r5, f0~f7 */ "fmul v8.4s, v31.4s, v8.4s\n" /* mul alpha */ "fmul v9.4s, v31.4s, v9.4s\n" /* mul alpha */ "ldp q12, q13, [%[inptr6]], #32\n" /* load r6, g0~g7 */ "fmul v10.4s, v31.4s, v10.4s\n" /* mul alpha */ "fmul v11.4s, v31.4s, v11.4s\n" /* mul alpha */ "ldp q14, q15, [%[inptr7]], #32\n" /* load r7, h0~h7 */ "fmul v12.4s, v31.4s, v12.4s\n" /* mul alpha */ "fmul v13.4s, v31.4s, v13.4s\n" /* mul alpha */ "fmul v14.4s, v31.4s, v14.4s\n" /* mul alpha */ "fmul v15.4s, v31.4s, v15.4s\n" /* mul alpha */ "b 1f\n" /* to main process */ "0: \n" /* alpha == 1 */ "ldp q0, q1, [%[inptr0]], #32\n" /* load r0, a0~a7 */ "ldp q2, q3, [%[inptr1]], #32\n" /* load r1, b0~b7 */ "ldp q4, q5, [%[inptr2]], #32\n" /* load r2, c0~c7 */ "ldp q6, q7, [%[inptr3]], #32\n" /* load r3, d0~d7 */ "ldp q8, q9, [%[inptr4]], #32\n" /* load r4, e0~e7 */ "ldp q10, q11, [%[inptr5]], #32\n" /* load r5, f0~f7 */ "ldp q12, q13, [%[inptr6]], #32\n" /* load r6, g0~g7 */ "ldp q14, q15, [%[inptr7]], #32\n" /* load r7, h0~h7 */ "1: \n" /* main process */ "trn1 v16.4s, v0.4s, v2.4s\n" /* a0b0a2b2*/ "trn2 v17.4s, v0.4s, v2.4s\n" /* a1b1a3b3*/ "trn1 v18.4s, v1.4s, v3.4s\n" /* a4b4a6b6*/ "trn2 v19.4s, v1.4s, v3.4s\n" /* a5b5a7b7*/ "trn1 v20.4s, v4.4s, v6.4s\n" /* c0d0c2d2*/ "trn2 v21.4s, v4.4s, v6.4s\n" /* c1d1c3d3*/ "trn1 v22.4s, v5.4s, v7.4s\n" /* c4d4c6d6*/ "trn2 v23.4s, v5.4s, v7.4s\n" /* c5d5c7d7*/ "trn1 v24.4s, v8.4s, v10.4s\n" /* e0f0e2f2*/ "trn2 v25.4s, v8.4s, v10.4s\n" /* e1f1e3f3*/ "trn1 v26.4s, v9.4s, v11.4s\n" /* e4f4e6f6*/ "trn2 v27.4s, v9.4s, v11.4s\n" /* e5f5e7f7*/ "trn1 v28.4s, v12.4s, v14.4s\n" /* g0h0g2h2*/ "trn2 v29.4s, v12.4s, v14.4s\n" /* g1h1g3h3*/ "trn1 v30.4s, v13.4s, v15.4s\n" /* g4h4g6h6*/ "trn2 v31.4s, v13.4s, v15.4s\n" /* g5h5g7h7*/ "trn1 v0.2d, v16.2d, v20.2d\n" /* a0b0c0d0 */ "trn1 v1.2d, v24.2d, v28.2d\n" /* e0f0g0h0 */ "trn1 v2.2d, v17.2d, v21.2d\n" /* a1b1c1d1 */ "trn1 v3.2d, v25.2d, v29.2d\n" /* e1b1c1d1 */ "trn2 v4.2d, v16.2d, v20.2d\n" /* a2b2c2d2 */ "trn2 v5.2d, v24.2d, v28.2d\n" /* e2f2g2h2 */ "stp q0, q1, [%[outptr]], #32\n" /* save q0, q1, a0~h0*/ "trn2 v6.2d, v17.2d, v21.2d\n" /* a3b3c3d3 */ "trn2 v7.2d, v25.2d, v29.2d\n" /* e3f3g3h3 */ "stp q2, q3, [%[outptr]], #32\n" /* save q2, q3, a1~h1*/ "trn1 v8.2d, v18.2d, v22.2d\n" /* a4b4c4d4 */ "trn1 v9.2d, v26.2d, v30.2d\n" /* e4f4g4h4 */ "stp q4, q5, [%[outptr]], #32\n" /* save q4, q5, a2~h2*/ "trn1 v10.2d, v19.2d, v23.2d\n" /* a5b5c5d5 */ "trn1 v11.2d, v27.2d, v31.2d\n" /* e5f5g5h5 */ "stp q6, q7, [%[outptr]], #32\n" /* save q6, q7, a3~h3*/ "trn2 v12.2d, v18.2d, v22.2d\n" /* a6b6c6d6 */ "trn2 v13.2d, v26.2d, v30.2d\n" /* e6f6g6h6 */ "stp q8, q9, [%[outptr]], #32\n" /* save q8, q9, a4~h4*/ "trn2 v14.2d, v19.2d, v23.2d\n" /* a7b7c7d7 */ "trn2 v15.2d, v27.2d, v31.2d\n" /* e7f7g7h7 */ "stp q10, q11, [%[outptr]], #32\n" /* save q10, q11, 
a5~h5*/ "stp q12, q13, [%[outptr]], #32\n" /* save q12, q13, a6~h6*/ "stp q14, q15, [%[outptr]], #32\n" /* save q14, q15, a7~h7*/ : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr) : [alpha] "r"(alpha), [has_alpha] "r"(has_alpha) : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"); } for (; x > 0; x--) { if (has_alpha) { *outptr++ = *inptr0++ * alpha; *outptr++ = *inptr1++ * alpha; *outptr++ = *inptr2++ * alpha; *outptr++ = *inptr3++ * alpha; *outptr++ = *inptr4++ * alpha; *outptr++ = *inptr5++ * alpha; *outptr++ = *inptr6++ * alpha; *outptr++ = *inptr7++ * alpha; } else { *outptr++ = *inptr0++; *outptr++ = *inptr1++; *outptr++ = *inptr2++; *outptr++ = *inptr3++; *outptr++ = *inptr4++; *outptr++ = *inptr5++; *outptr++ = *inptr6++; *outptr++ = *inptr7++; } } } } void prepackA_4x8(float *outptr, const float *inptr, float alpha, int ldin, int m0, int mmax, int k0, int kmax) { int x_len = kmax - k0; float zerobuff[x_len]; // NOLINT memset(zerobuff, 0, sizeof(float) * x_len); bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; float32x4_t valpha = vdupq_n_f32(alpha); #pragma omp parallel for for (int y = m0; y < mmax; y += 4) { const float *inptr0 = inptr + y * ldin + k0; const float *inptr1 = inptr0 + ldin; const float *inptr2 = inptr1 + ldin; const float *inptr3 = inptr2 + ldin; asm volatile( "prfm pldl1keep, [%[ptr0]] \n" "prfm pldl1keep, [%[ptr0], #64] \n" "prfm pldl1keep, [%[ptr1]] \n" "prfm pldl1keep, [%[ptr1], #64] \n" "prfm pldl1keep, [%[ptr2]] \n" "prfm pldl1keep, [%[ptr2], #64] \n" "prfm pldl1keep, [%[ptr3]] \n" "prfm pldl1keep, [%[ptr3], #64] \n" : : [ptr0] "r"(inptr0), [ptr1] "r"(inptr1), [ptr2] "r"(inptr2), [ptr3] "r"(inptr3) : "memory"); int x = x_len; if ((y + 3) >= mmax) { switch ((y + 3) - mmax) { case 2: inptr1 = zerobuff; case 1: inptr2 = zerobuff; case 0: inptr3 = zerobuff; default: break; } } for (; x > 7; x -= 8) { // clang-format off asm volatile( "cbz %w[has_alpha], 0f\n" "ldp q0, q1, [%[inptr0]], #32\n" // load r0, a0~a7 "ldp q2, q3, [%[inptr1]], #32\n" // load r1, b0~b7 "fmul v0.4s, v0.4s, %[alpha].4s\n" "fmul v1.4s, v1.4s, %[alpha].4s\n" "ldp q4, q5, [%[inptr2]], #32\n" // load r2, c0~c7 "fmul v2.4s, v2.4s, %[alpha].4s\n" "fmul v3.4s, v3.4s, %[alpha].4s\n" "ldp q6, q7, [%[inptr3]], #32\n" // load r3, d0~d7 "fmul v4.4s, v4.4s, %[alpha].4s\n" "fmul v5.4s, v5.4s, %[alpha].4s\n" "fmul v6.4s, v6.4s, %[alpha].4s\n" "fmul v7.4s, v7.4s, %[alpha].4s\n" "b 1f\n" // to main process "0: \n" // alpha == 1 "ldp q0, q1, [%[inptr0]], #32\n" // load r0, a0~a7 "ldp q2, q3, [%[inptr1]], #32\n" // load r1, b0~b7 "ldp q4, q5, [%[inptr2]], #32\n" // load r2, c0~c7 "ldp q6, q7, [%[inptr3]], #32\n" // load r3, d0~d7 "1: \n" "trn1 v8.4s, v0.4s, v2.4s\n" // a0b0a2b2 "trn2 v9.4s, v0.4s, v2.4s\n" // a1b1a3b3 "trn1 v10.4s, v1.4s, v3.4s\n" // a4b4a6b6 "trn2 v11.4s, v1.4s, v3.4s\n" // a5b5a7b7 "trn1 v12.4s, v4.4s, v6.4s\n" // c0d0c2d2 "trn2 v13.4s, v4.4s, v6.4s\n" // c1d1c3d3 "trn1 v14.4s, v5.4s, v7.4s\n" // c4d4c6d6 "trn2 v15.4s, v5.4s, v7.4s\n" // c5d5c7d7 "trn1 v0.2d, v8.2d, v12.2d\n" // a0b0c0d0 "trn1 v1.2d, v9.2d, v13.2d\n" // a1b1c1d1 "trn2 v2.2d, v8.2d, v12.2d\n" // a2b2c2d2 "trn2 v3.2d, v9.2d, v13.2d\n" // a3b3c3d3 "st1 {v0.4s}, [%[outptr]], #16\n" "trn1 v4.2d, 
v10.2d, v14.2d\n" // a4b4c4d4 "st1 {v1.4s}, [%[outptr]], #16\n" "trn1 v5.2d, v11.2d, v15.2d\n" // a5b5c5d5 "st1 {v2.4s}, [%[outptr]], #16\n" "trn2 v6.2d, v10.2d, v14.2d\n" // a6b6c6d6 "st1 {v3.4s}, [%[outptr]], #16\n" "trn2 v7.2d, v11.2d, v15.2d\n" // a7b7c7d7 "stp q4, q5, [%[outptr]], #32\n" "stp q6, q7, [%[outptr]], #32\n" : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3), [outptr] "+r"(outptr) : [has_alpha] "r"(has_alpha), [alpha] "w"(valpha) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"); // clang-format on } for (; x > 0; x--) { if (has_alpha) { *outptr++ = *inptr0++ * alpha; *outptr++ = *inptr1++ * alpha; *outptr++ = *inptr2++ * alpha; *outptr++ = *inptr3++ * alpha; } else { *outptr++ = *inptr0++; *outptr++ = *inptr1++; *outptr++ = *inptr2++; *outptr++ = *inptr3++; } } } } void pack_m4(float *dout, const float *inptr, float alpha, int ldin, int m0, int mmax, int k0, int kmax) { int x_len = kmax - k0; int stride = x_len * 4; float zerobuff[x_len]; // NOLINT memset(zerobuff, 0, sizeof(float) * x_len); bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; #pragma omp parallel for for (int y = m0; y < mmax; y += 4) { float *outptr = dout + stride * (y - m0) / 4; const float *inptr0 = inptr + y * ldin + k0; const float *inptr1 = inptr0 + ldin; const float *inptr2 = inptr1 + ldin; const float *inptr3 = inptr2 + ldin; asm volatile( "prfm pldl1keep, [%[ptr0]] \n" "prfm pldl1keep, [%[ptr0], #64] \n" "prfm pldl1keep, [%[ptr1]] \n" "prfm pldl1keep, [%[ptr1], #64] \n" "prfm pldl1keep, [%[ptr2]] \n" "prfm pldl1keep, [%[ptr2], #64] \n" "prfm pldl1keep, [%[ptr3]] \n" "prfm pldl1keep, [%[ptr3], #64] \n" : : [ptr0] "r"(inptr0), [ptr1] "r"(inptr1), [ptr2] "r"(inptr2), [ptr3] "r"(inptr3) : "memory"); int x = x_len; //! cope with row index exceed real size, set to zero buffer if ((y + 3) >= mmax) { switch ((y + 3) - mmax) { case 2: inptr1 = zerobuff; case 1: inptr2 = zerobuff; case 0: inptr3 = zerobuff; default: break; } } for (; x > 7; x -= 8) { asm volatile( "cbz %w[has_alpha], 0f\n" /* check alpha == 1.f? 
*/ "dup v31.4s, %w[alpha]\n" /* alpha to vector */ "ldp q0, q1, [%[inptr0]], #32\n" /* load r0, a0~a7 */ "ldp q2, q3, [%[inptr1]], #32\n" /* load r1, b0~b7 */ "fmul v0.4s, v31.4s, v0.4s\n" /* mul alpha */ "fmul v1.4s, v31.4s, v1.4s\n" /* mul alpha */ "ldp q4, q5, [%[inptr2]], #32\n" /* load r2, c0~c7 */ "fmul v2.4s, v31.4s, v2.4s\n" /* mul alpha */ "fmul v3.4s, v31.4s, v3.4s\n" /* mul alpha */ "ldp q6, q7, [%[inptr3]], #32\n" /* load r3, d0~d7 */ "fmul v4.4s, v31.4s, v4.4s\n" /* mul alpha */ "fmul v5.4s, v31.4s, v5.4s\n" /* mul alpha */ "fmul v6.4s, v31.4s, v6.4s\n" /* mul alpha */ "fmul v7.4s, v31.4s, v7.4s\n" /* mul alpha */ "b 1f\n" /* to main process */ "0: \n" /* alpha == 1 */ "ldp q0, q1, [%[inptr0]], #32\n" /* load r0, a0~a7 */ "ldp q2, q3, [%[inptr1]], #32\n" /* load r1, b0~b7 */ "ldp q4, q5, [%[inptr2]], #32\n" /* load r2, c0~c7 */ "ldp q6, q7, [%[inptr3]], #32\n" /* load r3, d0~d7 */ "1: \n" /* main process */ "trn1 v8.4s, v0.4s, v2.4s\n" /* a0b0a2b2*/ "trn2 v9.4s, v0.4s, v2.4s\n" /* a1b1a3b3*/ "trn1 v10.4s, v1.4s, v3.4s\n" /* a4b4a6b6*/ "trn2 v11.4s, v1.4s, v3.4s\n" /* a5b5a7b7*/ "trn1 v12.4s, v4.4s, v6.4s\n" /* c0d0c2d2*/ "trn2 v13.4s, v4.4s, v6.4s\n" /* c1d1c3d3*/ "trn1 v14.4s, v5.4s, v7.4s\n" /* c4d4c6d6*/ "trn2 v15.4s, v5.4s, v7.4s\n" /* c5d5c7d7*/ "trn1 v0.2d, v8.2d, v12.2d\n" /* a0b0c0d0 */ "trn1 v1.2d, v9.2d, v13.2d\n" /* a1b1c1d1 */ "trn1 v2.2d, v10.2d, v14.2d\n" /* a4b4c4d4 */ "trn1 v3.2d, v11.2d, v15.2d\n" /* a5b5c5d5 */ "trn2 v4.2d, v8.2d, v12.2d\n" /* a2b2c2d2 */ "trn2 v5.2d, v9.2d, v13.2d\n" /* a3b3c3d3 */ "stp q0, q1, [%[outptr]], #32\n" /* save q0, q1, a0~h0*/ "trn2 v6.2d, v10.2d, v14.2d\n" /* a6b6c6d6 */ "trn2 v7.2d, v11.2d, v15.2d\n" /* a7b7c7d7 */ "stp q4, q5, [%[outptr]], #32\n" /* save q2, q3, a1~h1*/ "stp q2, q3, [%[outptr]], #32\n" /* save q4, q5, a2~h2*/ "stp q6, q7, [%[outptr]], #32\n" /* save q6, q7, a3~h3*/ : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3), [outptr] "+r"(outptr) : [alpha] "r"(alpha), [has_alpha] "r"(has_alpha) : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "cc", "memory"); } for (; x > 0; x--) { if (has_alpha) { *outptr++ = *inptr0++ * alpha; *outptr++ = *inptr1++ * alpha; *outptr++ = *inptr2++ * alpha; *outptr++ = *inptr3++ * alpha; } else { *outptr++ = *inptr0++; *outptr++ = *inptr1++; *outptr++ = *inptr2++; *outptr++ = *inptr3++; } } } } void prepackA_trans_8x12(float *outptr, const float *in, float alpha, int ldin, int m0, int mmax, int k0, int kmax) { auto inptr = in + k0 * ldin + m0; uint32_t mask_buffer[8] = {0, 1, 2, 3, 4, 5, 6, 7}; int x_len = mmax - m0; int y_len = kmax - k0; int right_remain = x_len - 8 * (x_len / 8); int stride_out = 8 * y_len; float32x4_t vzero = vdupq_n_f32(0.f); uint32x4_t vmask1 = vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); uint32x4_t vmask2 = vcltq_u32(vld1q_u32(mask_buffer + 4), vdupq_n_u32(right_remain)); bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; float32x4_t valpha = vdupq_n_f32(alpha); #pragma omp parallel for for (int y = 0; y < y_len - 3; y += 4) { const float *ptr0 = inptr + y * ldin; const float *ptr1 = ptr0 + ldin; const float *ptr2 = ptr1 + ldin; const float *ptr3 = ptr2 + ldin; asm volatile( "prfm pldl1keep, [%[ptr0]] \n" "prfm pldl1keep, [%[ptr0], #64] \n" "prfm pldl1keep, [%[ptr1]] \n" "prfm pldl1keep, [%[ptr1], #64] \n" "prfm pldl1keep, [%[ptr2]] \n" "prfm pldl1keep, [%[ptr2], #64] \n" "prfm pldl1keep, [%[ptr3]] \n" "prfm pldl1keep, [%[ptr3], #64] \n" : : [ptr0] 
"r"(ptr0), [ptr1] "r"(ptr1), [ptr2] "r"(ptr2), [ptr3] "r"(ptr3) : "memory"); float *outptr_row_col = outptr + y * 8; int i = 0; for (; i < x_len - 7; i += 8) { float32x4_t vr00 = vld1q_f32(ptr0); float32x4_t vr01 = vld1q_f32(ptr0 + 4); float32x4_t vr10 = vld1q_f32(ptr1); float32x4_t vr11 = vld1q_f32(ptr1 + 4); float32x4_t vr20 = vld1q_f32(ptr2); float32x4_t vr21 = vld1q_f32(ptr2 + 4); float32x4_t vr30 = vld1q_f32(ptr3); float32x4_t vr31 = vld1q_f32(ptr3 + 4); if (has_alpha) { vr00 = vmulq_f32(vr00, valpha); vr01 = vmulq_f32(vr01, valpha); vr10 = vmulq_f32(vr10, valpha); vr11 = vmulq_f32(vr11, valpha); vr20 = vmulq_f32(vr20, valpha); vr21 = vmulq_f32(vr21, valpha); vr30 = vmulq_f32(vr30, valpha); vr31 = vmulq_f32(vr31, valpha); } vst1q_f32(outptr_row_col, vr00); vst1q_f32(outptr_row_col + 4, vr01); vst1q_f32(outptr_row_col + 8, vr10); vst1q_f32(outptr_row_col + 12, vr11); vst1q_f32(outptr_row_col + 16, vr20); vst1q_f32(outptr_row_col + 20, vr21); vst1q_f32(outptr_row_col + 24, vr30); vst1q_f32(outptr_row_col + 28, vr31); ptr0 += 8; ptr1 += 8; ptr2 += 8; ptr3 += 8; outptr_row_col += stride_out; } if (right_remain > 0) { float32x4_t vr00 = vld1q_f32(ptr0); float32x4_t vr01 = vld1q_f32(ptr0 + 4); float32x4_t vr10 = vld1q_f32(ptr1); float32x4_t vr11 = vld1q_f32(ptr1 + 4); float32x4_t vr20 = vld1q_f32(ptr2); float32x4_t vr21 = vld1q_f32(ptr2 + 4); float32x4_t vr30 = vld1q_f32(ptr3); float32x4_t vr31 = vld1q_f32(ptr3 + 4); if (has_alpha) { vr00 = vmulq_f32(vr00, valpha); vr01 = vmulq_f32(vr01, valpha); vr10 = vmulq_f32(vr10, valpha); vr11 = vmulq_f32(vr11, valpha); vr20 = vmulq_f32(vr20, valpha); vr21 = vmulq_f32(vr21, valpha); vr30 = vmulq_f32(vr30, valpha); vr31 = vmulq_f32(vr31, valpha); } float32x4_t vr00_1 = vbslq_f32(vmask1, vr00, vzero); float32x4_t vr01_1 = vbslq_f32(vmask2, vr01, vzero); float32x4_t vr10_1 = vbslq_f32(vmask1, vr10, vzero); float32x4_t vr11_1 = vbslq_f32(vmask2, vr11, vzero); float32x4_t vr20_1 = vbslq_f32(vmask1, vr20, vzero); float32x4_t vr21_1 = vbslq_f32(vmask2, vr21, vzero); float32x4_t vr30_1 = vbslq_f32(vmask1, vr30, vzero); float32x4_t vr31_1 = vbslq_f32(vmask2, vr31, vzero); vst1q_f32(outptr_row_col, vr00_1); vst1q_f32(outptr_row_col + 4, vr01_1); vst1q_f32(outptr_row_col + 8, vr10_1); vst1q_f32(outptr_row_col + 12, vr11_1); vst1q_f32(outptr_row_col + 16, vr20_1); vst1q_f32(outptr_row_col + 20, vr21_1); vst1q_f32(outptr_row_col + 24, vr30_1); vst1q_f32(outptr_row_col + 28, vr31_1); } } #pragma omp parallel for for (int y = 4 * (y_len / 4); y < y_len; ++y) { const float *ptr0 = inptr + y * ldin; float *outptr_row_col = outptr + y * 8; int i = 0; for (; i < x_len - 7; i += 8) { float32x4_t vr0 = vld1q_f32(ptr0); float32x4_t vr1 = vld1q_f32(ptr0 + 4); if (has_alpha) { vr0 = vmulq_f32(vr0, valpha); vr1 = vmulq_f32(vr1, valpha); } vst1q_f32(outptr_row_col, vr0); vst1q_f32(outptr_row_col + 4, vr1); ptr0 += 8; outptr_row_col += stride_out; } if (right_remain > 0) { float32x4_t vr0 = vld1q_f32(ptr0); float32x4_t vr1 = vld1q_f32(ptr0 + 4); if (has_alpha) { vr0 = vmulq_f32(vr0, valpha); vr1 = vmulq_f32(vr1, valpha); } float32x4_t vr0_1 = vbslq_f32(vmask1, vr0, vzero); float32x4_t vr1_1 = vbslq_f32(vmask2, vr1, vzero); vst1q_f32(outptr_row_col, vr0_1); vst1q_f32(outptr_row_col + 4, vr1_1); } } } void prepackA_trans_4x8(float *outptr, const float *in, float alpha, int ldin, int m0, int mmax, int k0, int kmax) { auto inptr = in + k0 * ldin + m0; bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; float32x4_t valpha = vdupq_n_f32(alpha); uint32_t mask_buffer[8] = {0, 1, 2, 3, 4, 5, 
6, 7}; int x_len = mmax - m0; int y_len = kmax - k0; int right_remain = x_len - 4 * (x_len / 4); int right_pad = 4 - right_remain; if (right_remain == 0) { right_pad = 0; } int stride_out = 4 * y_len; float32x4_t vzero = vdupq_n_f32(0.f); uint32x4_t vmask1 = vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); #pragma omp parallel for for (int y = 0; y < y_len - 3; y += 4) { const float *ptr0 = inptr + y * ldin; const float *ptr1 = ptr0 + ldin; const float *ptr2 = ptr1 + ldin; const float *ptr3 = ptr2 + ldin; asm volatile( "prfm pldl1keep, [%[ptr0]] \n" "prfm pldl1keep, [%[ptr0], #64] \n" "prfm pldl1keep, [%[ptr1]] \n" "prfm pldl1keep, [%[ptr1], #64] \n" "prfm pldl1keep, [%[ptr2]] \n" "prfm pldl1keep, [%[ptr2], #64] \n" "prfm pldl1keep, [%[ptr3]] \n" "prfm pldl1keep, [%[ptr3], #64] \n" : : [ptr0] "r"(ptr0), [ptr1] "r"(ptr1), [ptr2] "r"(ptr2), [ptr3] "r"(ptr3) : "memory"); float *outptr_row_col = outptr + y * 4; int i = 0; for (; i < x_len - 3; i += 4) { float *ptr_out = outptr_row_col; // clang-format off asm volatile( "cmp %w[has_alpha], #0\n" "ld1 {v0.4s}, [%[ptr0]], #16\n" "ld1 {v1.4s}, [%[ptr1]], #16\n" "ld1 {v2.4s}, [%[ptr2]], #16\n" "ld1 {v3.4s}, [%[ptr3]], #16\n" "beq 0f\n" "1: \n" "fmul v0.4s, v0.4s, %[alpha].4s\n" "fmul v1.4s, v1.4s, %[alpha].4s\n" "fmul v2.4s, v2.4s, %[alpha].4s\n" "fmul v3.4s, v3.4s, %[alpha].4s\n" "0: \n" "stp q0, q1, [%[outptr]], #32\n" "stp q2, q3, [%[outptr]], #32\n" : [outptr] "+r"(ptr_out), [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), [ptr2] "+r"(ptr2), [ptr3] "+r"(ptr3) : [has_alpha] "r"(has_alpha), [alpha] "w"(valpha) : "v0", "v1", "v2", "v3", "cc", "memory"); // clang-format on outptr_row_col += stride_out; } if (right_pad > 0) { float *ptr_out = outptr_row_col; // clang-format off asm volatile( "cmp %w[has_alpha], #0\n" "ld1 {v0.4s}, [%[ptr0]], #16\n" "ld1 {v1.4s}, [%[ptr1]], #16\n" "ld1 {v2.4s}, [%[ptr2]], #16\n" "ld1 {v3.4s}, [%[ptr3]], #16\n" "beq 0f\n" "1: \n" "fmul v0.4s, v0.4s, %[alpha].4s\n" "fmul v1.4s, v1.4s, %[alpha].4s\n" "fmul v2.4s, v2.4s, %[alpha].4s\n" "fmul v3.4s, v3.4s, %[alpha].4s\n" "0: \n" "bif v0.16b, %[vzero].16b, %[vmask1].16b\n" "bif v1.16b, %[vzero].16b, %[vmask1].16b\n" "bif v2.16b, %[vzero].16b, %[vmask1].16b\n" "bif v3.16b, %[vzero].16b, %[vmask1].16b\n" "stp q0, q1, [%[outptr]], #32\n" "stp q2, q3, [%[outptr]], #32\n" : [outptr] "+r"(ptr_out), [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), [ptr2] "+r"(ptr2), [ptr3] "+r"(ptr3) : [vmask1] "w"(vmask1), [vzero] "w"(vzero), [has_alpha] "r"(has_alpha), [alpha] "w"(valpha) : "v0", "v1", "v2", "v3", "cc", "memory"); // clang-format on } } #pragma omp parallel for for (int y = 4 * (y_len / 4); y < y_len; ++y) { const float *ptr0 = inptr + y * ldin; float *outptr_row_col = outptr + y * 4; int i = 0; for (; i < x_len - 3; i += 4) { float *ptr_out = outptr_row_col; asm volatile( "cmp %[has_alpha], #0\n" "ld1 {v0.4s}, [%[ptr0]], #16\n" "beq 0f\n" "1: \n" "fmul v0.4s, v0.4s, %[alpha].4s\n" "0: \n" "st1 {v0.4s}, [%[outptr]], #16\n" : [ptr0] "+r"(ptr0), [outptr] "+r"(ptr_out) : [has_alpha] "r"(has_alpha), [alpha] "w"(valpha) : "v0", "v1", "cc", "memory"); outptr_row_col += stride_out; } if (right_pad > 0) { float *ptr_out = outptr_row_col; asm volatile( "cmp %w[has_alpha], #0\n" "ld1 {v0.4s}, [%[ptr0]], #16\n" "beq 0f\n" "1: \n" "fmul v0.4s, v0.4s, %[alpha].4s\n" "0: \n" "bif v0.16b, %[vzero].16b, %[vmask1].16b\n" "st1 {v0.4s}, [%[outptr]], #16\n" : [ptr0] "+r"(ptr0), [outptr] "+r"(ptr_out) : [vmask1] "w"(vmask1), [vzero] "w"(vzero), [has_alpha] "r"(has_alpha), [alpha] "w"(valpha) : "v0", "v1", 
"cc", "memory"); } } } void pack_trans_m4(float *outptr, const float *in, float alpha, int ldin, int m0, int mmax, int k0, int kmax) { auto inptr = in + k0 * ldin + m0; uint32_t mask_buffer[4] = {0, 1, 2, 3}; int x_len = mmax - m0; int y_len = kmax - k0; int right_remain = x_len - 4 * (x_len / 4); int stride_out = 4 * y_len; float32x4_t vzero = vdupq_n_f32(0.f); uint32x4_t vmask1 = vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; float32x4_t valpha = vdupq_n_f32(alpha); #pragma omp parallel for for (int y = 0; y < y_len - 3; y += 4) { const float *ptr0 = inptr + y * ldin; const float *ptr1 = ptr0 + ldin; const float *ptr2 = ptr1 + ldin; const float *ptr3 = ptr2 + ldin; asm volatile( "prfm pldl1keep, [%[ptr0]] \n" "prfm pldl1keep, [%[ptr0], #64] \n" "prfm pldl1keep, [%[ptr1]] \n" "prfm pldl1keep, [%[ptr1], #64] \n" "prfm pldl1keep, [%[ptr2]] \n" "prfm pldl1keep, [%[ptr2], #64] \n" "prfm pldl1keep, [%[ptr3]] \n" "prfm pldl1keep, [%[ptr3], #64] \n" : : [ptr0] "r"(ptr0), [ptr1] "r"(ptr1), [ptr2] "r"(ptr2), [ptr3] "r"(ptr3) : "memory"); float *outptr_row_col = outptr + y * 4; int i = 0; for (; i < x_len - 3; i += 4) { float32x4_t vr00 = vld1q_f32(ptr0); float32x4_t vr10 = vld1q_f32(ptr1); float32x4_t vr20 = vld1q_f32(ptr2); float32x4_t vr30 = vld1q_f32(ptr3); if (has_alpha) { vr00 = vmulq_f32(vr00, valpha); vr10 = vmulq_f32(vr10, valpha); vr20 = vmulq_f32(vr20, valpha); vr30 = vmulq_f32(vr30, valpha); } vst1q_f32(outptr_row_col, vr00); vst1q_f32(outptr_row_col + 4, vr10); vst1q_f32(outptr_row_col + 8, vr20); vst1q_f32(outptr_row_col + 12, vr30); ptr0 += 4; ptr1 += 4; ptr2 += 4; ptr3 += 4; outptr_row_col += stride_out; } if (right_remain > 0) { float32x4_t vr00 = vld1q_f32(ptr0); float32x4_t vr10 = vld1q_f32(ptr1); float32x4_t vr20 = vld1q_f32(ptr2); float32x4_t vr30 = vld1q_f32(ptr3); if (has_alpha) { vr00 = vmulq_f32(vr00, valpha); vr10 = vmulq_f32(vr10, valpha); vr20 = vmulq_f32(vr20, valpha); vr30 = vmulq_f32(vr30, valpha); } float32x4_t vr00_1 = vbslq_f32(vmask1, vr00, vzero); float32x4_t vr10_1 = vbslq_f32(vmask1, vr10, vzero); float32x4_t vr20_1 = vbslq_f32(vmask1, vr20, vzero); float32x4_t vr30_1 = vbslq_f32(vmask1, vr30, vzero); vst1q_f32(outptr_row_col, vr00_1); vst1q_f32(outptr_row_col + 4, vr10_1); vst1q_f32(outptr_row_col + 8, vr20_1); vst1q_f32(outptr_row_col + 12, vr30_1); } } #pragma omp parallel for for (int y = 4 * (y_len / 4); y < y_len; ++y) { const float *ptr0 = inptr + y * ldin; float *outptr_row_col = outptr + y * 4; int i = 0; for (; i < x_len - 3; i += 4) { float32x4_t vr0 = vld1q_f32(ptr0); if (has_alpha) { vr0 = vmulq_f32(vr0, valpha); } vst1q_f32(outptr_row_col, vr0); ptr0 += 4; outptr_row_col += stride_out; } if (right_remain > 0) { float32x4_t vr0 = vld1q_f32(ptr0); if (has_alpha) { vr0 = vmulq_f32(vr0, valpha); } float32x4_t vr0_1 = vbslq_f32(vmask1, vr0, vzero); vst1q_f32(outptr_row_col, vr0_1); } } } #else // __aarch64__ void prepackA_6x8(float* outptr, const float* inptr, float alpha, int ldin, int m0, int mmax, int k0, int kmax) { int x_len = kmax - k0; float zerobuff[x_len]; // NOLINT memset(zerobuff, 0, sizeof(float) * x_len); bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; float32x4_t valpha = vdupq_n_f32(alpha); for (int y = m0; y < mmax; y += 6) { const float* inptr0 = inptr + y * ldin + k0; const float* inptr1 = inptr0 + ldin; const float* inptr2 = inptr1 + ldin; const float* inptr3 = inptr2 + ldin; const float* inptr4 = inptr3 + ldin; const float* inptr5 = inptr4 + ldin; int x = x_len; if ((y + 5) >= 
mmax) { switch ((y + 5) - mmax) { case 4: inptr1 = zerobuff; case 3: inptr2 = zerobuff; case 2: inptr3 = zerobuff; case 1: inptr4 = zerobuff; case 0: inptr5 = zerobuff; default: break; } } for (; x > 7; x -= 8) { asm volatile( "vld4.32 {d0-d3}, [%[inptr0]]! @ zip load r0, " "q0,q1=r00,r04,r01,r05,r02,r06,r03,r07\n" "vld4.32 {d4-d7}, [%[inptr1]]! @ zip load r1, " "q2,q3=r10,r14,r11,r15,r12,r16,r13,r17\n" "vld4.32 {d8-d11}, [%[inptr2]]! @ zip load r2, " "q4,q5=r20,r24,r21,r25,r22,r26,r23,r27\n" "vld4.32 {d12-d15}, [%[inptr3]]! @ zip load r3, " "q6,q7=r30,r34,r31,r35,r32,r36,r33,r37\n" "vld4.32 {d16-d19}, [%[inptr4]]! @ zip load r4, " "q8,q9=r40,r44,r41,r45,r42,r46,r43,r47\n" "vld4.32 {d20-d23}, [%[inptr5]]! @ zip load r5, " "q10,q11=r50,r54,r51,r55,r52,r56,r53,r57\n" "cmp %[has_alpha], #0\n" "beq 0f\n" /* check whether alpha == 1? */ "vmul.f32 q0, q0, %q[alpha]\n" /* mul alpha */ "vmul.f32 q1, q1, %q[alpha]\n" /* mul alpha */ "vmul.f32 q2, q2, %q[alpha]\n" /* mul alpha */ "vmul.f32 q3, q3, %q[alpha]\n" /* mul alpha */ "vmul.f32 q4, q4, %q[alpha]\n" /* mul alpha */ "vmul.f32 q5, q5, %q[alpha]\n" /* mul alpha */ "vmul.f32 q6, q6, %q[alpha]\n" /* mul alpha */ "vmul.f32 q7, q7, %q[alpha]\n" /* mul alpha */ "vmul.f32 q8, q8, %q[alpha]\n" /* mul alpha */ "vmul.f32 q9, q9, %q[alpha]\n" /* mul alpha */ "vmul.f32 q10, q10, %q[alpha]\n" /* mul alpha */ "vmul.f32 q11, q11, %q[alpha]\n" /* mul alpha */ "0: \n" "vtrn.32 q0, q2 @ trans data: q0=r00,r10,r01,r11; " "q2=r04,r14,r05,r15\n" "vtrn.32 q4, q6 @ trans data: q4=r20,r30,r21,r31; " "q6=r24,r34,r25,r35\n" "vtrn.32 q8, q10 @ trans data: q8=r40,r50,r41,r51; " "q10=r44,r54,r45,r55\n" "vswp d1, d8 @ swap d1, d8, q0=r00,r10,r20,r30; " "q4=r01,r11,r21,r31\n" "vst1.32 {d0-d1}, [%[outptr]]! @ write q0:r00,r10,r20,r30\n" "vst1.32 {d16}, [%[outptr]]! @ write d16(q8,low),r40,r50\n" "vst1.32 {d8-d9}, [%[outptr]]! @ write q4:r01,r11,r21,r31\n" "vst1.32 {d17}, [%[outptr]]! @ write d16(q8,high),r41,r51\n" "vtrn.32 q1, q3 @ trans data: q1=r02,r12,r03,r13; " "q3=r06,r16,r07,r17\n" "vtrn.32 q5, q7 @ trans data: q5=r22,r32,r23,r33; " "q7=r26,r36,r27,r37\n" "vtrn.32 q9, q11 @ trans data: q9=r42,r52,r43,r53; " "q11=r46,r56,r47,r57\n" "vswp d3, d10 @ swap d3, d10, " "q1=r02,r12,r22,r32; q5=r03,r13,r23,r33\n" "vst1.32 {d2-d3}, [%[outptr]]! @ write q1:r02,r12,r22,r32\n" "vst1.32 {d18}, [%[outptr]]! @ write d18(q9,low),r42,r52\n" "vst1.32 {d10-d11},[%[outptr]]! @ write q5:r03,r13,r23,r33\n" "vst1.32 {d19}, [%[outptr]]! @ write d19(q9,high),r43,r53\n" "vswp d5, d12 @ swap d5, d12,q2=r04,r14,r24,r34; " "q6=r05,r15,r25,r35\n" "vst1.32 {d4-d5}, [%[outptr]]! @ write q2:r04,r14,r24,r34\n" "vst1.32 {d20}, [%[outptr]]! @ write d20(q10,low),r44,r54\n" "vst1.32 {d12-d13},[%[outptr]]! @ write q6:r05,r15,r25,r35\n" "vst1.32 {d21}, [%[outptr]]! @ write d21(q10,high),r45,r55\n" "vswp d7, d14 @ swap d7, d14, " "q3=r06,r16,r26,r36; q7=r07,r17,r27,r37\n" "vst1.32 {d6-d7}, [%[outptr]]! @ write q3:r06,r16,r26,r36\n" "vst1.32 {d22}, [%[outptr]]! @ write d22(q11,low),r46,r56\n" "vst1.32 {d14-d15},[%[outptr]]! @ write q7:r07,r17,r27,r37\n" "vst1.32 {d23}, [%[outptr]]! 
@ write d23(q11,high),r47,r57\n" : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [outptr] "+r"(outptr) : [has_alpha] "r"(has_alpha), [alpha] "w"(valpha) : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q15", "cc", "memory"); } for (; x > 0; x--) { if (has_alpha) { *outptr++ = *inptr0++ * alpha; *outptr++ = *inptr1++ * alpha; *outptr++ = *inptr2++ * alpha; *outptr++ = *inptr3++ * alpha; *outptr++ = *inptr4++ * alpha; *outptr++ = *inptr5++ * alpha; } else { *outptr++ = *inptr0++; *outptr++ = *inptr1++; *outptr++ = *inptr2++; *outptr++ = *inptr3++; *outptr++ = *inptr4++; *outptr++ = *inptr5++; } } } } void prepackA_trans_6x8(float* outptr, const float* in, float alpha, int ldin, int m0, int mmax, int k0, int kmax) { auto inptr = in + k0 * ldin + m0; bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; float32x4_t valpha = vdupq_n_f32(alpha); uint32_t mask_buffer[8] = {0, 1, 2, 3, 4, 5, 6, 7}; int x_len = mmax - m0; int y_len = kmax - k0; int right_remain = x_len - 6 * (x_len / 6); int right_pad = 6 - right_remain; if (right_remain == 0) { right_pad = 0; } float* outptr_row = outptr; int stride_out = 6 * y_len; float32x4_t vzero = vdupq_n_f32(0.f); uint32x4_t vmask1 = vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); uint32x4_t vmask2 = vcltq_u32(vld1q_u32(mask_buffer + 4), vdupq_n_u32(right_remain)); #pragma omp parallel for for (int y = 0; y < y_len - 3; y += 4) { const float* ptr0 = inptr + y * ldin; const float* ptr1 = ptr0 + ldin; const float* ptr2 = ptr1 + ldin; const float* ptr3 = ptr2 + ldin; float* outptr_row_col = outptr_row + y * 6; int i = 0; for (; i < x_len - 5; i += 6) { float* ptr_out = outptr_row_col; asm volatile( "vld1.32 {d0-d2}, [%[ptr0]]! @ load r0, 6 elements\n" "vld1.32 {d4-d6}, [%[ptr1]]! @ load r1, 6 elements\n" "vld1.32 {d8-d10}, [%[ptr2]]! @ load r2, 6 elements\n" "vld1.32 {d12-d14}, [%[ptr3]]! @ load r3, 6 elements\n" "cmp %[has_alpha], #0\n" "beq 0f\n" /* check whether alpha == 1? */ "vmul.f32 q0, q0, %q[alpha]\n" /* mul alpha */ "vmul.f32 d2, d2, %e[alpha]\n" /* mul alpha */ "vmul.f32 q2, q2, %q[alpha]\n" /* mul alpha */ "vmul.f32 d6, d6, %e[alpha]\n" /* mul alpha */ "vmul.f32 q4, q4, %q[alpha]\n" /* mul alpha */ "vmul.f32 d10, d10, %e[alpha]\n" /* mul alpha */ "vmul.f32 q6, q6, %q[alpha]\n" /* mul alpha */ "vmul.f32 d14, d14, %e[alpha]\n" /* mul alpha */ "0: \n" "vst1.32 {d0-d2}, [%[outptr]]! @ write to output ptr\n" "vst1.32 {d4-d6}, [%[outptr]]! @ write to output ptr\n" "vst1.32 {d8-d10}, [%[outptr]]! @ write to output ptr\n" "vst1.32 {d12-d14}, [%[outptr]]! @ write to output ptr\n" : [outptr] "+r"(ptr_out), [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), [ptr2] "+r"(ptr2), [ptr3] "+r"(ptr3) : [has_alpha] "r"(has_alpha), [alpha] "w"(valpha) : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "cc", "memory"); outptr_row_col += stride_out; } if (right_pad > 0) { float* ptr_out = outptr_row_col; asm volatile( "vld1.32 {d0-d2}, [%[ptr0]]! @ load r0, 6 elements\n" "vld1.32 {d4-d6}, [%[ptr1]]! @ load r1, 6 elements\n" "vld1.32 {d8-d10}, [%[ptr2]]! @ load r2, 8 elements\n" "vld1.32 {d12-d14}, [%[ptr3]]! @ load r3, 8 elements\n" "cmp %[has_alpha], #0\n" "beq 0f\n" /* check whether alpha == 1? 
*/ "vmul.f32 q0, q0, %q[alpha]\n" /* mul alpha */ "vmul.f32 d2, d2, %e[alpha]\n" /* mul alpha */ "vmul.f32 q2, q2, %q[alpha]\n" /* mul alpha */ "vmul.f32 d6, d6, %e[alpha]\n" /* mul alpha */ "vmul.f32 q4, q4, %q[alpha]\n" /* mul alpha */ "vmul.f32 d10, d10, %e[alpha]\n" /* mul alpha */ "vmul.f32 q6, q6, %q[alpha]\n" /* mul alpha */ "vmul.f32 d14, d14, %e[alpha]\n" /* mul alpha */ "0: \n" "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" "vbif d2, %e[vzero], %e[vmask2] @ bit select, pad zero\n" "vbif q2, %q[vzero], %q[vmask1] @ bit select, pad zero\n" "vbif d6, %e[vzero], %e[vmask2] @ bit select, pad zero\n" "vst1.32 {d0-d2}, [%[outptr]]! @ write to output ptr\n" "vst1.32 {d4-d6}, [%[outptr]]! @ write to output ptr\n" "vbif q4, %q[vzero], %q[vmask1] @ bit select, pad zero\n" "vbif d10, %e[vzero], %e[vmask2] @ bit select, pad zero\n" "vbif q6, %q[vzero], %q[vmask1] @ bit select, pad zero\n" "vbif d14, %e[vzero], %e[vmask2] @ bit select, pad zero\n" "vst1.32 {d8-d10}, [%[outptr]]! @ write to output ptr\n" "vst1.32 {d12-d14}, [%[outptr]]! @ write to output ptr\n" : [outptr] "+r"(ptr_out), [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), [ptr2] "+r"(ptr2), [ptr3] "+r"(ptr3) : [vmask1] "w"(vmask1), [vmask2] "w"(vmask2), [vzero] "w"(vzero), [has_alpha] "r"(has_alpha), [alpha] "w"(valpha) : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "cc", "memory"); } } #pragma omp parallel for for (int y = 4 * (y_len / 4); y < y_len; ++y) { const float* ptr0 = inptr + y * ldin; float* outptr_row_col = outptr_row + y * 6; int i = 0; for (; i < x_len - 5; i += 6) { float* ptr_out = outptr_row_col; asm volatile( "vld1.32 {d0-d2}, [%[ptr0]]! @ load r0, 6 elements\n" "cmp %[has_alpha], #0\n" "beq 0f\n" /* check whether alpha == 1? */ "vmul.f32 q0, q0, %q[alpha]\n" /* mul alpha */ "vmul.f32 d2, d2, %e[alpha]\n" /* mul alpha */ "0: \n" "vst1.32 {d0-d2}, [%[outptr]]! @ write to output ptr\n" : [ptr0] "+r"(ptr0), [outptr] "+r"(ptr_out) : [has_alpha] "r"(has_alpha), [alpha] "w"(valpha) : "q0", "q1", "cc", "memory"); outptr_row_col += stride_out; } if (right_pad > 0) { float* ptr_out = outptr_row_col; asm volatile( "vld1.32 {d0-d2}, [%[ptr0]]! @ load r0, 6 elements\n" "cmp %[has_alpha], #0\n" "beq 0f\n" /* check whether alpha == 1? */ "vmul.f32 q0, q0, %q[alpha]\n" /* mul alpha */ "vmul.f32 d2, d2, %e[alpha]\n" /* mul alpha */ "0: \n" "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" "vbif d2, %e[vzero], %e[vmask2] @ bit select, pad zero\n" "vst1.32 {d0-d2}, [%[outptr]]! @ write to output ptr\n" : [ptr0] "+r"(ptr0), [outptr] "+r"(ptr_out) : [vmask1] "w"(vmask1), [vmask2] "w"(vmask2), [vzero] "w"(vzero), [has_alpha] "r"(has_alpha), [alpha] "w"(valpha) : "q0", "q1", "cc", "memory"); } } } void prepackA_4x8(float* outptr, const float* inptr, float alpha, int ldin, int m0, int mmax, int k0, int kmax) { int x_len = kmax - k0; float zerobuff[x_len]; // NOLINT memset(zerobuff, 0, sizeof(float) * x_len); bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; float32x4_t valpha = vdupq_n_f32(alpha); for (int y = m0; y < mmax; y += 4) { const float* inptr0 = inptr + y * ldin + k0; const float* inptr1 = inptr0 + ldin; const float* inptr2 = inptr1 + ldin; const float* inptr3 = inptr2 + ldin; int x = x_len; if ((y + 3) >= mmax) { switch ((y + 3) - mmax) { case 2: inptr1 = zerobuff; case 1: inptr2 = zerobuff; case 0: inptr3 = zerobuff; default: break; } } for (; x > 7; x -= 8) { asm volatile( "vld4.32 {d0-d3}, [%[inptr0]]! @ zip load r0, " "q0,q1=r00,r04,r01,r05,r02,r06,r03,r07\n" "vld4.32 {d4-d7}, [%[inptr1]]! 
@ zip load r1, " "q2,q3=r10,r14,r11,r15,r12,r16,r13,r17\n" "vld4.32 {d8-d11}, [%[inptr2]]! @ zip load r2, " "q4,q5=r20,r24,r21,r25,r22,r26,r23,r27\n" "vld4.32 {d12-d15}, [%[inptr3]]! @ zip load r3, " "q6,q7=r30,r34,r31,r35,r32,r36,r33,r37\n" "cmp %[has_alpha], #0\n" "beq 0f\n" /* check whether alpha == 1? */ "vmul.f32 q0, q0, %q[alpha]\n" /* mul alpha */ "vmul.f32 q1, q1, %q[alpha]\n" /* mul alpha */ "vmul.f32 q2, q2, %q[alpha]\n" /* mul alpha */ "vmul.f32 q3, q3, %q[alpha]\n" /* mul alpha */ "vmul.f32 q4, q4, %q[alpha]\n" /* mul alpha */ "vmul.f32 q5, q5, %q[alpha]\n" /* mul alpha */ "vmul.f32 q6, q6, %q[alpha]\n" /* mul alpha */ "vmul.f32 q7, q7, %q[alpha]\n" /* mul alpha */ "0: \n" "vtrn.32 q0, q2 @ trans data: q0=r00,r10,r01,r11; " "q2=r04,r14,r05,r15\n" "vtrn.32 q4, q6 @ trans data: q4=r20,r30,r21,r31; " "q6=r24,r34,r25,r35\n" "vswp d1, d8 @ swap d1, d8, q0=r00,r10,r20,r30; " "q4=r01,r11,r21,r31\n" "vst1.32 {d0-d1}, [%[outptr]]! @ write q0:r00,r10,r20,r30\n" "vst1.32 {d8-d9}, [%[outptr]]! @ write q4:r01,r11,r21,r31\n" "vtrn.32 q1, q3 @ trans data: q1=r02,r12,r03,r13; " "q3=r06,r16,r07,r17\n" "vtrn.32 q5, q7 @ trans data: q5=r22,r32,r23,r33; " "q7=r26,r36,r27,r37\n" "vswp d3, d10 @ swap d3, d10, " "q1=r02,r12,r22,r32; q5=r03,r13,r23,r33\n" "vst1.32 {d2-d3}, [%[outptr]]! @ write q1:r02,r12,r22,r32\n" "vst1.32 {d10-d11},[%[outptr]]! @ write q5:r03,r13,r23,r33\n" "vswp d5, d12 @ swap d5, d12,q2=r04,r14,r24,r34; " "q6=r05,r15,r25,r35\n" "vst1.32 {d4-d5}, [%[outptr]]! @ write q2:r04,r14,r24,r34\n" "vst1.32 {d12-d13},[%[outptr]]! @ write q6:r05,r15,r25,r35\n" "vswp d7, d14 @ swap d7, d14, " "q3=r06,r16,r26,r36; q7=r07,r17,r27,r37\n" "vst1.32 {d6-d7}, [%[outptr]]! @ write q3:r06,r16,r26,r36\n" "vst1.32 {d14-d15},[%[outptr]]! @ write q7:r07,r17,r27,r37\n" : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3), [outptr] "+r"(outptr) : [has_alpha] "r"(has_alpha), [alpha] "w"(valpha) : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "cc", "memory"); } for (; x > 0; x--) { if (has_alpha) { *outptr++ = *inptr0++ * alpha; *outptr++ = *inptr1++ * alpha; *outptr++ = *inptr2++ * alpha; *outptr++ = *inptr3++ * alpha; } else { *outptr++ = *inptr0++; *outptr++ = *inptr1++; *outptr++ = *inptr2++; *outptr++ = *inptr3++; } } } } void prepackA_trans_4x8(float* outptr, const float* in, float alpha, int ldin, int m0, int mmax, int k0, int kmax) { auto inptr = in + k0 * ldin + m0; bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; float32x4_t valpha = vdupq_n_f32(alpha); uint32_t mask_buffer[8] = {0, 1, 2, 3, 4, 5, 6, 7}; int x_len = mmax - m0; int y_len = kmax - k0; int right_remain = x_len - 4 * (x_len / 4); int right_pad = 4 - right_remain; if (right_remain == 0) { right_pad = 0; } int stride_out = 4 * y_len; float32x4_t vzero = vdupq_n_f32(0.f); uint32x4_t vmask1 = vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); #pragma omp parallel for for (int y = 0; y < y_len - 3; y += 4) { const float* ptr0 = inptr + y * ldin; const float* ptr1 = ptr0 + ldin; const float* ptr2 = ptr1 + ldin; const float* ptr3 = ptr2 + ldin; float* outptr_row_col = outptr + y * 4; int i = 0; for (; i < x_len - 3; i += 4) { float* ptr_out = outptr_row_col; asm volatile( "vld1.32 {d0-d1}, [%[ptr0]]! @ load r0, 4 elements\n" "vld1.32 {d2-d3}, [%[ptr1]]! @ load r1, 4 elements\n" "vld1.32 {d4-d5}, [%[ptr2]]! @ load r2, 4 elements\n" "vld1.32 {d6-d7}, [%[ptr3]]! @ load r3, 4 elements\n" "cmp %[has_alpha], #0\n" "beq 0f\n" /* check whether alpha == 1? 
*/ "vmul.f32 q0, q0, %q[alpha]\n" /* mul alpha */ "vmul.f32 q1, q1, %q[alpha]\n" /* mul alpha */ "vmul.f32 q2, q2, %q[alpha]\n" /* mul alpha */ "vmul.f32 q3, q3, %q[alpha]\n" /* mul alpha */ "0: \n" "vst1.32 {d0-d1}, [%[outptr]]! @ write to output ptr\n" "vst1.32 {d2-d3}, [%[outptr]]! @ write to output ptr\n" "vst1.32 {d4-d5}, [%[outptr]]! @ write to output ptr\n" "vst1.32 {d6-d7}, [%[outptr]]! @ write to output ptr\n" : [outptr] "+r"(ptr_out), [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), [ptr2] "+r"(ptr2), [ptr3] "+r"(ptr3) : [has_alpha] "r"(has_alpha), [alpha] "w"(valpha) : "q0", "q1", "q2", "q3", "cc", "memory"); outptr_row_col += stride_out; } if (right_pad > 0) { float* ptr_out = outptr_row_col; asm volatile( "vld1.32 {d0-d1}, [%[ptr0]]! @ load r0, 4 elements\n" "vld1.32 {d2-d3}, [%[ptr1]]! @ load r1, 4 elements\n" "vld1.32 {d4-d5}, [%[ptr2]]! @ load r2, 4 elements\n" "vld1.32 {d6-d7}, [%[ptr3]]! @ load r3, 4 elements\n" "cmp %[has_alpha], #0\n" "beq 0f\n" /* check whether alpha == 1? */ "vmul.f32 q0, q0, %q[alpha]\n" /* mul alpha */ "vmul.f32 q1, q1, %q[alpha]\n" /* mul alpha */ "vmul.f32 q2, q2, %q[alpha]\n" /* mul alpha */ "vmul.f32 q3, q3, %q[alpha]\n" /* mul alpha */ "0: \n" "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" "vbif q1, %q[vzero], %q[vmask1] @ bit select, pad zero\n" "vbif q2, %q[vzero], %q[vmask1] @ bit select, pad zero\n" "vbif q3, %q[vzero], %q[vmask1] @ bit select, pad zero\n" "vst1.32 {d0-d1}, [%[outptr]]! @ write to output ptr\n" "vst1.32 {d2-d3}, [%[outptr]]! @ write to output ptr\n" "vst1.32 {d4-d5}, [%[outptr]]! @ write to output ptr\n" "vst1.32 {d6-d7}, [%[outptr]]! @ write to output ptr\n" : [outptr] "+r"(ptr_out), [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), [ptr2] "+r"(ptr2), [ptr3] "+r"(ptr3) : [vmask1] "w"(vmask1), [vzero] "w"(vzero), [has_alpha] "r"(has_alpha), [alpha] "w"(valpha) : "q0", "q1", "q2", "q3", "cc", "memory"); } } #pragma omp parallel for for (int y = 4 * (y_len / 4); y < y_len; ++y) { const float* ptr0 = inptr + y * ldin; float* outptr_row_col = outptr + y * 4; int i = 0; for (; i < x_len - 3; i += 4) { float* ptr_out = outptr_row_col; // clang-format off asm volatile( "vld1.32 {d0-d1}, [%[ptr0]]! @ load r0, 4 elements\n" "cmp %[has_alpha], #0\n" "beq 0f\n" /* check whether alpha == 1? */ "vmul.f32 q0, q0, %q[alpha]\n" /* mul alpha */ "0: \n" "vst1.32 {d0-d1}, [%[outptr]]! @ write to output ptr\n" : [ptr0] "+r"(ptr0), [outptr] "+r"(ptr_out) : [has_alpha] "r"(has_alpha), [alpha] "w"(valpha) : "q0", "q1", "cc", "memory"); // clang-format on outptr_row_col += stride_out; } if (right_pad > 0) { float* ptr_out = outptr_row_col; // clang-format off asm volatile( "vld1.32 {d0-d1}, [%[ptr0]]! @ load r0, 4 elements\n" "cmp %[has_alpha], #0\n" "beq 0f\n" /* check whether alpha == 1? */ "vmul.f32 q0, q0, %q[alpha]\n" /* mul alpha */ "0: \n" "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" "vst1.32 {d0-d1}, [%[outptr]]! 
@ write to output ptr\n" : [ptr0] "+r"(ptr0), [outptr] "+r"(ptr_out) : [vmask1] "w"(vmask1), [vzero] "w"(vzero), [has_alpha] "r"(has_alpha), [alpha] "w"(valpha) : "q0", "q1", "cc", "memory"); // clang-format on } } } #endif // __aarch64__ /** * \brief input data is transpose * for arm-v7a, transform data to block x k x 8 layout * for arm-v8a, transform data to block x k x 12 layout or block x k x 8 layout */ #ifdef __aarch64__ void loadb( float *out, const float *in, int ldin, int k0, int kmax, int n0, int nmax) { auto outptr = reinterpret_cast(out); auto inptr = reinterpret_cast(in) + k0 * ldin + n0; uint32_t mask_buffer[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; int x_len = nmax - n0; int y_len = kmax - k0; int right_remain = x_len - 12 * (x_len / 12); int right_pad = 12 - right_remain; uint32_t *outptr_row = outptr; int stride_out = 12 * y_len; uint32x4_t vzero = vdupq_n_u32(0); uint32x4_t vmask1 = vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); uint32x4_t vmask2 = vcltq_u32(vld1q_u32(mask_buffer + 4), vdupq_n_u32(right_remain)); uint32x4_t vmask3 = vcltq_u32(vld1q_u32(mask_buffer + 8), vdupq_n_u32(right_remain)); #pragma omp parallel for for (int y = 0; y < y_len - 3; y += 4) { const uint32_t *ptr0 = inptr + y * ldin; const uint32_t *ptr1 = ptr0 + ldin; const uint32_t *ptr2 = ptr1 + ldin; const uint32_t *ptr3 = ptr2 + ldin; asm volatile( "prfm pldl1keep, [%[ptr0]] \n" "prfm pldl1keep, [%[ptr0], #64] \n" "prfm pldl1keep, [%[ptr1]] \n" "prfm pldl1keep, [%[ptr1], #64] \n" "prfm pldl1keep, [%[ptr2]] \n" "prfm pldl1keep, [%[ptr2], #64] \n" "prfm pldl1keep, [%[ptr3]] \n" "prfm pldl1keep, [%[ptr3], #64] \n" : : [ptr0] "r"(ptr0), [ptr1] "r"(ptr1), [ptr2] "r"(ptr2), [ptr3] "r"(ptr3) : "memory"); uint32_t *outptr_row_col = outptr_row + y * 12; int i = 0; for (; i < x_len - 11; i += 12) { uint32x4_t vr00 = vld1q_u32(ptr0); uint32x4_t vr01 = vld1q_u32(ptr0 + 4); uint32x4_t vr02 = vld1q_u32(ptr0 + 8); uint32x4_t vr10 = vld1q_u32(ptr1); uint32x4_t vr11 = vld1q_u32(ptr1 + 4); uint32x4_t vr12 = vld1q_u32(ptr1 + 8); vst1q_u32(outptr_row_col, vr00); vst1q_u32(outptr_row_col + 4, vr01); vst1q_u32(outptr_row_col + 8, vr02); uint32x4_t vr20 = vld1q_u32(ptr2); uint32x4_t vr21 = vld1q_u32(ptr2 + 4); uint32x4_t vr22 = vld1q_u32(ptr2 + 8); vst1q_u32(outptr_row_col + 12, vr10); vst1q_u32(outptr_row_col + 16, vr11); vst1q_u32(outptr_row_col + 20, vr12); uint32x4_t vr30 = vld1q_u32(ptr3); uint32x4_t vr31 = vld1q_u32(ptr3 + 4); uint32x4_t vr32 = vld1q_u32(ptr3 + 8); vst1q_u32(outptr_row_col + 24, vr20); vst1q_u32(outptr_row_col + 28, vr21); vst1q_u32(outptr_row_col + 32, vr22); vst1q_u32(outptr_row_col + 36, vr30); vst1q_u32(outptr_row_col + 40, vr31); vst1q_u32(outptr_row_col + 44, vr32); ptr0 += 12; ptr1 += 12; ptr2 += 12; ptr3 += 12; outptr_row_col += stride_out; } if (right_remain > 0) { uint32x4_t vr00 = vld1q_u32(ptr0); uint32x4_t vr01 = vld1q_u32(ptr0 + 4); uint32x4_t vr02 = vld1q_u32(ptr0 + 8); uint32x4_t vr10 = vld1q_u32(ptr1); uint32x4_t vr11 = vld1q_u32(ptr1 + 4); uint32x4_t vr12 = vld1q_u32(ptr1 + 8); uint32x4_t vr00_1 = vbslq_u32(vmask1, vr00, vzero); uint32x4_t vr01_1 = vbslq_u32(vmask2, vr01, vzero); uint32x4_t vr02_1 = vbslq_u32(vmask3, vr02, vzero); uint32x4_t vr20 = vld1q_u32(ptr2); uint32x4_t vr21 = vld1q_u32(ptr2 + 4); uint32x4_t vr22 = vld1q_u32(ptr2 + 8); vst1q_u32(outptr_row_col, vr00_1); vst1q_u32(outptr_row_col + 4, vr01_1); vst1q_u32(outptr_row_col + 8, vr02_1); uint32x4_t vr10_1 = vbslq_u32(vmask1, vr10, vzero); uint32x4_t vr11_1 = vbslq_u32(vmask2, vr11, vzero); 
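      // Tail handling: vbslq_u32 with vmask1/vmask2/vmask3 keeps only the
      // lanes whose column index is below right_remain and zero-fills the
      // rest, so the final partial group of columns is still written out as
      // a full 12-wide block.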
      uint32x4_t vr12_1 = vbslq_u32(vmask3, vr12, vzero);
      uint32x4_t vr30 = vld1q_u32(ptr3);
      uint32x4_t vr31 = vld1q_u32(ptr3 + 4);
      uint32x4_t vr32 = vld1q_u32(ptr3 + 8);
      vst1q_u32(outptr_row_col + 12, vr10_1);
      vst1q_u32(outptr_row_col + 16, vr11_1);
      vst1q_u32(outptr_row_col + 20, vr12_1);
      uint32x4_t vr20_1 = vbslq_u32(vmask1, vr20, vzero);
      uint32x4_t vr21_1 = vbslq_u32(vmask2, vr21, vzero);
      uint32x4_t vr22_1 = vbslq_u32(vmask3, vr22, vzero);
      uint32x4_t vr30_1 = vbslq_u32(vmask1, vr30, vzero);
      uint32x4_t vr31_1 = vbslq_u32(vmask2, vr31, vzero);
      uint32x4_t vr32_1 = vbslq_u32(vmask3, vr32, vzero);
      vst1q_u32(outptr_row_col + 24, vr20_1);
      vst1q_u32(outptr_row_col + 28, vr21_1);
      vst1q_u32(outptr_row_col + 32, vr22_1);
      vst1q_u32(outptr_row_col + 36, vr30_1);
      vst1q_u32(outptr_row_col + 40, vr31_1);
      vst1q_u32(outptr_row_col + 44, vr32_1);
    }
  }
#pragma omp parallel for
  for (int y = 4 * (y_len / 4); y < y_len; ++y) {
    const uint32_t *ptr0 = inptr + y * ldin;
    uint32_t *outptr_row_col = outptr_row + y * 12;
    int i = 0;
    for (; i < x_len - 11; i += 12) {
      uint32x4_t vr0 = vld1q_u32(ptr0);
      uint32x4_t vr1 = vld1q_u32(ptr0 + 4);
      uint32x4_t vr2 = vld1q_u32(ptr0 + 8);
      vst1q_u32(outptr_row_col, vr0);
      vst1q_u32(outptr_row_col + 4, vr1);
      vst1q_u32(outptr_row_col + 8, vr2);
      ptr0 += 12;
      outptr_row_col += stride_out;
    }
    if (right_remain > 0) {
      uint32x4_t vr0 = vld1q_u32(ptr0);
      uint32x4_t vr1 = vld1q_u32(ptr0 + 4);
      uint32x4_t vr2 = vld1q_u32(ptr0 + 8);
      uint32x4_t vr0_1 = vbslq_u32(vmask1, vr0, vzero);
      uint32x4_t vr1_1 = vbslq_u32(vmask2, vr1, vzero);
      uint32x4_t vr2_1 = vbslq_u32(vmask3, vr2, vzero);
      vst1q_u32(outptr_row_col, vr0_1);
      vst1q_u32(outptr_row_col + 4, vr1_1);
      vst1q_u32(outptr_row_col + 8, vr2_1);
    }
  }
}

void loadb_trans(
    float *out, const float *in, int ldin, int k0, int kmax, int n0, int nmax) {
  int x_len = kmax - k0;
  uint32_t zerobuff[x_len];  // NOLINT
  memset(zerobuff, 0, sizeof(uint32_t) * x_len);
  auto outptr = reinterpret_cast<uint32_t *>(out);
  auto inptr = reinterpret_cast<const uint32_t *>(in);
  //!
data B is not transposed, transpose B to k * 12 for (int y = n0; y < nmax; y += 12) { const uint32_t *inptr0 = inptr + y * ldin + k0; const uint32_t *inptr1 = inptr0 + ldin; const uint32_t *inptr2 = inptr1 + ldin; const uint32_t *inptr3 = inptr2 + ldin; const uint32_t *inptr4 = inptr3 + ldin; const uint32_t *inptr5 = inptr4 + ldin; const uint32_t *inptr6 = inptr5 + ldin; const uint32_t *inptr7 = inptr6 + ldin; const uint32_t *inptr8 = inptr7 + ldin; const uint32_t *inptr9 = inptr8 + ldin; const uint32_t *inptr10 = inptr9 + ldin; const uint32_t *inptr11 = inptr10 + ldin; asm volatile( "prfm pldl1keep, [%[ptr0]] \n" "prfm pldl1keep, [%[ptr0], #64] \n" "prfm pldl1keep, [%[ptr1]] \n" "prfm pldl1keep, [%[ptr1], #64] \n" "prfm pldl1keep, [%[ptr2]] \n" "prfm pldl1keep, [%[ptr2], #64] \n" "prfm pldl1keep, [%[ptr3]] \n" "prfm pldl1keep, [%[ptr3], #64] \n" "prfm pldl1keep, [%[ptr4]] \n" "prfm pldl1keep, [%[ptr4], #64] \n" "prfm pldl1keep, [%[ptr5]] \n" "prfm pldl1keep, [%[ptr5], #64] \n" "prfm pldl1keep, [%[ptr6]] \n" "prfm pldl1keep, [%[ptr6], #64] \n" "prfm pldl1keep, [%[ptr7]] \n" "prfm pldl1keep, [%[ptr7], #64] \n" "prfm pldl1keep, [%[ptr8]] \n" "prfm pldl1keep, [%[ptr8], #64] \n" "prfm pldl1keep, [%[ptr9]] \n" "prfm pldl1keep, [%[ptr9], #64] \n" "prfm pldl1keep, [%[ptr10]] \n" "prfm pldl1keep, [%[ptr10], #64] \n" "prfm pldl1keep, [%[ptr11]] \n" "prfm pldl1keep, [%[ptr11], #64] \n" : : [ptr0] "r"(inptr0), [ptr1] "r"(inptr1), [ptr2] "r"(inptr2), [ptr3] "r"(inptr3), [ptr4] "r"(inptr4), [ptr5] "r"(inptr5), [ptr6] "r"(inptr6), [ptr7] "r"(inptr7), [ptr8] "r"(inptr8), [ptr9] "r"(inptr9), [ptr10] "r"(inptr10), [ptr11] "r"(inptr11) : "memory"); int x = x_len; //! cope with row index exceed real size, set to zero buffer if ((y + 11) >= nmax) { switch ((y + 11) - nmax) { case 10: inptr1 = zerobuff; case 9: inptr2 = zerobuff; case 8: inptr3 = zerobuff; case 7: inptr4 = zerobuff; case 6: inptr5 = zerobuff; case 5: inptr6 = zerobuff; case 4: inptr7 = zerobuff; case 3: inptr8 = zerobuff; case 2: inptr9 = zerobuff; case 1: inptr10 = zerobuff; case 0: inptr11 = zerobuff; default: break; } } for (; x > 7; x -= 8) { // clang-format off asm volatile( "ldp q0, q1, [%[inptr0]], #32\n" /* r0, a0~a7 */ "ldp q2, q3, [%[inptr1]], #32\n" /* r1, b0~b7 */ "ldp q4, q5, [%[inptr2]], #32\n" /* r2, c0~c7 */ "ldp q6, q7, [%[inptr3]], #32\n" /* r3, d0~d7 */ "zip1 v16.4s, v0.4s, v4.4s\n" /* a0c0a1c1 */ "zip1 v17.4s, v2.4s, v6.4s\n" /* b0d0b1d1 */ "prfm pldl1keep, [%[inptr0], #128] \n" "ldp q8, q9, [%[inptr4]], #32\n" /* r4, e0~e7 */ "ldp q10, q11, [%[inptr5]], #32\n" /* r5, f0~f7 */ "ldp q12, q13, [%[inptr6]], #32\n" /* r6, g0~g7 */ "ldp q14, q15, [%[inptr7]], #32\n" /* r7, h0~h7 */ "zip1 v18.4s, v8.4s, v12.4s\n" /* e0g0e1g1 */ "zip1 v19.4s, v10.4s, v14.4s\n" /* f0h0f1h1 */ "prfm pldl1keep, [%[inptr1], #128]\n" "zip1 v20.4s, v16.4s, v17.4s\n" /* a0b0c0d0 */ "zip1 v21.4s, v18.4s, v19.4s\n" /* e0f0g0h0 */ "prfm pldl1keep, [%[inptr2], #128]\n" "zip2 v22.4s, v16.4s, v17.4s\n" /* a1b1c1d1 */ "zip2 v23.4s, v18.4s, v19.4s\n" /* e1f1g1h1 */ "ldp q24, q25, [%[inptr8]], #32\n" /* r8, i0~i7 */ "ldp q26, q27, [%[inptr9]], #32\n" /* r9, j0~j7 */ "ldp q28, q29, [%[inptr10]], #32\n" /* r10, k0~k7 */ "ldp q30, q31, [%[inptr11]], #32\n" /* r11, l0~l7 */ "stp q20, q21, [%[outptr]], #32\n" /* save a0~h0 */ "prfm pldl1keep, [%[inptr3], #128]\n" "zip1 v16.4s, v24.4s, v28.4s\n" /* i0k0i1k1 */ "zip1 v17.4s, v26.4s, v30.4s\n" /* j0l0j1l1 */ "prfm pldl1keep, [%[inptr4], #128]\n" "zip1 v18.4s, v16.4s, v17.4s\n" /* i0j0k0l0 */ "zip2 v19.4s, v16.4s, 
v17.4s\n" /* i1j1k1l1 */ "prfm pldl1keep, [%[inptr5], #128]\n" "zip2 v16.4s, v0.4s, v4.4s\n" /* a2c2a3c3 */ "zip2 v17.4s, v2.4s, v6.4s\n" /* b2d2b3d3 */ "str q18, [%[outptr]], #16\n" /* save j0~l0 */ "stp q22, q23, [%[outptr]], #32\n" /* save a1~h1 */ "str q19, [%[outptr]], #16\n" /* save j1~l1 */ "zip2 v18.4s, v8.4s, v12.4s\n" /* e2g2e3g3 */ "zip2 v19.4s, v10.4s, v14.4s\n" /* f2h2f3h3 */ "prfm pldl1keep, [%[inptr6], #128]\n" "zip1 v20.4s, v16.4s, v17.4s\n" /* a2b2c2d2 */ "zip1 v21.4s, v18.4s, v19.4s\n" /* e2f2g2h2 */ "prfm pldl1keep, [%[inptr7], #128]\n" "zip2 v22.4s, v16.4s, v17.4s\n" /* a3b3c3d3 */ "zip2 v23.4s, v18.4s, v19.4s\n" /* e3f3g3h3 */ "prfm pldl1keep, [%[inptr8], #128]\n" "zip2 v16.4s, v24.4s, v28.4s\n" /* i2k2i3k3 */ "zip2 v17.4s, v26.4s, v30.4s\n" /* j2l2j3l3 */ "stp q20, q21, [%[outptr]], #32\n" /* save a2~h2 */ "zip1 v18.4s, v16.4s, v17.4s\n" /* i2j2k2l2 */ "zip2 v19.4s, v16.4s, v17.4s\n" /* i3j3k3l3 */ "prfm pldl1keep, [%[inptr9], #128]\n" "zip1 v16.4s, v1.4s, v5.4s\n" /* a4c4a5c5 */ "zip1 v17.4s, v3.4s, v7.4s\n" /* b4d4b5d5 */ "str q18, [%[outptr]], #16\n" /* save i2~l2 */ "stp q22, q23, [%[outptr]], #32\n" /* save a3~h3 */ "str q19, [%[outptr]], #16\n" /* save i3~l3 */ "zip1 v18.4s, v9.4s, v13.4s\n" /* e4g4e5g5 */ "zip1 v19.4s, v11.4s, v15.4s\n" /* f4h4f5h5 */ "prfm pldl1keep, [%[inptr10], #128]\n" "zip1 v20.4s, v16.4s, v17.4s\n" /* a4b4c4d4 */ "zip1 v21.4s, v18.4s, v19.4s\n" /* e4f4g4h4 */ "prfm pldl1keep, [%[inptr11], #128]\n" "zip2 v22.4s, v16.4s, v17.4s\n" /* a5b5c5d5 */ "zip2 v23.4s, v18.4s, v19.4s\n" /* e5f5g5h5 */ "zip1 v16.4s, v25.4s, v29.4s\n" /* i4k4i5k5 */ "zip1 v17.4s, v27.4s, v31.4s\n" /* j4l4j5l5 */ "stp q20, q21, [%[outptr]], #32\n" /* save a4~h4 */ "zip1 v18.4s, v16.4s, v17.4s\n" /* i4j4k4l4 */ "zip2 v19.4s, v16.4s, v17.4s\n" /* i5j5k5l5 */ "zip2 v16.4s, v1.4s, v5.4s\n" /* a6c6a7c7 */ "zip2 v17.4s, v3.4s, v7.4s\n" /* b6d6b7d7 */ "str q18, [%[outptr]], #16\n" /* save i4~l4 */ "stp q22, q23, [%[outptr]], #32\n" /* save a5~h5 */ "str q19, [%[outptr]], #16\n" /* save i5~l5 */ "zip2 v18.4s, v9.4s, v13.4s\n" /* e6g6e7g7 */ "zip2 v19.4s, v11.4s, v15.4s\n" /* f6h6f7h7 */ "zip1 v20.4s, v16.4s, v17.4s\n" /* a6b6c6d6 */ "zip1 v21.4s, v18.4s, v19.4s\n" /* e6f6g6h6 */ "zip2 v22.4s, v16.4s, v17.4s\n" /* a7b7c7d7 */ "zip2 v23.4s, v18.4s, v19.4s\n" /* e7f7g7h7 */ "zip2 v16.4s, v25.4s, v29.4s\n" /* i6k6i7k7 */ "zip2 v17.4s, v27.4s, v31.4s\n" /* j6l6j7l7 */ "stp q20, q21, [%[outptr]], #32\n" /* save a6~h6 */ "zip1 v18.4s, v16.4s, v17.4s\n" /* i6j6k6l6 */ "zip2 v19.4s, v16.4s, v17.4s\n" /* i7j7k7l7 */ "str q18, [%[outptr]], #16\n" /* save i6~l6 */ "stp q22, q23, [%[outptr]], #32\n" /* save a7~h7 */ "str q19, [%[outptr]], #16\n" /* save i7~l7 */ : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [inptr8] "+r"(inptr8), [inptr9] "+r"(inptr9), [inptr10] "+r"(inptr10), [inptr11] "+r"(inptr11), [outptr] "+r"(outptr) : : "v0","v1","v2","v3","v4","v5", "v6","v7","v8","v9","v10","v11","v12", "v13","v14","v15","v16","v17","v18","v19", "v20","v21","v22","v23","v24","v25","v26", "v27","v28","v29","v30","v31","cc","memory"); // clang-format on } for (; x > 0; x--) { *outptr++ = *inptr0++; *outptr++ = *inptr1++; *outptr++ = *inptr2++; *outptr++ = *inptr3++; *outptr++ = *inptr4++; *outptr++ = *inptr5++; *outptr++ = *inptr6++; *outptr++ = *inptr7++; *outptr++ = *inptr8++; *outptr++ = *inptr9++; *outptr++ = *inptr10++; *outptr++ = *inptr11++; } } } void 
loadb_eight( float *out, const float *in, int ldin, int k0, int kmax, int n0, int nmax) { auto outptr = reinterpret_cast(out); auto inptr = reinterpret_cast(in) + k0 * ldin + n0; uint32_t mask_buffer[8] = {0, 1, 2, 3, 4, 5, 6, 7}; int x_len = nmax - n0; int y_len = kmax - k0; int right_remain = x_len - 8 * (x_len / 8); int right_pad = 8 - right_remain; uint32_t *outptr_row = outptr; int stride_out = 8 * y_len; uint32x4_t vzero = vdupq_n_u32(0); uint32x4_t vmask1 = vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); uint32x4_t vmask2 = vcltq_u32(vld1q_u32(mask_buffer + 4), vdupq_n_u32(right_remain)); #pragma omp parallel for for (int y = 0; y < y_len - 3; y += 4) { const uint32_t *ptr0 = inptr + y * ldin; const uint32_t *ptr1 = ptr0 + ldin; const uint32_t *ptr2 = ptr1 + ldin; const uint32_t *ptr3 = ptr2 + ldin; uint32_t *outptr_row_col = outptr_row + y * 8; asm volatile( "prfm pldl1keep, [%[ptr0]] \n" "prfm pldl1keep, [%[ptr0], #64] \n" "prfm pldl1keep, [%[ptr1]] \n" "prfm pldl1keep, [%[ptr1], #64] \n" "prfm pldl1keep, [%[ptr2]] \n" "prfm pldl1keep, [%[ptr2], #64] \n" "prfm pldl1keep, [%[ptr3]] \n" "prfm pldl1keep, [%[ptr3], #64] \n" : : [ptr0] "r"(ptr0), [ptr1] "r"(ptr1), [ptr2] "r"(ptr2), [ptr3] "r"(ptr3) : "memory"); int i = 0; for (; i < x_len - 7; i += 8) { uint32_t *ptr_out = outptr_row_col; asm volatile( "ldp q0, q1, [%[ptr0]], #32\n" // load r0, 8 elements "ldp q2, q3, [%[ptr1]], #32\n" // load r1, 8 elements "stp q0, q1, [%[outptr]], #32\n" // write to output ptr "stp q2, q3, [%[outptr]], #32\n" // write to output ptr "ldp q0, q1, [%[ptr2]], #32\n" // load r0, 8 elements "ldp q2, q3, [%[ptr3]], #32\n" // load r1, 8 elements "stp q0, q1, [%[outptr]], #32\n" // write to output ptr "stp q2, q3, [%[outptr]], #32\n" // write to output ptr : [outptr] "+r"(ptr_out), [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), [ptr2] "+r"(ptr2), [ptr3] "+r"(ptr3) : : "v0", "v1", "v2", "v3", "cc", "memory"); outptr_row_col += stride_out; } if (right_remain > 0) { uint32_t *ptr_out = outptr_row_col; asm volatile( "ldp q0, q1, [%[ptr0]], #32\n" "ldp q2, q3, [%[ptr1]], #32\n" "bif v0.16b, %[vzero].16b, %[vmask1].16b\n" "bif v1.16b, %[vzero].16b, %[vmask2].16b\n" "bif v2.16b, %[vzero].16b, %[vmask1].16b\n" "bif v3.16b, %[vzero].16b, %[vmask2].16b\n" "stp q0, q1, [%[outptr]], #32\n" "ldp q0, q1, [%[ptr2]], #32\n" "stp q2, q3, [%[outptr]], #32\n" "ldp q2, q3, [%[ptr3]], #32\n" "bif v0.16b, %[vzero].16b, %[vmask1].16b\n" "bif v1.16b, %[vzero].16b, %[vmask2].16b\n" "bif v2.16b, %[vzero].16b, %[vmask1].16b\n" "bif v3.16b, %[vzero].16b, %[vmask2].16b\n" "stp q0, q1, [%[outptr]], #32\n" "stp q2, q3, [%[outptr]], #32\n" : [outptr] "+r"(ptr_out), [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), [ptr2] "+r"(ptr2), [ptr3] "+r"(ptr3) : [vmask1] "w"(vmask1), [vmask2] "w"(vmask2), [vzero] "w"(vzero) : "v0", "v1", "v2", "v3", "cc", "memory"); } } #pragma omp parallel for for (int y = 4 * (y_len / 4); y < y_len; ++y) { const uint32_t *ptr0 = inptr + y * ldin; uint32_t *outptr_row_col = outptr_row + y * 8; int i = 0; for (; i < x_len - 7; i += 8) { uint32_t *ptr_out = outptr_row_col; asm volatile( "ldp q0, q1, [%[ptr0]], #32\n" "stp q0, q1, [%[outptr]], #32\n" : [ptr0] "+r"(ptr0), [outptr] "+r"(ptr_out) : : "v0", "v1", "cc", "memory"); outptr_row_col += stride_out; } if (right_remain > 0) { uint32_t *ptr_out = outptr_row_col; asm volatile( "ldp q0, q1, [%[ptr0]], #32\n" "bif v0.16b, %[vzero].16b, %[vmask1].16b\n" "bif v1.16b, %[vzero].16b, %[vmask2].16b\n" "stp q0, q1, [%[outptr]], #32\n" : [ptr0] "+r"(ptr0), [outptr] "+r"(ptr_out) : 
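          /* vmask1/vmask2 hold per-lane flags (lane_index < right_remain)
             built with vcltq_u32 from mask_buffer; bif keeps the loaded value
             where a flag is set and takes vzero elsewhere, so the padded tail
             columns of the last panel are stored as zeros. */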
[vmask1] "w"(vmask1), [vmask2] "w"(vmask2), [vzero] "w"(vzero) : "v0", "v1", "cc", "memory"); } } } void loadb_trans_eight( float *out, const float *in, int ldin, int k0, int kmax, int n0, int nmax) { int x_len = kmax - k0; uint32_t zerobuff[x_len]; // NOLINT memset(zerobuff, 0, sizeof(uint32_t) * x_len); auto outptr = reinterpret_cast(out); auto inptr = reinterpret_cast(in); //! data B is not transposed, transpose B to k * 8 for (int y = n0; y < nmax; y += 8) { const uint32_t *inptr0 = inptr + y * ldin + k0; const uint32_t *inptr1 = inptr0 + ldin; const uint32_t *inptr2 = inptr1 + ldin; const uint32_t *inptr3 = inptr2 + ldin; const uint32_t *inptr4 = inptr3 + ldin; const uint32_t *inptr5 = inptr4 + ldin; const uint32_t *inptr6 = inptr5 + ldin; const uint32_t *inptr7 = inptr6 + ldin; int x = x_len; asm volatile( "prfm pldl1keep, [%[ptr0]] \n" "prfm pldl1keep, [%[ptr0], #64] \n" "prfm pldl1keep, [%[ptr1]] \n" "prfm pldl1keep, [%[ptr1], #64] \n" "prfm pldl1keep, [%[ptr2]] \n" "prfm pldl1keep, [%[ptr2], #64] \n" "prfm pldl1keep, [%[ptr3]] \n" "prfm pldl1keep, [%[ptr3], #64] \n" "prfm pldl1keep, [%[ptr4]] \n" "prfm pldl1keep, [%[ptr4], #64] \n" "prfm pldl1keep, [%[ptr5]] \n" "prfm pldl1keep, [%[ptr5], #64] \n" "prfm pldl1keep, [%[ptr6]] \n" "prfm pldl1keep, [%[ptr6], #64] \n" "prfm pldl1keep, [%[ptr7]] \n" "prfm pldl1keep, [%[ptr7], #64] \n" : : [ptr0] "r"(inptr0), [ptr1] "r"(inptr1), [ptr2] "r"(inptr2), [ptr3] "r"(inptr3), [ptr4] "r"(inptr4), [ptr5] "r"(inptr5), [ptr6] "r"(inptr6), [ptr7] "r"(inptr7) : "memory"); //! cope with row index exceed real size, set to zero buffer if ((y + 7) >= nmax) { switch ((y + 7) - nmax) { case 6: inptr1 = zerobuff; case 5: inptr2 = zerobuff; case 4: inptr3 = zerobuff; case 3: inptr4 = zerobuff; case 2: inptr5 = zerobuff; case 1: inptr6 = zerobuff; case 0: inptr7 = zerobuff; default: break; } } for (; x > 7; x -= 8) { // clang-format off //! 
zip load 8 elements (2 neon Q registers) from each of 8 rows asm volatile( "ldp q0, q1, [%[inptr0]], #32\n" // load r0, a0~a7 "ldp q2, q3, [%[inptr1]], #32\n" // load r1, b0~b7 "ldp q4, q5, [%[inptr2]], #32\n" // load r2, c0~c7 "ldp q6, q7, [%[inptr3]], #32\n" // load r3, d0~d7 "trn1 v8.4s, v0.4s, v2.4s\n" // a0b0a2b2 "trn2 v9.4s, v0.4s, v2.4s\n" // a1b1a3b3 "trn1 v10.4s, v1.4s, v3.4s\n" // a4b4a6b6 "trn2 v11.4s, v1.4s, v3.4s\n" // a5b5a7b7 "ldp q16, q17, [%[inptr4]], #32\n"// load r4, e0~e7 "ldp q18, q19, [%[inptr5]], #32\n"// load r5, f0~f7 "ldp q20, q21, [%[inptr6]], #32\n"// load r6, g0~g7 "ldp q22, q23, [%[inptr7]], #32\n"// load r7, h0~h7 "trn1 v12.4s, v4.4s, v6.4s\n" // c0d0c2d2 "trn2 v13.4s, v4.4s, v6.4s\n" // c1d1c3d3 "trn1 v14.4s, v5.4s, v7.4s\n" // c4d4c6d6 "trn2 v15.4s, v5.4s, v7.4s\n" // c5d5c7d7 "trn1 v24.4s, v16.4s, v18.4s\n" // e0f0e2f2 "trn2 v25.4s, v16.4s, v18.4s\n" // e1f1e3f3 "trn1 v28.4s, v20.4s, v22.4s\n" // g0h0e2f2 "trn2 v29.4s, v20.4s, v22.4s\n" // g1h1e3f3 "trn1 v26.4s, v17.4s, v19.4s\n" // e4f4e6f6 "trn2 v27.4s, v17.4s, v19.4s\n" // e5f5e7f7 "trn1 v30.4s, v21.4s, v23.4s\n" // g4h4e6f6 "trn2 v31.4s, v21.4s, v23.4s\n" // g5h5e7f7 "trn1 v0.2d, v8.2d, v12.2d\n" // a0b0c0d0 "trn1 v1.2d, v24.2d, v28.2d\n" // e0f0g0h0 "trn1 v2.2d, v9.2d, v13.2d\n" // a1b1c1d1 "trn1 v3.2d, v25.2d, v29.2d\n" // e1f1g1h1 "trn2 v4.2d, v8.2d, v12.2d\n" // a2b2c2d2 "trn2 v5.2d, v24.2d, v28.2d\n" // e2f2g2h2 "stp q0, q1, [%[outptr]], #32\n" // save q0, q1, a0~h0 "trn2 v6.2d, v9.2d, v13.2d\n" // a3b3c3d3 "trn2 v7.2d, v25.2d, v29.2d\n" // e3f3g3h3 "stp q2, q3, [%[outptr]], #32\n" // save q0, q1, a1~h1 "trn1 v16.2d, v10.2d, v14.2d\n" // a4b4c4d4 "trn1 v17.2d, v26.2d, v30.2d\n" // e4f4g4h4 "stp q4, q5, [%[outptr]], #32\n" // save q0, q1, a2~h2 "trn1 v18.2d, v11.2d, v15.2d\n" // a5b5c5d5 "trn1 v19.2d, v27.2d, v31.2d\n" // e5f5g5h5 "stp q6, q7, [%[outptr]], #32\n" // save q0, q1, a3~h3 "trn2 v20.2d, v10.2d, v14.2d\n" // a6b6c6d6 "trn2 v21.2d, v26.2d, v30.2d\n" // e6f6g6h6 "stp q16, q17, [%[outptr]], #32\n" // save q0, q1, a4~h4 "trn2 v22.2d, v11.2d, v15.2d\n" // a7b7c7d7 "trn2 v23.2d, v27.2d, v31.2d\n" // e7f7g7h7 "stp q18, q19, [%[outptr]], #32\n" // save q0, q1, a5~h5 "stp q20, q21, [%[outptr]], #32\n" // save q0, q1, a6~h6 "stp q22, q23, [%[outptr]], #32\n" // save q0, q1, a7~h7 : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr) : : "v0","v1","v2","v3","v4","v5", "v6","v7","v8","v9","v10","v11","v12", "v13","v14","v15","v16","v17","v18","v19", "v20","v21","v22","v23","v24","v25","v26", "v27","v28","v29","v30","v31","cc","memory"); // clang-format on } for (; x > 0; x--) { *outptr++ = *inptr0++; *outptr++ = *inptr1++; *outptr++ = *inptr2++; *outptr++ = *inptr3++; *outptr++ = *inptr4++; *outptr++ = *inptr5++; *outptr++ = *inptr6++; *outptr++ = *inptr7++; } } } #else // __aarch64__ void loadb( float* out, const float* in, int ldin, int k0, int kmax, int n0, int nmax) { auto outptr = reinterpret_cast(out); auto inptr = reinterpret_cast(in) + k0 * ldin + n0; uint32_t mask_buffer[8] = {0, 1, 2, 3, 4, 5, 6, 7}; int x_len = nmax - n0; int y_len = kmax - k0; int right_remain = x_len - 8 * (x_len / 8); int right_pad = 8 - right_remain; uint32_t* outptr_row = outptr; int stride_out = 8 * y_len; uint32x4_t vzero = vdupq_n_u32(0); uint32x4_t vmask1 = vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); uint32x4_t vmask2 = 
vcltq_u32(vld1q_u32(mask_buffer + 4), vdupq_n_u32(right_remain)); #pragma omp parallel for for (int y = 0; y < y_len - 3; y += 4) { const uint32_t* ptr0 = inptr + y * ldin; const uint32_t* ptr1 = ptr0 + ldin; const uint32_t* ptr2 = ptr1 + ldin; const uint32_t* ptr3 = ptr2 + ldin; uint32_t* outptr_row_col = outptr_row + y * 8; int i = 0; for (; i < x_len - 7; i += 8) { uint32_t* ptr_out = outptr_row_col; asm volatile( "vld1.32 {d0-d3}, [%[ptr0]]! @ load r0, 8 elements\n" "vld1.32 {d4-d7}, [%[ptr1]]! @ load r1, 8 elements\n" "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" "vst1.32 {d4-d7}, [%[outptr]]! @ write to output ptr\n" "vld1.32 {d0-d3}, [%[ptr2]]! @ load r2, 8 elements\n" "vld1.32 {d4-d7}, [%[ptr3]]! @ load r3, 8 elements\n" "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" "vst1.32 {d4-d7}, [%[outptr]]! @ write to output ptr\n" : [outptr] "+r"(ptr_out), [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), [ptr2] "+r"(ptr2), [ptr3] "+r"(ptr3) : : "q0", "q1", "q2", "q3", "cc", "memory"); outptr_row_col += stride_out; } if (right_remain > 0) { uint32_t* ptr_out = outptr_row_col; asm volatile( "vld1.32 {d0-d3}, [%[ptr0]]! @ load r0, 8 elements\n" "vld1.32 {d4-d7}, [%[ptr1]]! @ load r1, 8 elements\n" "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" "vbif q1, %q[vzero], %q[vmask2] @ bit select, pad zero\n" //"vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" "vbif q2, %q[vzero], %q[vmask1] @ bit select, pad zero\n" "vbif q3, %q[vzero], %q[vmask2] @ bit select, pad zero\n" "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" "vst1.32 {d4-d7}, [%[outptr]]! @ write to output ptr\n" "vld1.32 {d0-d3}, [%[ptr2]]! @ load r2, 8 elements\n" "vld1.32 {d4-d7}, [%[ptr3]]! @ load r3, 8 elements\n" "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" "vbif q1, %q[vzero], %q[vmask2] @ bit select, pad zero\n" //"vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" "vbif q2, %q[vzero], %q[vmask1] @ bit select, pad zero\n" "vbif q3, %q[vzero], %q[vmask2] @ bit select, pad zero\n" "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" "vst1.32 {d4-d7}, [%[outptr]]! @ write to output ptr\n" : [outptr] "+r"(ptr_out), [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), [ptr2] "+r"(ptr2), [ptr3] "+r"(ptr3) : [vmask1] "w"(vmask1), [vmask2] "w"(vmask2), [vzero] "w"(vzero) : "q0", "q1", "q2", "q3", "cc", "memory"); } } #pragma omp parallel for for (int y = 4 * (y_len / 4); y < y_len; ++y) { const uint32_t* ptr0 = inptr + y * ldin; uint32_t* outptr_row_col = outptr_row + y * 8; int i = 0; for (; i < x_len - 7; i += 8) { uint32_t* ptr_out = outptr_row_col; asm volatile( "vld1.32 {d0-d3}, [%[ptr0]]! @ load r0, 8 elements\n" "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" : [ptr0] "+r"(ptr0), [outptr] "+r"(ptr_out) : : "q0", "q1", "cc", "memory"); outptr_row_col += stride_out; } if (right_remain > 0) { uint32_t* ptr_out = outptr_row_col; asm volatile( "vld1.32 {d0-d3}, [%[ptr0]]! @ load r0, 8 elements\n" "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" "vbif q1, %q[vzero], %q[vmask2] @ bit select, pad zero\n" "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" : [ptr0] "+r"(ptr0), [outptr] "+r"(ptr_out) : [vmask1] "w"(vmask1), [vmask2] "w"(vmask2), [vzero] "w"(vzero) : "q0", "q1", "cc", "memory"); } } } void loadb_trans( float* out, const float* in, int ldin, int k0, int kmax, int n0, int nmax) { int x_len = kmax - k0; uint32_t zerobuff[x_len]; // NOLINT memset(zerobuff, 0, sizeof(uint32_t) * x_len); auto outptr = reinterpret_cast(out); auto inptr = reinterpret_cast(in); //! 
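  // On armv7 the same k x 8 transpose is built from vld4.32, which loads each
  // row's 8 elements de-interleaved by 4 (q0,q1 = r00,r04,r01,r05,...), and
  // vtrn.32, which pairs two rows; every vst1.32 {dN} below then emits two
  // rows' values for a single k index. The net effect matches the scalar
  // loop out[k * 8 + r] = row_r[k] for r in [0, 8).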
data B is not transposed, transpose B to k * 8 for (int y = n0; y < nmax; y += 8) { const uint32_t* inptr0 = inptr + y * ldin + k0; const uint32_t* inptr1 = inptr0 + ldin; const uint32_t* inptr2 = inptr1 + ldin; const uint32_t* inptr3 = inptr2 + ldin; const uint32_t* inptr4 = inptr3 + ldin; const uint32_t* inptr5 = inptr4 + ldin; const uint32_t* inptr6 = inptr5 + ldin; const uint32_t* inptr7 = inptr6 + ldin; int x = x_len; //! cope with row index exceed real size, set to zero buffer if ((y + 7) >= nmax) { switch ((y + 7) - nmax) { case 6: inptr1 = zerobuff; case 5: inptr2 = zerobuff; case 4: inptr3 = zerobuff; case 3: inptr4 = zerobuff; case 2: inptr5 = zerobuff; case 1: inptr6 = zerobuff; case 0: inptr7 = zerobuff; default: break; } } for (; x > 7; x -= 8) { //! zip load 8 elements (2 neon Q registers) from each of 8 rows asm volatile( "vld4.32 {d0-d3}, [%[inptr0]]! @ zip load r0, " "q0,q1=r00,r04,r01,r05,r02,r06,r03,r07\n" "vld4.32 {d4-d7}, [%[inptr1]]! @ zip load r1, " "q2,q3=r10,r14,r11,r15,r12,r16,r13,r17\n" "vtrn.32 q0, q2 @ trans data: q0=r00,r10,r01,r11; " "q2=r04,r14,r05,r15\n" "vst1.32 {d0}, [%[outptr]]! @ write d0(q0,low),r00,r10\n" "vld4.32 {d8-d11}, [%[inptr2]]! @ zip load r2, " "q4,q5=r20,r24,r21,r25,r22,r26,r23,r27\n" "vld4.32 {d12-d15}, [%[inptr3]]! @ zip load r3, " "q6,q7=r30,r34,r31,r35,r32,r36,r33,r37\n" "vtrn.32 q4, q6 @ trans data: q4=r20,r30,r21,r31; " "q6=r24,r34,r25,r35\n" "vst1.32 {d8}, [%[outptr]]! @ write d8(q4,low),r20,r30\n" "vld4.32 {d16-d19}, [%[inptr4]]! @ zip load r4, " "q8,q9=r40,r44,r41,r45,r42,r46,r43,r47\n" "vld4.32 {d20-d23}, [%[inptr5]]! @ zip load r5, " "q10,q11=r50,r54,r51,r55,r52,r56,r53,r57\n" "vtrn.32 q8, q10 @ trans data: q8=r40,r50,r41,r51; " "q10=r44,r54,r45,r55\n" "vst1.32 {d16}, [%[outptr]]! @ write d16(q8,low),r40,r50\n" "vld4.32 {d24-d27}, [%[inptr6]]! @ zip load r6, " "q12,q13=r60,r64,r61,r65,r62,r66,r63,r67\n" "vld4.32 {d28-d31}, [%[inptr7]]! @ zip load r7, " "q14,q15=r70,r74,r71,r75,r72,r76,r73,r77\n" "vtrn.32 q12, q14 @ trans data:q12=r60,r70,r61,r71; " "q14=r64,r74,r65,r75\n" "vst1.32 {d24}, [%[outptr]]! @ write d24(q8,low),r60,r70\n" //"pld [%[inptr0], #128] @ preload r0 data to cache, fill // pipeline\n" "vst1.32 {d1}, [%[outptr]]! @ write d1(q0,high),r01,r11\n" "vst1.32 {d9}, [%[outptr]]! @ write d9(q4,high),r21,r31\n" "vst1.32 {d17}, [%[outptr]]! @ write d17(q8,high),r41,r51\n" "vst1.32 {d25}, [%[outptr]]! @ write d25(q12,high),r61,r71\n" "vtrn.32 q1, q3 @ trans data: q1=r02,r12,r03,r13; " "q3=r06,r16,r07,r17\n" "vst1.32 {d2}, [%[outptr]]! @ write d2(q1,low),r02,r12\n" "vtrn.32 q5, q7 @ trans data: q5=r22,r32,r23,r33; " "q7=r26,r36,r27,r37\n" "vst1.32 {d10}, [%[outptr]]! @ write d10(q5,low),r22,r32\n" "vtrn.32 q9, q11 @ trans data: q9=r42,r52,r43,r53; " "q11=r46,r56,r47,r57\n" "vst1.32 {d18}, [%[outptr]]! @ write d18(q9,low),r42,r52\n" "vtrn.32 q13, q15 @ trans data:q13=r62,r72,r63,r73; " "q15=r66,r76,r67,r77\n" "vst1.32 {d26}, [%[outptr]]! @ write d18(q9,low),r62,r72\n" //"pld [%[inptr1], #128] @ preload r1 data to cache, fill // pipeline\n" "vst1.32 {d3}, [%[outptr]]! @ write d3(q1,high),r03,r13\n" "vst1.32 {d11}, [%[outptr]]! @ write d11(q5,high),r23,r33\n" "vst1.32 {d19}, [%[outptr]]! @ write d19(q9,high),r43,r53\n" "vst1.32 {d27}, [%[outptr]]! @ write d27(q13,high),r63,r73\n" //"pld [%[inptr2], #128] @ preload r2 data to cache, fill // pipeline\n" "vst1.32 {d4}, [%[outptr]]! @ write d4(q2,low),r04,r14\n" "vst1.32 {d12}, [%[outptr]]! @ write d12(q6,low),r24,r34\n" "vst1.32 {d20}, [%[outptr]]! 
@ write d20(q10,low),r44,r54\n" "vst1.32 {d28}, [%[outptr]]! @ write d28(q14,low),r64,r74\n" //"pld [%[inptr3], #128] @ preload r3 data to cache, fill // pipeline\n" "vst1.32 {d5}, [%[outptr]]! @ write d5(q2,high),r05,r15\n" "vst1.32 {d13}, [%[outptr]]! @ write d13(q6,high),r25,r35\n" "vst1.32 {d21}, [%[outptr]]! @ write d21(q10,high),r45,r55\n" "vst1.32 {d29}, [%[outptr]]! @ write d29(q14,high),r65,r75\n" //"pld [%[inptr4], #128] @ preload r4 data to cache, fill // pipeline\n" "vst1.32 {d6}, [%[outptr]]! @ write d6(q3,low),r06,r16\n" "vst1.32 {d14}, [%[outptr]]! @ write d14(q7,low),r26,r36\n" "vst1.32 {d22}, [%[outptr]]! @ write d22(q11,low),r46,r56\n" "vst1.32 {d30}, [%[outptr]]! @ write d30(q15,low),r66,r76\n" //"pld [%[inptr5], #128] @ preload r5 data to cache, fill // pipeline\n" "vst1.32 {d7}, [%[outptr]]! @ write d7(q3,high),r07,r17\n" "vst1.32 {d15}, [%[outptr]]! @ write d15(q7,high),r27,r37\n" "vst1.32 {d23}, [%[outptr]]! @ write d23(q11,high),r47,r57\n" "vst1.32 {d31}, [%[outptr]]! @ write d31(q15,high),r67,r77\n" : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr) : : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc", "memory"); } for (; x > 0; x--) { *outptr++ = *inptr0++; *outptr++ = *inptr1++; *outptr++ = *inptr2++; *outptr++ = *inptr3++; *outptr++ = *inptr4++; *outptr++ = *inptr5++; *outptr++ = *inptr6++; *outptr++ = *inptr7++; } } } #endif // __aarch64__ #ifdef __aarch64__ void sgemm_prepacked_8x12(bool is_transB, int M, int N, int K, const float *A_packed, const float *B, int ldb, float beta, float *C, int ldc, const float *bias, bool has_bias, const operators::ActivationParam act_param, ARMContext *ctx) { size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024; auto workspace = ctx->workspace_data(); int threads = ctx->threads(); auto act_type = act_param.active_type; float alpha[4] = {0.f, 0.f, 0.f, 0.f}; int flag_act = 0x00; // relu: 1, relu6: 2, leakey: 3 if (act_param.has_active) { if (act_type == lite_api::ActivationType::kRelu) { flag_act = 0x01; } else if (act_type == lite_api::ActivationType::kRelu6) { flag_act = 0x02; float local_alpha = act_param.Relu_clipped_coef; alpha[0] = local_alpha; alpha[1] = local_alpha; alpha[2] = local_alpha; alpha[3] = local_alpha; } else if (act_type == lite_api::ActivationType::kLeakyRelu) { flag_act = 0x03; float local_alpha = act_param.Leaky_relu_alpha; alpha[0] = local_alpha; alpha[1] = local_alpha; alpha[2] = local_alpha; alpha[3] = local_alpha; } } //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 int x_block = (l2_cache - (MBLOCK * K)) / (sizeof(float) * (K + MBLOCK)); x_block /= NBLOCK; x_block *= NBLOCK; int x_num = (N + (x_block - 1)) / x_block; x_block = (N + x_num - 1) / x_num; x_block = (x_block + NBLOCK - 1) / NBLOCK; x_block *= NBLOCK; x_block = x_block < NBLOCK ? NBLOCK : x_block; // unroll 2 loop int tail_pre = (K & (KBLOCK - 1)); int k_pre = ((K + KBLOCK - 1) / KBLOCK) - 1; bool flag_p_remain = false; int remain = 0; int has_beta = fabsf(beta) > 1e-8f ? 1 : 0; //! apanel is pre_compute outside gemm for (unsigned int x0 = 0; x0 < N; x0 += x_block) { unsigned int xmax = x0 + x_block; if (xmax > N) { xmax = N; } int bblocks = (xmax - x0 + NBLOCK - 1) / NBLOCK; remain = xmax - x0 - (bblocks - 1) * NBLOCK; if (remain > 0) { flag_p_remain = true; } //! 
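  // Sizing sketch for the column blocking above (a rough heuristic, not an
  // exact bound): it approximately solves
  //   MBLOCK * x (C tile) + MBLOCK * K (packed A) + x * K (packed B) <= l2
  // for x, rounds x to a multiple of NBLOCK, then re-balances so every
  // N-block has a similar width. For example, with l2_cache = 512 KB,
  // K = 256, MBLOCK = 8 and NBLOCK = 12:
  //   x_block ~= (524288 - 8 * 256) / (4 * (256 + 8)) ~= 494 -> 492 rounded.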
load bpanel float *b_pannel = workspace; if (is_transB) { loadb_trans(b_pannel, B, ldb, 0, K, x0, xmax); } else { loadb(b_pannel, B, ldb, 0, K, x0, xmax); } #pragma omp parallel for num_threads(threads) for (unsigned int y = 0; y < M; y += MBLOCK) { unsigned int ymax = y + MBLOCK; if (ymax > M) { ymax = M; } float bias_local[8] = {0}; if (has_bias) { bias_local[0] = bias[y]; bias_local[1] = bias[y + 1]; bias_local[2] = bias[y + 2]; bias_local[3] = bias[y + 3]; bias_local[4] = bias[y + 4]; bias_local[5] = bias[y + 5]; bias_local[6] = bias[y + 6]; bias_local[7] = bias[y + 7]; } float cout0[NBLOCK]; float cout1[NBLOCK]; float cout2[NBLOCK]; float cout3[NBLOCK]; float cout4[NBLOCK]; float cout5[NBLOCK]; float cout6[NBLOCK]; float cout7[NBLOCK]; float *c_ptr0 = C + y * ldc + x0; float *c_ptr1 = c_ptr0 + ldc; float *c_ptr2 = c_ptr1 + ldc; float *c_ptr3 = c_ptr2 + ldc; float *c_ptr4 = c_ptr3 + ldc; float *c_ptr5 = c_ptr4 + ldc; float *c_ptr6 = c_ptr5 + ldc; float *c_ptr7 = c_ptr6 + ldc; float *pout0 = c_ptr0; float *pout1 = c_ptr1; float *pout2 = c_ptr2; float *pout3 = c_ptr3; float *pout4 = c_ptr4; float *pout5 = c_ptr5; float *pout6 = c_ptr6; float *pout7 = c_ptr7; const float *a_ptr_l = A_packed + y * K; const float *b_ptr = b_pannel; for (int xb = 0; xb < bblocks; xb++) { if ((y + 7) >= ymax) { switch ((y + 7) - ymax) { case 6: c_ptr1 = cout1; case 5: c_ptr2 = cout2; case 4: c_ptr3 = cout3; case 3: c_ptr4 = cout4; case 2: c_ptr5 = cout5; case 1: c_ptr6 = cout6; case 0: c_ptr7 = cout7; default: break; } } if (flag_p_remain && (xb == bblocks - 1)) { pout0 = c_ptr0; pout1 = c_ptr1; pout2 = c_ptr2; pout3 = c_ptr3; pout4 = c_ptr4; pout5 = c_ptr5; pout6 = c_ptr6; pout7 = c_ptr7; c_ptr0 = cout0; c_ptr1 = cout1; c_ptr2 = cout2; c_ptr3 = cout3; c_ptr4 = cout4; c_ptr5 = cout5; c_ptr6 = cout6; c_ptr7 = cout7; if (has_beta) { for (int i = 0; i < remain; ++i) { cout0[i] = pout0[i]; cout1[i] = pout1[i]; cout2[i] = pout2[i]; cout3[i] = pout3[i]; cout4[i] = pout4[i]; cout5[i] = pout5[i]; cout6[i] = pout6[i]; cout7[i] = pout7[i]; } } } const float *a_ptr = a_ptr_l; int tail = tail_pre; int k = k_pre; // clang-format off asm volatile( "prfm pldl1keep, [%[a_ptr]]\n" /* preload a*/ "ldp q2, q3, [%[bias_ptr]]\n" /* load bias to q2, q3*/ "dup v8.4s, v2.s[0]\n" /* out0 = 0 */ "dup v9.4s, v2.s[0]\n" /* out1 = 0*/ "dup v10.4s, v2.s[0]\n" /* out2 = 0*/ "prfm pldl1keep, [%[b_ptr]]\n" /* preload b*/ "dup v11.4s, v2.s[1]\n" /* out3 = 0*/ "dup v12.4s, v2.s[1]\n" /* out4 = 0*/ "prfm pldl1keep, [%[b_ptr], #64]\n" /* preload b*/ "dup v13.4s, v2.s[1]\n" /* out5 = 0*/ "prfm pldl1keep, [%[a_ptr], #64]\n" /* preload a*/ "dup v14.4s, v2.s[2]\n" /* out6 = 0*/ "prfm pldl1keep, [%[b_ptr], #128]\n" /* preload b*/ "dup v15.4s, v2.s[2]\n" /* out7 = 0*/ "prfm pldl1keep, [%[a_ptr], #128]\n" /* preload a*/ "dup v16.4s, v2.s[2]\n" /* out8 = 0*/ "prfm pldl1keep, [%[b_ptr], #192]\n" /* preload b*/ "dup v17.4s, v2.s[3]\n" /* out9 = 0*/ "prfm pldl1keep, [%[b_ptr], #256]\n" /* preload b*/ "dup v18.4s, v2.s[3]\n" /* out10 = 0*/ "prfm pldl1keep, [%[a_ptr], #192]\n" /* preload a*/ "dup v19.4s, v2.s[3]\n" /* out11 = 0*/ "prfm pldl1keep, [%[b_ptr], #320]\n" /* preload b*/ "dup v20.4s, v3.s[0]\n" /* out12 = 0*/ "prfm pldl1keep, [%[a_ptr], #256]\n" /* preload a*/ "dup v21.4s, v3.s[0]\n" /* out13 = 0*/ "prfm pldl1keep, [%[b_ptr], #384]\n" /* preload b*/ "dup v22.4s, v3.s[0]\n" /* out14 = 0*/ "dup v23.4s, v3.s[1]\n" /* out15 = 0*/ "dup v24.4s, v3.s[1]\n" /* out16 = 0*/ "dup v25.4s, v3.s[1]\n" /* out17 = 0*/ "dup v26.4s, v3.s[2]\n" /* out18 = 0*/ "dup 
v27.4s, v3.s[2]\n" /* out19 = 0*/ "dup v28.4s, v3.s[2]\n" /* out20 = 0*/ "dup v29.4s, v3.s[3]\n" /* out21 = 0*/ "dup v30.4s, v3.s[3]\n" /* out22 = 0*/ "dup v31.4s, v3.s[3]\n" /* out23 = 0*/ "cbz %w[has_beta], 0f\n" /* check beta == 0? */ /* process beta */ "dup v7.4s, %w[beta]\n" /* beta to vector */ "ld1 {v0.4s, v1.4s, v2.4s}, [%[c_ptr0]]\n" /* load output r0 */ "ld1 {v3.4s, v4.4s, v5.4s}, [%[c_ptr1]]\n" /* load output r1 */ "fmla v8.4s, v0.4s, v7.4s\n" /* cr00 += beta * c_r00*/ "fmla v9.4s, v1.4s, v7.4s\n" /* cr01 += beta * c_r01*/ "fmla v10.4s, v2.4s, v7.4s\n" /* cr02 += beta * c_r02*/ "ld1 {v0.4s, v1.4s, v2.4s}, [%[c_ptr2]]\n" /* load output r2*/ "fmla v11.4s, v3.4s, v7.4s\n" /* cr10 += beta * c_r10*/ "fmla v12.4s, v4.4s, v7.4s\n" /* cr11 += beta * c_r11*/ "fmla v13.4s, v5.4s, v7.4s\n" /* cr12 += beta * c_r12*/ "ld1 {v3.4s, v4.4s, v5.4s}, [%[c_ptr3]]\n" /* load output r3*/ "fmla v14.4s, v0.4s, v7.4s\n" /* cr20 += beta * c_r20*/ "fmla v15.4s, v1.4s, v7.4s\n" /* cr21 += beta * c_r21*/ "fmla v16.4s, v2.4s, v7.4s\n" /* cr22 += beta * c_r22*/ "ld1 {v0.4s, v1.4s, v2.4s}, [%[c_ptr4]]\n" /* load output r4*/ "fmla v17.4s, v3.4s, v7.4s\n" /* cr30 += beta * c_r30*/ "fmla v18.4s, v4.4s, v7.4s\n" /* cr31 += beta * c_r31*/ "fmla v19.4s, v5.4s, v7.4s\n" /* cr32 += beta * c_r32*/ "ld1 {v3.4s, v4.4s, v5.4s}, [%[c_ptr5]]\n" /* load output r5*/ "fmla v20.4s, v0.4s, v7.4s\n" /* cr40 += beta * c_r40*/ "fmla v21.4s, v1.4s, v7.4s\n" /* cr41 += beta * c_r41*/ "fmla v22.4s, v2.4s, v7.4s\n" /* cr42 += beta * c_r42*/ "ld1 {v0.4s, v1.4s, v2.4s}, [%[c_ptr6]]\n" /* load output r6*/ "fmla v23.4s, v3.4s, v7.4s\n" /* cr50 += beta * c_r50*/ "fmla v24.4s, v4.4s, v7.4s\n" /* cr51 += beta * c_r51*/ "fmla v25.4s, v5.4s, v7.4s\n" /* cr52 += beta * c_r52*/ "ld1 {v3.4s, v4.4s, v5.4s}, [%[c_ptr7]]\n" /* load output r7*/ "fmla v26.4s, v0.4s, v7.4s\n" /* cr60 += beta * c_r60*/ "fmla v27.4s, v1.4s, v7.4s\n" /* cr61 += beta * c_r61*/ "fmla v28.4s, v2.4s, v7.4s\n" /* cr62 += beta * c_r62*/ "fmla v29.4s, v3.4s, v7.4s\n" /* cr70 += beta * c_r70*/ "fmla v30.4s, v4.4s, v7.4s\n" /* cr71 += beta * c_r71*/ "fmla v31.4s, v5.4s, v7.4s\n" /* cr72 += beta * c_r72*/ "0: \n" /* check loop count */ "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00,a01 to q0, q1*/ "ldp q4, q5, [%[b_ptr]], #32\n" /* load b0, b1 to q4, q5*/ "cbz %w[k], 2f\n" /* check loop count > 0 */ /* main loop */ /* unrool 0*/ "1:\n" /* main loop */ "fmla v8.4s , v4.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 =q4 */ "fmla v11.4s , v4.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 =q4 */ "ldp q6, q7, [%[b_ptr]], #32\n" /* load b2, b0 to q6, q7 */ "fmla v14.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 =q4 */ "fmla v17.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 =q4 */ "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q3, q4 */ "fmla v20.4s, v4.4s, v1.s[0]\n" /* out4 = b0 * a01[0], b0 =q4 */ "fmla v23.4s, v4.4s, v1.s[1]\n" /* out5 = b0 * a01[1], b0 =q4 */ "fmla v26.4s, v4.4s, v1.s[2]\n" /* out6 = b0 * a01[2], b0 =q4 */ "fmla v29.4s, v4.4s, v1.s[3]\n" /* out7 = b0 * a01[3], b0 =q4 */ "fmla v9.4s, v5.4s, v0.s[0]\n" /* out8 = b1 * a00[0], b1 =q5 */ "fmla v12.4s, v5.4s, v0.s[1]\n" /* out9 = b1 * a00[1], b1 =q5 */ "fmla v15.4s, v5.4s, v0.s[2]\n" /* out10 = b1 * a00[2], b1 =q5*/ "fmla v18.4s, v5.4s, v0.s[3]\n" /* out11 = b1 * a00[3], b1 =q5*/ "fmla v21.4s, v5.4s, v1.s[0]\n" /* out12 = b1 * a01[0], b1 =q5*/ "fmla v24.4s, v5.4s, v1.s[1]\n" /* out13 = b1 * a01[1], b1 =q5*/ "fmla v27.4s, v5.4s, v1.s[2]\n" /* out14 = b1 * a01[2], b1 =q5*/ "fmla v30.4s, v5.4s, v1.s[3]\n" /* out15 = b1 * a01[3], b1 
=q5*/ "ldp q4, q5, [%[b_ptr]], #32\n" /* load b1, b2 to q4, q5 */ "fmla v10.4s, v6.4s, v0.s[0]\n" /* out16 = b2 * a00[0], b2 =q6*/ "fmla v13.4s, v6.4s, v0.s[1]\n" /* out17 = b2 * a00[1], b2 =q6*/ "prfm pldl1keep, [%[b_ptr], #384]\n" "fmla v16.4s, v6.4s, v0.s[2]\n" /* out18 = b2 * a00[2], b2 =q6*/ "fmla v19.4s, v6.4s, v0.s[3]\n" /* out19 = b2 * a00[3], b2 =q6*/ "fmla v22.4s, v6.4s, v1.s[0]\n" /* out20 = b2 * a00[0], b2 =q6*/ "fmla v25.4s, v6.4s, v1.s[1]\n" /* out21 = b2 * a00[1], b2 =q6*/ "fmla v28.4s, v6.4s, v1.s[2]\n" /* out22 = b2 * a00[2], b2 =q6*/ "fmla v31.4s, v6.4s, v1.s[3]\n" /* out23 = b2 * a00[3], b2 =q6*/ "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a01 to q0, q1 */ /* unrool 1 */ "fmla v8.4s , v7.4s, v2.s[0]\n" /* out0 = b0 * a10[0], b0 =q7 */ "fmla v11.4s , v7.4s, v2.s[1]\n" /* out1 = b0 * a10[1], b0 =q7 */ "fmla v14.4s, v7.4s, v2.s[2]\n" /* out2 = b0 * a10[2], b0 =q7 */ "prfm pldl1keep, [%[a_ptr], #256]\n" "fmla v17.4s, v7.4s, v2.s[3]\n" /* out3 = b0 * a10[3], b0 =q7 */ "fmla v20.4s, v7.4s, v3.s[0]\n" /* out4 = b0 * a11[0], b0 =q7 */ "fmla v23.4s, v7.4s, v3.s[1]\n" /* out5 = b0 * a11[1], b0 = q7*/ "fmla v26.4s, v7.4s, v3.s[2]\n" /* out6 = b0 * a11[2], b0 =q7 */ "fmla v29.4s, v7.4s, v3.s[3]\n" /* out7 = b0 * a11[3], b0 =q7 */ "ldp q6, q7, [%[b_ptr]], #32\n" /* load b0, b1 to q6, q7 */ "fmla v9.4s, v4.4s, v2.s[0]\n" /* out8 = b0 * a10[0], b1 =q4 */ "fmla v12.4s, v4.4s, v2.s[1]\n" /* out9 = b0 * a10[1], b1 =q4 */ "fmla v15.4s, v4.4s, v2.s[2]\n" /* out10 = b1 * a10[2], b1 =q4*/ "fmla v18.4s, v4.4s, v2.s[3]\n" /* out11 = b1 * a10[3], b1 =q4*/ "fmla v21.4s, v4.4s, v3.s[0]\n" /* out12 = b1 * a10[0], b1 =q4*/ "fmla v24.4s, v4.4s, v3.s[1]\n" /* out13 = b1 * a10[1], b1 =q4*/ "fmla v27.4s, v4.4s, v3.s[2]\n" /* out14 = b1 * a10[2], b1 =q4*/ "fmla v30.4s, v4.4s, v3.s[3]\n" /* out15 = b1 * a10[3], b1 =q4*/ "fmla v10.4s, v5.4s, v2.s[0]\n" /* out16 = b2 * a10[0], b2 =q5*/ "fmla v13.4s, v5.4s, v2.s[1]\n" /* out17 = b2 * a10[0], b2 =q5*/ "fmla v16.4s, v5.4s, v2.s[2]\n" /* out18 = b2 * a10[0], b2 =q5*/ "fmla v19.4s, v5.4s, v2.s[3]\n" /* out19 = b2 * a10[0], b2 =q5*/ "fmla v22.4s, v5.4s, v3.s[0]\n" /* out20 = b2 * a10[0], b2 =q5*/ "fmla v25.4s, v5.4s, v3.s[1]\n" /* out21 = b2 * a10[0], b2 =q5*/ "fmla v28.4s, v5.4s, v3.s[2]\n" /* out22 = b2 * a10[0], b2 =q5*/ "fmla v31.4s, v5.4s, v3.s[3]\n" /* out23 = b2 * a10[0], b2 =q5*/ "ldp q4, q5, [%[b_ptr]], #32\n" /* load b2, b0 to q4, q5 */ /* unrool 2*/ "fmla v8.4s , v6.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 =q6 */ "fmla v11.4s , v6.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 =q6 */ "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q3, q4*/ "fmla v14.4s, v6.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 =q6*/ "fmla v17.4s, v6.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 =q6*/ "fmla v20.4s, v6.4s, v1.s[0]\n" /* out4 = b0 * a01[0], b0 =q6*/ "fmla v23.4s, v6.4s, v1.s[1]\n" /* out5 = b0 * a01[1], b0 =q6*/ "fmla v26.4s, v6.4s, v1.s[2]\n" /* out6 = b0 * a01[2], b0 =q6*/ "fmla v29.4s, v6.4s, v1.s[3]\n" /* out7 = b0 * a01[3], b0 =q6*/ "fmla v9.4s, v7.4s, v0.s[0]\n" /* out8 = b1 * a00[0], b1 =q7*/ "fmla v12.4s, v7.4s, v0.s[1]\n" /* out9 = b1 * a00[1], b1 =q7*/ "prfm pldl1keep, [%[b_ptr], #384]\n" "fmla v15.4s, v7.4s, v0.s[2]\n" /* out10 = b1 * a00[2], b1 =q7*/ "fmla v18.4s, v7.4s, v0.s[3]\n" /* out11 = b1 * a00[3], b1 =q7*/ "fmla v21.4s, v7.4s, v1.s[0]\n" /* out12 = b1 * a01[0], b1 =q7*/ "fmla v24.4s, v7.4s, v1.s[1]\n" /* out13 = b1 * a01[1], b1 =q7*/ "fmla v27.4s, v7.4s, v1.s[2]\n" /* out14 = b1 * a01[2], b1 =q7*/ "fmla v30.4s, v7.4s, v1.s[3]\n" /* out15 = b1 * a01[3], b1 
=q7*/ "ldp q6, q7, [%[b_ptr]], #32\n" /* load b1, b2 to q6, q7*/ "fmla v10.4s, v4.4s, v0.s[0]\n" /* out16 = b2 * a00[0], b2 =q4*/ "fmla v13.4s, v4.4s, v0.s[1]\n" /* out17 = b2 * a00[1], b2 =q4*/ "fmla v16.4s, v4.4s, v0.s[2]\n" /* out18 = b2 * a00[2], b2 =q4*/ "fmla v19.4s, v4.4s, v0.s[3]\n" /* out19 = b2 * a00[3], b2 =q4*/ "fmla v22.4s, v4.4s, v1.s[0]\n" /* out20 = b2 * a00[0], b2 =q4*/ "fmla v25.4s, v4.4s, v1.s[1]\n" /* out21 = b2 * a00[1], b2 =q4*/ "fmla v28.4s, v4.4s, v1.s[2]\n" /* out22 = b2 * a00[2], b2 =q4*/ "fmla v31.4s, v4.4s, v1.s[3]\n" /* out23 = b2 * a00[3], b2 =q4*/ "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a01 to q0, q1*/ /* unrool 3*/ "fmla v8.4s , v5.4s, v2.s[0]\n" /* out0 = b0 * a10[0], b0 =q5*/ "fmla v11.4s , v5.4s, v2.s[1]\n" /* out1 = b0 * a10[1], b0 =q5*/ "fmla v14.4s, v5.4s, v2.s[2]\n" /* out2 = b0 * a10[2], b0 =q5*/ "fmla v17.4s, v5.4s, v2.s[3]\n" /* out3 = b0 * a10[3], b0 =q5*/ "fmla v20.4s, v5.4s, v3.s[0]\n" /* out4 = b0 * a11[0], b0 =q5*/ "fmla v23.4s, v5.4s, v3.s[1]\n" /* out5 = b0 * a11[1], b0 =q5*/ "fmla v26.4s, v5.4s, v3.s[2]\n" /* out6 = b0 * a11[2], b0 =q5*/ "fmla v29.4s, v5.4s, v3.s[3]\n" /* out7 = b0 * a11[3], b0 =q5*/ "ldp q4, q5, [%[b_ptr]], #32\n" /* load b0, b1 to q4, q5*/ "fmla v9.4s, v6.4s, v2.s[0]\n" /* out8 = b0 * a10[0], b1 =q6*/ "fmla v12.4s, v6.4s, v2.s[1]\n" /* out9 = b0 * a10[1], b1 =q6*/ "prfm pldl1keep, [%[a_ptr], #256]\n" "fmla v15.4s, v6.4s, v2.s[2]\n" /* out10 = b1 * a10[2], b1 =q6*/ "fmla v18.4s, v6.4s, v2.s[3]\n" /* out11 = b1 * a10[3], b1 =q6*/ "fmla v21.4s, v6.4s, v3.s[0]\n" /* out12 = b1 * a10[0], b1 =q6*/ "fmla v24.4s, v6.4s, v3.s[1]\n" /* out13 = b1 * a10[1], b1 =q6*/ "fmla v27.4s, v6.4s, v3.s[2]\n" /* out14 = b1 * a10[2], b1 =q6*/ "prfm pldl1keep, [%[b_ptr], #384]\n" "fmla v30.4s, v6.4s, v3.s[3]\n" /* out15 = b1 * a10[3], b1 =q6*/ "fmla v10.4s, v7.4s, v2.s[0]\n" /* out16 = b2 * a10[0], b2 =q7*/ "fmla v13.4s, v7.4s, v2.s[1]\n" /* out17 = b2 * a10[0], b2 =q7*/ "fmla v16.4s, v7.4s, v2.s[2]\n" /* out18 = b2 * a10[0], b2 =q7*/ "fmla v19.4s, v7.4s, v2.s[3]\n" /* out19 = b2 * a10[0], b2 =q7*/ "fmla v22.4s, v7.4s, v3.s[0]\n" /* out20 = b2 * a10[0], b2 =q7*/ "fmla v25.4s, v7.4s, v3.s[1]\n" /* out21 = b2 * a10[0], b2 =q7*/ "subs %w[k], %w[k], #1\n" /* loop count - 1*/ "fmla v28.4s, v7.4s, v3.s[2]\n" /* out22 = b2 * a10[0], b2 =q7*/ "fmla v31.4s, v7.4s, v3.s[3]\n" /* out23 = b2 * a10[0], b2 =q7*/ "bne 1b\n" "2:\n" /* process tail*/ "subs %w[tail], %w[tail], #1\n" /* tail--*/ "beq 3f\n" /*jump to tail = 1*/ /* final unrool 0*/ /* unrool 0, tail > 1*/ "fmla v8.4s , v4.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 =q4*/ "fmla v11.4s , v4.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 =q4*/ "ldp q6, q7, [%[b_ptr]], #32\n" /* load b2, b0 to q6, q7*/ "fmla v14.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 =q4*/ "fmla v17.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 =q4*/ "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q2, q3*/ "fmla v20.4s, v4.4s, v1.s[0]\n" /* out4 = b0 * a01[0], b0 =q4*/ "fmla v23.4s, v4.4s, v1.s[1]\n" /* out5 = b0 * a01[1], b0 =q4*/ "fmla v26.4s, v4.4s, v1.s[2]\n" /* out6 = b0 * a01[2], b0 =q4*/ "fmla v29.4s, v4.4s, v1.s[3]\n" /* out7 = b0 * a01[3], b0 =q4*/ "subs %w[tail], %w[tail], #1\n" /* tail--*/ "fmla v9.4s, v5.4s, v0.s[0]\n" /* out8 = b1 * a00[0], b1 =q5*/ "fmla v12.4s, v5.4s, v0.s[1]\n" /* out9 = b1 * a00[1], b1 =q5*/ "fmla v15.4s, v5.4s, v0.s[2]\n" /* out10 = b1 * a00[2], b1 =q5*/ "fmla v18.4s, v5.4s, v0.s[3]\n" /* out11 = b1 * a00[3], b1 =q5*/ "fmla v21.4s, v5.4s, v1.s[0]\n" /* out12 = b1 * a01[0], b1 =q5*/ "fmla 
v24.4s, v5.4s, v1.s[1]\n" /* out13 = b1 * a01[1], b1 =q5*/ "fmla v27.4s, v5.4s, v1.s[2]\n" /* out14 = b1 * a01[2], b1 =q5*/ "fmla v30.4s, v5.4s, v1.s[3]\n" /* out15 = b1 * a01[3], b1 =q5*/ "ldp q4, q5, [%[b_ptr]], #32\n" /* load b1, b2 to q4, q5*/ "fmla v10.4s, v6.4s, v0.s[0]\n" /* out16 = b2 * a00[0], b2 =q6*/ "fmla v13.4s, v6.4s, v0.s[1]\n" /* out17 = b2 * a00[1], b2 =q6*/ "fmla v16.4s, v6.4s, v0.s[2]\n" /* out18 = b2 * a00[2], b2 =q6*/ "fmla v19.4s, v6.4s, v0.s[3]\n" /* out19 = b2 * a00[3], b2 =q6*/ "fmla v22.4s, v6.4s, v1.s[0]\n" /* out20 = b2 * a00[0], b2 =q6*/ "fmla v25.4s, v6.4s, v1.s[1]\n" /* out21 = b2 * a00[1], b2 =q6*/ "fmla v28.4s, v6.4s, v1.s[2]\n" /* out22 = b2 * a00[2], b2 =q6*/ "fmla v31.4s, v6.4s, v1.s[3]\n" /* out23 = b2 * a00[3], b2 =q6*/ "beq 4f\n" /*jump to tail = 2*/ /* unrool 1, tail > 2*/ "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a01 to q0, q1*/ "fmla v8.4s , v7.4s, v2.s[0]\n" /* out0 = b0 * a10[0], b0 =q7*/ "fmla v11.4s , v7.4s, v2.s[1]\n" /* out1 = b0 * a10[1], b0 =q7*/ "fmla v14.4s, v7.4s, v2.s[2]\n" /* out2 = b0 * a10[2], b0 =q7*/ "fmla v17.4s, v7.4s, v2.s[3]\n" /* out3 = b0 * a10[3], b0 =q7*/ "fmla v20.4s, v7.4s, v3.s[0]\n" /* out4 = b0 * a11[0], b0 =q7*/ "fmla v23.4s, v7.4s, v3.s[1]\n" /* out5 = b0 * a11[1], b0 =q7*/ "fmla v26.4s, v7.4s, v3.s[2]\n" /* out6 = b0 * a11[2], b0 =q7*/ "fmla v29.4s, v7.4s, v3.s[3]\n" /* out7 = b0 * a11[3], b0 =q7*/ "ldp q6, q7, [%[b_ptr]], #32\n" /* load b0, b1 to q6, q7*/ "fmla v9.4s, v4.4s, v2.s[0]\n" /* out8 = b0 * a10[0], b1 =q4*/ "fmla v12.4s, v4.4s, v2.s[1]\n" /* out9 = b0 * a10[1], b1 =q4*/ "fmla v15.4s, v4.4s, v2.s[2]\n" /* out10 = b1 * a10[2], b1 =q4*/ "fmla v18.4s, v4.4s, v2.s[3]\n" /* out11 = b1 * a10[3], b1 =q4*/ "fmla v21.4s, v4.4s, v3.s[0]\n" /* out12 = b1 * a10[0], b1 =q4*/ "fmla v24.4s, v4.4s, v3.s[1]\n" /* out13 = b1 * a10[1], b1 =q4*/ "fmla v27.4s, v4.4s, v3.s[2]\n" /* out14 = b1 * a10[2], b1 =q4*/ "fmla v30.4s, v4.4s, v3.s[3]\n" /* out15 = b1 * a10[3], b1 =q4*/ "subs %w[tail], %w[tail], #1\n" /* tail--*/ "fmla v10.4s, v5.4s, v2.s[0]\n" /* out16 = b2 * a10[0], b2 =q5*/ "fmla v13.4s, v5.4s, v2.s[1]\n" /* out17 = b2 * a10[0], b2 =q5*/ "fmla v16.4s, v5.4s, v2.s[2]\n" /* out18 = b2 * a10[0], b2 =q5*/ "fmla v19.4s, v5.4s, v2.s[3]\n" /* out19 = b2 * a10[0], b2 =q5*/ "fmla v22.4s, v5.4s, v3.s[0]\n" /* out20 = b2 * a10[0], b2 =q5*/ "fmla v25.4s, v5.4s, v3.s[1]\n" /* out21 = b2 * a10[0], b2 =q5*/ "fmla v28.4s, v5.4s, v3.s[2]\n" /* out22 = b2 * a10[0], b2 =q5*/ "fmla v31.4s, v5.4s, v3.s[3]\n" /* out23 = b2 * a10[0], b2 =q5*/ "beq 5f\n" /*jump to tail = 3*/ /* unrool 2, tail = 4*/ "ldp q4, q5, [%[b_ptr]], #32\n" /* load b2, b0 to q4, q5*/ "fmla v8.4s , v6.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 =q6*/ "fmla v11.4s , v6.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 =q6*/ "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q3, q4*/ "fmla v14.4s, v6.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 =q6*/ "fmla v17.4s, v6.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 =q6*/ "fmla v20.4s, v6.4s, v1.s[0]\n" /* out4 = b0 * a01[0], b0 =q6*/ "fmla v23.4s, v6.4s, v1.s[1]\n" /* out5 = b0 * a01[1], b0 =q6*/ "fmla v26.4s, v6.4s, v1.s[2]\n" /* out6 = b0 * a01[2], b0 =q6*/ "fmla v29.4s, v6.4s, v1.s[3]\n" /* out7 = b0 * a01[3], b0 =q6*/ "fmla v9.4s, v7.4s, v0.s[0]\n" /* out8 = b1 * a00[0], b1 =q7*/ "fmla v12.4s, v7.4s, v0.s[1]\n" /* out9 = b1 * a00[1], b1 =q7*/ "fmla v15.4s, v7.4s, v0.s[2]\n" /* out10 = b1 * a00[2], b1 =q7*/ "fmla v18.4s, v7.4s, v0.s[3]\n" /* out11 = b1 * a00[3], b1 =q7*/ "fmla v21.4s, v7.4s, v1.s[0]\n" /* out12 = b1 * a01[0], b1 =q7*/ "fmla 
v24.4s, v7.4s, v1.s[1]\n" /* out13 = b1 * a01[1], b1 =q7*/ "fmla v27.4s, v7.4s, v1.s[2]\n" /* out14 = b1 * a01[2], b1 =q7*/ "fmla v30.4s, v7.4s, v1.s[3]\n" /* out15 = b1 * a01[3], b1 =q7*/ "ldp q6, q7, [%[b_ptr]], #32\n" /* load b1, b2 to q6, q7*/ "fmla v10.4s, v4.4s, v0.s[0]\n" /* out16 = b2 * a00[0], b2 =q4*/ "fmla v13.4s, v4.4s, v0.s[1]\n" /* out17 = b2 * a00[1], b2 =q4*/ "fmla v16.4s, v4.4s, v0.s[2]\n" /* out18 = b2 * a00[2], b2 =q4*/ "fmla v19.4s, v4.4s, v0.s[3]\n" /* out19 = b2 * a00[3], b2 =q4*/ "fmla v22.4s, v4.4s, v1.s[0]\n" /* out20 = b2 * a00[0], b2 =q4*/ "fmla v25.4s, v4.4s, v1.s[1]\n" /* out21 = b2 * a00[1], b2 =q4*/ "fmla v28.4s, v4.4s, v1.s[2]\n" /* out22 = b2 * a00[2], b2 =q4*/ "fmla v31.4s, v4.4s, v1.s[3]\n" /* out23 = b2 * a00[3], b2 =q4*/ /* unrool 3, tail = 4*/ "fmla v8.4s , v5.4s, v2.s[0]\n" /* out0 = b0 * a10[0], b0 =q5*/ "fmla v11.4s , v5.4s, v2.s[1]\n" /* out1 = b0 * a10[1], b0 =q5*/ "fmla v14.4s, v5.4s, v2.s[2]\n" /* out2 = b0 * a10[2], b0 =q5*/ "fmla v17.4s, v5.4s, v2.s[3]\n" /* out3 = b0 * a10[3], b0 =q5*/ "fmla v20.4s, v5.4s, v3.s[0]\n" /* out4 = b0 * a11[0], b0 =q5*/ "fmla v23.4s, v5.4s, v3.s[1]\n" /* out5 = b0 * a11[1], b0 =q5*/ "fmla v26.4s, v5.4s, v3.s[2]\n" /* out6 = b0 * a11[2], b0 =q5*/ "fmla v29.4s, v5.4s, v3.s[3]\n" /* out7 = b0 * a11[3], b0 =q5*/ "fmla v9.4s, v6.4s, v2.s[0]\n" /* out8 = b0 * a10[0], b1 =q6*/ "fmla v12.4s, v6.4s, v2.s[1]\n" /* out9 = b1 * a10[1], b1 =q6*/ "fmla v15.4s, v6.4s, v2.s[2]\n" /* out10 = b1 * a10[2], b1 =q6*/ "fmla v18.4s, v6.4s, v2.s[3]\n" /* out11 = b1 * a10[3], b1 =q6*/ "fmla v21.4s, v6.4s, v3.s[0]\n" /* out12 = b1 * a10[0], b1 =q6*/ "fmla v24.4s, v6.4s, v3.s[1]\n" /* out13 = b1 * a10[1], b1 =q6*/ "fmla v27.4s, v6.4s, v3.s[2]\n" /* out14 = b1 * a10[2], b1 =q6*/ "fmla v30.4s, v6.4s, v3.s[3]\n" /* out15 = b1 * a10[3], b1 =q6*/ "fmla v10.4s, v7.4s, v2.s[0]\n" /* out16 = b2 * a10[0], b2 =q7*/ "fmla v13.4s, v7.4s, v2.s[1]\n" /* out17 = b2 * a10[0], b2 =q7*/ "fmla v16.4s, v7.4s, v2.s[2]\n" /* out18 = b2 * a10[0], b2 =q7*/ "fmla v19.4s, v7.4s, v2.s[3]\n" /* out19 = b2 * a10[0], b2 =q7*/ "fmla v22.4s, v7.4s, v3.s[0]\n" /* out20 = b2 * a10[0], b2 =q7*/ "fmla v25.4s, v7.4s, v3.s[1]\n" /* out21 = b2 * a10[0], b2 =q7*/ "fmla v28.4s, v7.4s, v3.s[2]\n" /* out22 = b2 * a10[0], b2 =q7*/ "fmla v31.4s, v7.4s, v3.s[3]\n" /* out23 = b2 * a10[0], b2 =q7*/ "b 11f\n" /* tails==1 final tail*/ "3: \n" /* tail=1*/ "ldr q6, [%[b_ptr]], #16\n" /* load b2 to q6*/ "fmla v8.4s , v4.4s, v0.s[0]\n" /* out0 = b0 * a10[0], b0 =q5*/ "fmla v11.4s , v4.4s, v0.s[1]\n" /* out1 = b0 * a10[1], b0 =q5*/ "fmla v14.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a10[2], b0 =q5*/ "fmla v17.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a10[3], b0 =q5*/ "fmla v20.4s, v4.4s, v1.s[0]\n" /* out4 = b0 * a11[0], b0 =q5*/ "fmla v23.4s, v4.4s, v1.s[1]\n" /* out5 = b0 * a11[1], b0 = q5*/ "fmla v26.4s, v4.4s, v1.s[2]\n" /* out6 = b0 * a11[2], b0 =q5*/ "fmla v29.4s, v4.4s, v1.s[3]\n" /* out7 = b0 * a11[3], b0 =q5*/ "fmla v9.4s, v5.4s, v0.s[0]\n" /* out8 = b0 * a10[0], b1 =q6*/ "fmla v12.4s, v5.4s, v0.s[1]\n" /* out9 = b1 * a10[1], b1 =q6*/ "fmla v15.4s, v5.4s, v0.s[2]\n" /* out10 = b1 * a10[2], b1 =q6*/ "fmla v18.4s, v5.4s, v0.s[3]\n" /* out11 = b1 * a10[3], b1 =q6*/ "fmla v21.4s, v5.4s, v1.s[0]\n" /* out12 = b1 * a10[0], b1 =q6*/ "fmla v24.4s, v5.4s, v1.s[1]\n" /* out13 = b1 * a10[1], b1 =q6*/ "fmla v27.4s, v5.4s, v1.s[2]\n" /* out14 = b1 * a10[2], b1 =q6*/ "fmla v30.4s, v5.4s, v1.s[3]\n" /* out15 = b1 * a10[3], b1 =q6*/ "fmla v10.4s, v6.4s, v0.s[0]\n" /* out16 = b2 * a10[0], b2 =q7*/ "fmla v13.4s, 
v6.4s, v0.s[1]\n" /* out17 = b2 * a10[0], b2 =q7*/ "fmla v16.4s, v6.4s, v0.s[2]\n" /* out18 = b2 * a10[0], b2 =q7*/ "fmla v19.4s, v6.4s, v0.s[3]\n" /* out19 = b2 * a10[0], b2 =q7*/ "fmla v22.4s, v6.4s, v1.s[0]\n" /* out20 = b2 * a10[0], b2 =q7*/ "fmla v25.4s, v6.4s, v1.s[1]\n" /* out21 = b2 * a10[0], b2 =q7*/ "fmla v28.4s, v6.4s, v1.s[2]\n" /* out22 = b2 * a10[0], b2 =q7*/ "fmla v31.4s, v6.4s, v1.s[3]\n" /* out23 = b2 * a10[0], b2 =q7*/ "b 11f\n" /* tails==2 final tail*/ "4:\n" /* tail = 2*/ "fmla v8.4s , v7.4s, v2.s[0]\n" /* out0 = b0 * a10[0], b0 =q5*/ "fmla v11.4s , v7.4s, v2.s[1]\n" /* out1 = b0 * a10[1], b0 =q5*/ "fmla v14.4s, v7.4s, v2.s[2]\n" /* out2 = b0 * a10[2], b0 =q5*/ "fmla v17.4s, v7.4s, v2.s[3]\n" /* out3 = b0 * a10[3], b0 =q5*/ "fmla v20.4s, v7.4s, v3.s[0]\n" /* out4 = b0 * a11[0], b0 =q5*/ "fmla v23.4s, v7.4s, v3.s[1]\n" /* out5 = b0 * a11[1], b0 = q5*/ "fmla v26.4s, v7.4s, v3.s[2]\n" /* out6 = b0 * a11[2], b0 =q5*/ "fmla v29.4s, v7.4s, v3.s[3]\n" /* out7 = b0 * a11[3], b0 =q5*/ "fmla v9.4s, v4.4s, v2.s[0]\n" /* out8 = b0 * a10[0], b1 =q6*/ "fmla v12.4s, v4.4s, v2.s[1]\n" /* out9 = b1 * a10[1], b1 =q6*/ "fmla v15.4s, v4.4s, v2.s[2]\n" /* out10 = b1 * a10[2], b1 =q6*/ "fmla v18.4s, v4.4s, v2.s[3]\n" /* out11 = b1 * a10[3], b1 =q6*/ "fmla v21.4s, v4.4s, v3.s[0]\n" /* out12 = b1 * a10[0], b1 =q6*/ "fmla v24.4s, v4.4s, v3.s[1]\n" /* out13 = b1 * a10[1], b1 =q6*/ "fmla v27.4s, v4.4s, v3.s[2]\n" /* out14 = b1 * a10[2], b1 =q6*/ "fmla v30.4s, v4.4s, v3.s[3]\n" /* out15 = b1 * a10[3], b1 =q6*/ "fmla v10.4s, v5.4s, v2.s[0]\n" /* out16 = b2 * a10[0], b2 =q7*/ "fmla v13.4s, v5.4s, v2.s[1]\n" /* out17 = b2 * a10[0], b2 =q7*/ "fmla v16.4s, v5.4s, v2.s[2]\n" /* out18 = b2 * a10[0], b2 =q7*/ "fmla v19.4s, v5.4s, v2.s[3]\n" /* out19 = b2 * a10[0], b2 =q7*/ "fmla v22.4s, v5.4s, v3.s[0]\n" /* out20 = b2 * a10[0], b2 =q7*/ "fmla v25.4s, v5.4s, v3.s[1]\n" /* out21 = b2 * a10[0], b2 =q7*/ "fmla v28.4s, v5.4s, v3.s[2]\n" /* out22 = b2 * a10[0], b2 =q7*/ "fmla v31.4s, v5.4s, v3.s[3]\n" /* out23 = b2 * a10[0], b2 =q7*/ "b 11f\n" /* tails==3 final tail*/ "5:\n" /* tail = 3*/ "ldr q4, [%[b_ptr]], #16\n" /* load b2, b0 to q4*/ "fmla v8.4s , v6.4s, v0.s[0]\n" /* out0 = b0 * a10[0], b0 =q5*/ "fmla v11.4s , v6.4s, v0.s[1]\n" /* out1 = b0 * a10[1], b0 =q5*/ "fmla v14.4s, v6.4s, v0.s[2]\n" /* out2 = b0 * a10[2], b0 =q5*/ "fmla v17.4s, v6.4s, v0.s[3]\n" /* out3 = b0 * a10[3], b0 =q5*/ "fmla v20.4s, v6.4s, v1.s[0]\n" /* out4 = b0 * a11[0], b0 =q5*/ "fmla v23.4s, v6.4s, v1.s[1]\n" /* out5 = b0 * a11[1], b0 = q5*/ "fmla v26.4s, v6.4s, v1.s[2]\n" /* out6 = b0 * a11[2], b0 =q5*/ "fmla v29.4s, v6.4s, v1.s[3]\n" /* out7 = b0 * a11[3], b0 =q5*/ "fmla v9.4s, v7.4s, v0.s[0]\n" /* out8 = b0 * a10[0], b1 =q6*/ "fmla v12.4s, v7.4s, v0.s[1]\n" /* out9 = b1 * a10[1], b1 =q6*/ "fmla v15.4s, v7.4s, v0.s[2]\n" /* out10 = b1 * a10[2], b1 =q6*/ "fmla v18.4s, v7.4s, v0.s[3]\n" /* out11 = b1 * a10[3], b1 =q6*/ "fmla v21.4s, v7.4s, v1.s[0]\n" /* out12 = b1 * a10[0], b1 =q6*/ "fmla v24.4s, v7.4s, v1.s[1]\n" /* out13 = b1 * a10[1], b1 =q6*/ "fmla v27.4s, v7.4s, v1.s[2]\n" /* out14 = b1 * a10[2], b1 =q6*/ "fmla v30.4s, v7.4s, v1.s[3]\n" /* out15 = b1 * a10[3], b1 =q6*/ "fmla v10.4s, v4.4s, v0.s[0]\n" /* out16 = b2 * a10[0], b2 =q7*/ "fmla v13.4s, v4.4s, v0.s[1]\n" /* out17 = b2 * a10[0], b2 =q7*/ "fmla v16.4s, v4.4s, v0.s[2]\n" /* out18 = b2 * a10[0], b2 =q7*/ "fmla v19.4s, v4.4s, v0.s[3]\n" /* out19 = b2 * a10[0], b2 =q7*/ "fmla v22.4s, v4.4s, v1.s[0]\n" /* out20 = b2 * a10[0], b2 =q7*/ "fmla v25.4s, v4.4s, v1.s[1]\n" /* out21 = 
b2 * a10[0], b2 =q7*/ "fmla v28.4s, v4.4s, v1.s[2]\n" /* out22 = b2 * a10[0], b2 =q7*/ "fmla v31.4s, v4.4s, v1.s[3]\n" /* out23 = b2 * a10[0], b2 =q7*/ "11: \n" /* check activation */ "cmp %w[flag_act], #1 \n" /* check if has relu */ "bne 12f \n" /* jump if no relu */ "movi v0.4s, #0 \n" /* for relu*/ "fmax v8.4s, v8.4s, v0.4s \n" /* relu*/ "fmax v9.4s, v9.4s, v0.4s \n" /* relu*/ "fmax v10.4s, v10.4s, v0.4s \n" /* relu*/ "fmax v11.4s, v11.4s, v0.4s \n" /* relu*/ "fmax v12.4s, v12.4s, v0.4s \n" /* relu*/ "fmax v13.4s, v13.4s, v0.4s \n" /* relu*/ "fmax v14.4s, v14.4s, v0.4s \n" /* relu*/ "fmax v15.4s, v15.4s, v0.4s \n" /* relu*/ "fmax v16.4s, v16.4s, v0.4s \n" /* relu*/ "fmax v17.4s, v17.4s, v0.4s \n" /* relu*/ "fmax v18.4s, v18.4s, v0.4s \n" /* relu*/ "fmax v19.4s, v19.4s, v0.4s \n" /* relu*/ "fmax v20.4s, v20.4s, v0.4s \n" /* relu*/ "fmax v21.4s, v21.4s, v0.4s \n" /* relu*/ "fmax v22.4s, v22.4s, v0.4s \n" /* relu*/ "fmax v23.4s, v23.4s, v0.4s \n" /* relu*/ "fmax v24.4s, v24.4s, v0.4s \n" /* relu*/ "fmax v25.4s, v25.4s, v0.4s \n" /* relu*/ "fmax v26.4s, v26.4s, v0.4s \n" /* relu*/ "fmax v27.4s, v27.4s, v0.4s \n" /* relu*/ "fmax v28.4s, v28.4s, v0.4s \n" /* relu*/ "fmax v29.4s, v29.4s, v0.4s \n" /* relu*/ "fmax v30.4s, v30.4s, v0.4s \n" /* relu*/ "fmax v31.4s, v31.4s, v0.4s \n" /* relu*/ "b 20f \n" /* relu end */ //! no act "12: \n" /* no relu */ "cmp %w[flag_act], #0 \n" /* check no act */ "beq 20f \n" /* no act end */ //! relu6 "cmp %w[flag_act], #2 \n" /* check if has relu6 */ "bne 13f \n" /* jump if no relu6 */ "movi v0.4s, #0 \n" /* for relu6 */ "ld1 {v1.4s}, [%[alpha]] \n" /* relu6 alpha */ "fmax v8.4s, v8.4s, v0.4s \n" /* relu6 */ "fmax v9.4s, v9.4s, v0.4s \n" /* relu6 */ "fmax v10.4s, v10.4s, v0.4s \n" /* relu6 */ "fmax v11.4s, v11.4s, v0.4s \n" /* relu6 */ "fmax v12.4s, v12.4s, v0.4s \n" /* relu6 */ "fmax v13.4s, v13.4s, v0.4s \n" /* relu6 */ "fmax v14.4s, v14.4s, v0.4s \n" /* relu6 */ "fmax v15.4s, v15.4s, v0.4s \n" /* relu6 */ "fmax v16.4s, v16.4s, v0.4s \n" /* relu6 */ "fmax v17.4s, v17.4s, v0.4s \n" /* relu6 */ "fmax v18.4s, v18.4s, v0.4s \n" /* relu6 */ "fmax v19.4s, v19.4s, v0.4s \n" /* relu6 */ "fmax v20.4s, v20.4s, v0.4s \n" /* relu6 */ "fmax v21.4s, v21.4s, v0.4s \n" /* relu6 */ "fmax v22.4s, v22.4s, v0.4s \n" /* relu6 */ "fmax v23.4s, v23.4s, v0.4s \n" /* relu6 */ "fmax v24.4s, v24.4s, v0.4s \n" /* relu6 */ "fmax v25.4s, v25.4s, v0.4s \n" /* relu6 */ "fmax v26.4s, v26.4s, v0.4s \n" /* relu6 */ "fmax v27.4s, v27.4s, v0.4s \n" /* relu6 */ "fmax v28.4s, v28.4s, v0.4s \n" /* relu6 */ "fmax v29.4s, v29.4s, v0.4s \n" /* relu6 */ "fmax v30.4s, v30.4s, v0.4s \n" /* relu6 */ "fmax v31.4s, v31.4s, v0.4s \n" /* relu6 */ "fmin v8.4s, v8.4s, v1.4s \n" /* relu6 */ "fmin v9.4s, v9.4s, v1.4s \n" /* relu6 */ "fmin v10.4s, v10.4s, v1.4s \n" /* relu6 */ "fmin v11.4s, v11.4s, v1.4s \n" /* relu6 */ "fmin v12.4s, v12.4s, v1.4s \n" /* relu6 */ "fmin v13.4s, v13.4s, v1.4s \n" /* relu6 */ "fmin v14.4s, v14.4s, v1.4s \n" /* relu6 */ "fmin v15.4s, v15.4s, v1.4s \n" /* relu6 */ "fmin v16.4s, v16.4s, v1.4s \n" /* relu6 */ "fmin v17.4s, v17.4s, v1.4s \n" /* relu6 */ "fmin v18.4s, v18.4s, v1.4s \n" /* relu6 */ "fmin v19.4s, v19.4s, v1.4s \n" /* relu6 */ "fmin v20.4s, v20.4s, v1.4s \n" /* relu6 */ "fmin v21.4s, v21.4s, v1.4s \n" /* relu6 */ "fmin v22.4s, v22.4s, v1.4s \n" /* relu6 */ "fmin v23.4s, v23.4s, v1.4s \n" /* relu6 */ "fmin v24.4s, v24.4s, v1.4s \n" /* relu6 */ "fmin v25.4s, v25.4s, v1.4s \n" /* relu6 */ "fmin v26.4s, v26.4s, v1.4s \n" /* relu6 */ "fmin v27.4s, v27.4s, v1.4s \n" /* relu6 */ 
"fmin v28.4s, v28.4s, v1.4s \n" /* relu6 */ "fmin v29.4s, v29.4s, v1.4s \n" /* relu6 */ "fmin v30.4s, v30.4s, v1.4s \n" /* relu6 */ "fmin v31.4s, v31.4s, v1.4s \n" /* relu6 */ "b 20f \n" /* relu6 end */ //! leakey relu "13: \n" /* otherwise is leakey relu */ "movi v0.4s, #0 \n" /* for leakey relu */ "ld1 {v1.4s}, [%[alpha]] \n" /* leakey relu alpha */ "fcmge v2.4s, v8.4s, v0.4s \n" /* vcgeq_f32 */ "fmul v3.4s, v8.4s, v1.4s \n" /* vmulq_f32 */ "fcmge v4.4s, v9.4s, v0.4s \n" /* vcgeq_f32 */ "fmul v5.4s, v9.4s, v1.4s \n" /* vmulq_f32 */ "fcmge v6.4s, v10.4s, v0.4s \n" /* vcgeq_f32 */ "fmul v7.4s, v10.4s, v1.4s \n" /* vmulq_f32 */ "bif v8.16b, v3.16b, v2.16b \n" /* choose*/ "bif v9.16b, v5.16b, v4.16b \n" /* choose*/ "bif v10.16b, v7.16b, v6.16b \n" /* choose*/ "fcmge v2.4s, v11.4s, v0.4s \n" /* vcgeq_f32 */ "fmul v3.4s, v11.4s, v1.4s \n" /* vmulq_f32 */ "bif v11.16b, v3.16b, v2.16b \n" /* choose*/ "fcmge v2.4s, v12.4s, v0.4s \n" /* vcgeq_f32 */ "fmul v3.4s, v12.4s, v1.4s \n" /* vmulq_f32 */ "fcmge v4.4s, v13.4s, v0.4s \n" /* vcgeq_f32 */ "fmul v5.4s, v13.4s, v1.4s \n" /* vmulq_f32 */ "fcmge v6.4s, v14.4s, v0.4s \n" /* vcgeq_f32 */ "fmul v7.4s, v14.4s, v1.4s \n" /* vmulq_f32 */ "bif v12.16b, v3.16b, v2.16b \n" /* choose*/ "bif v13.16b, v5.16b, v4.16b \n" /* choose*/ "bif v14.16b, v7.16b, v6.16b \n" /* choose*/ "fcmge v2.4s, v15.4s, v0.4s \n" /* vcgeq_f32 */ "fmul v3.4s, v15.4s, v1.4s \n" /* vmulq_f32 */ "bif v15.16b, v3.16b, v2.16b \n" /* choose*/ "fcmge v2.4s, v16.4s, v0.4s \n" /* vcgeq_f32 */ "fmul v3.4s, v16.4s, v1.4s \n" /* vmulq_f32 */ "fcmge v4.4s, v17.4s, v0.4s \n" /* vcgeq_f32 */ "fmul v5.4s, v17.4s, v1.4s \n" /* vmulq_f32 */ "fcmge v6.4s, v18.4s, v0.4s \n" /* vcgeq_f32 */ "fmul v7.4s, v18.4s, v1.4s \n" /* vmulq_f32 */ "bif v16.16b, v3.16b, v2.16b \n" /* choose*/ "bif v17.16b, v5.16b, v4.16b \n" /* choose*/ "bif v18.16b, v7.16b, v6.16b \n" /* choose*/ "fcmge v2.4s, v19.4s, v0.4s \n" /* vcgeq_f32 */ "fmul v3.4s, v19.4s, v1.4s \n" /* vmulq_f32 */ "bif v19.16b, v3.16b, v2.16b \n" /* choose*/ "fcmge v2.4s, v20.4s, v0.4s \n" /* vcgeq_f32 */ "fmul v3.4s, v20.4s, v1.4s \n" /* vmulq_f32 */ "fcmge v4.4s, v21.4s, v0.4s \n" /* vcgeq_f32 */ "fmul v5.4s, v21.4s, v1.4s \n" /* vmulq_f32 */ "fcmge v6.4s, v22.4s, v0.4s \n" /* vcgeq_f32 */ "fmul v7.4s, v22.4s, v1.4s \n" /* vmulq_f32 */ "bif v20.16b, v3.16b, v2.16b \n" /* choose*/ "bif v21.16b, v5.16b, v4.16b \n" /* choose*/ "bif v22.16b, v7.16b, v6.16b \n" /* choose*/ "fcmge v2.4s, v23.4s, v0.4s \n" /* vcgeq_f32 */ "fmul v3.4s, v23.4s, v1.4s \n" /* vmulq_f32 */ "bif v23.16b, v3.16b, v2.16b \n" /* choose*/ "fcmge v2.4s, v24.4s, v0.4s \n" /* vcgeq_f32 */ "fmul v3.4s, v24.4s, v1.4s \n" /* vmulq_f32 */ "fcmge v4.4s, v25.4s, v0.4s \n" /* vcgeq_f32 */ "fmul v5.4s, v25.4s, v1.4s \n" /* vmulq_f32 */ "fcmge v6.4s, v26.4s, v0.4s \n" /* vcgeq_f32 */ "fmul v7.4s, v26.4s, v1.4s \n" /* vmulq_f32 */ "bif v24.16b, v3.16b, v2.16b \n" /* choose*/ "bif v25.16b, v5.16b, v4.16b \n" /* choose*/ "bif v26.16b, v7.16b, v6.16b \n" /* choose*/ "fcmge v2.4s, v27.4s, v0.4s \n" /* vcgeq_f32 */ "fmul v3.4s, v27.4s, v1.4s \n" /* vmulq_f32 */ "bif v27.16b, v3.16b, v2.16b \n" /* choose*/ "fcmge v2.4s, v28.4s, v0.4s \n" /* vcgeq_f32 */ "fmul v3.4s, v28.4s, v1.4s \n" /* vmulq_f32 */ "fcmge v4.4s, v29.4s, v0.4s \n" /* vcgeq_f32 */ "fmul v5.4s, v29.4s, v1.4s \n" /* vmulq_f32 */ "fcmge v6.4s, v30.4s, v0.4s \n" /* vcgeq_f32 */ "fmul v7.4s, v30.4s, v1.4s \n" /* vmulq_f32 */ "bif v28.16b, v3.16b, v2.16b \n" /* choose*/ "bif v29.16b, v5.16b, v4.16b \n" /* choose*/ "bif v30.16b, v7.16b, v6.16b 
\n" /* choose*/ "fcmge v2.4s, v31.4s, v0.4s \n" /* vcgeq_f32 */ "fmul v3.4s, v31.4s, v1.4s \n" /* vmulq_f32 */ "bif v31.16b, v3.16b, v2.16b \n" /* choose*/ "20: \n" /* act end */ "st1 {v8.4s, v9.4s, v10.4s},[%[c_ptr0]], #48\n" /* store r0 */ "st1 {v11.4s, v12.4s, v13.4s},[%[c_ptr1]], #48\n" /* store r1 */ "st1 {v14.4s, v15.4s, v16.4s},[%[c_ptr2]], #48\n" /* store r2 */ "st1 {v17.4s, v18.4s, v19.4s},[%[c_ptr3]], #48\n" /* store r3 */ "st1 {v20.4s, v21.4s, v22.4s},[%[c_ptr4]], #48\n" /* store r4 */ "st1 {v23.4s, v24.4s, v25.4s},[%[c_ptr5]], #48\n" /* store r5 */ "st1 {v26.4s, v27.4s, v28.4s},[%[c_ptr6]], #48\n" /* store r6 */ "st1 {v29.4s, v30.4s, v31.4s},[%[c_ptr7]], #48\n" /* store r7 */ : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [k] "+r"(k), [tail] "+r"(tail), [c_ptr0] "+r"(c_ptr0), [c_ptr1] "+r"(c_ptr1), [c_ptr2] "+r"(c_ptr2), [c_ptr3] "+r"(c_ptr3), [c_ptr4] "+r"(c_ptr4), [c_ptr5] "+r"(c_ptr5), [c_ptr6] "+r"(c_ptr6), [c_ptr7] "+r"(c_ptr7) : [bias_ptr] "r"(bias_local), [has_beta] "r"(has_beta), [beta] "r"(beta), [alpha] "r"(alpha), [flag_act] "r"(flag_act) : "cc","memory", "v0","v1","v2","v3","v4","v5","v6","v7", "v8","v9","v10","v11","v12","v13", "v14","v15","v16","v17","v18","v19", "v20","v21","v22","v23","v24","v25", "v26","v27","v28","v29","v30","v31"); // clang-format on if (flag_p_remain && (xb == bblocks - 1)) { for (int i = 0; i < remain; ++i) { *pout0++ = cout0[i]; *pout1++ = cout1[i]; *pout2++ = cout2[i]; *pout3++ = cout3[i]; *pout4++ = cout4[i]; *pout5++ = cout5[i]; *pout6++ = cout6[i]; *pout7++ = cout7[i]; } } } } } } void sgemm_prepacked_4x8(bool is_transB, int M, int N, int K, const float *A_packed, const float *B, int ldb, float beta, float *C, int ldc, const float *bias, bool has_bias, const operators::ActivationParam act_param, ARMContext *ctx) { size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024; auto *workspace = ctx->workspace_data(); int threads = ctx->threads(); auto act_type = act_param.active_type; float alpha[4] = {0.f, 0.f, 0.f, 0.f}; int flag_act = 0x00; // relu: 1, relu6: 2, leaky: 4 const int n_block = 8; const int m_block = 4; if (act_param.has_active) { if (act_type == lite_api::ActivationType::kRelu) { flag_act = 0x01; } else if (act_type == lite_api::ActivationType::kRelu6) { flag_act = 0x02; float local_alpha = act_param.Relu_clipped_coef; alpha[0] = local_alpha; alpha[1] = local_alpha; alpha[2] = local_alpha; alpha[3] = local_alpha; } else if (act_type == lite_api::ActivationType::kLeakyRelu) { flag_act = 0x03; float local_alpha = act_param.Leaky_relu_alpha; alpha[0] = local_alpha; alpha[1] = local_alpha; alpha[2] = local_alpha; alpha[3] = local_alpha; } } float32x4_t valpha = vld1q_f32(alpha); float32x4_t vzero = vdupq_n_f32(0.f); //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 int x_block = (l2_cache - (m_block * K)) / (sizeof(float) * (K + m_block)); x_block /= n_block; x_block *= n_block; int x_num = (N + (x_block - 1)) / x_block; x_block = (N + x_num - 1) / x_num; x_block = (x_block + n_block - 1) / n_block; x_block *= n_block; x_block = x_block < n_block ? n_block : x_block; int k_pre = ((K + KBLOCK - 1) / KBLOCK) - 1; int tail_pre = (K & (KBLOCK - 1)); if (tail_pre == 0) { tail_pre = KBLOCK; } bool flag_p_remain = false; int remain = 0; int has_beta = fabsf(beta) > 1e-8f ? 1 : 0; //! 
apanel is pre_compute outside gemm for (unsigned int x0 = 0; x0 < N; x0 += x_block) { unsigned int xmax = x0 + x_block; if (xmax > N) { xmax = N; } int bblocks = (xmax - x0 + n_block - 1) / n_block; remain = xmax - x0 - (bblocks - 1) * n_block; if (remain > 0) { flag_p_remain = true; } //! load bpanel auto b_pannel = static_cast(workspace); if (is_transB) { loadb_trans_eight(b_pannel, B, ldb, 0, K, x0, xmax); } else { loadb_eight(b_pannel, B, ldb, 0, K, x0, xmax); } #pragma omp parallel for num_threads(threads) for (unsigned int y = 0; y < M; y += m_block) { unsigned int ymax = y + m_block; if (ymax > M) { ymax = M; } float cout0[n_block]; // NOLINT float cout1[n_block]; // NOLINT float cout2[n_block]; // NOLINT; float cout3[n_block]; // NOLINT float bias_local[4] = {0}; if (has_bias) { bias_local[0] = bias[y]; bias_local[1] = bias[y + 1]; bias_local[2] = bias[y + 2]; bias_local[3] = bias[y + 3]; } float *c_ptr0 = C + y * ldc + x0; float *c_ptr1 = c_ptr0 + ldc; float *c_ptr2 = c_ptr1 + ldc; float *c_ptr3 = c_ptr2 + ldc; float *pout0 = c_ptr0; float *pout1 = c_ptr1; float *pout2 = c_ptr2; float *pout3 = c_ptr3; const float *a_ptr_l = A_packed + y * K; const float *b_ptr = b_pannel; for (int xb = 0; xb < bblocks; xb++) { if ((y + 3) >= ymax) { switch ((y + 3) - ymax) { case 2: c_ptr1 = cout1; case 1: c_ptr2 = cout1; case 0: c_ptr3 = cout1; default: break; } } if (flag_p_remain && (xb == bblocks - 1)) { pout0 = c_ptr0; pout1 = c_ptr1; pout2 = c_ptr2; pout3 = c_ptr3; c_ptr0 = cout0; c_ptr1 = cout1; c_ptr2 = cout2; c_ptr3 = cout3; if (has_beta) { for (int i = 0; i < remain; ++i) { cout0[i] = pout0[i]; cout1[i] = pout1[i]; cout2[i] = pout2[i]; cout3[i] = pout3[i]; } } } const float *a_ptr = a_ptr_l; int tails = tail_pre; int k = k_pre; // clang-format off asm volatile( "ld1 {v2.4s}, [%[bias_ptr]]\n" "dup v8.4s, v2.s[0]\n" "prfm pldl1keep, [%[a_ptr]]\n" "dup v9.4s, v2.s[0]\n" "prfm pldl1keep, [%[b_ptr]]\n" "dup v10.4s, v2.s[1]\n" "dup v11.4s, v2.s[1]\n" "prfm pldl1keep, [%[a_ptr], #64]\n" "dup v12.4s, v2.s[2]\n" "dup v13.4s, v2.s[2]\n" "prfm pldl1keep, [%[b_ptr], #64]\n" "dup v14.4s, v2.s[3]\n" "dup v15.4s, v2.s[3]\n" "prfm pldl1keep, [%[a_ptr], #128]\n" "cmp %w[beta], #0\n" // check beta == 0? 
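// Accumulator layout for this 4x8 micro-kernel: each output row is eight
// floats held in two q-registers, {v8,v9} = row0, {v10,v11} = row1,
// {v12,v13} = row2, {v14,v15} = row3, each pre-seeded with that row's bias.
// When beta != 0 the block below first folds the existing C tile in, i.e.
// acc = bias + beta * C_old, before the K-dimension FMA loop begins.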
"prfm pldl1keep, [%[b_ptr], #128]\n" "prfm pldl1keep, [%[b_ptr], #192]\n" // process beta "beq 11f\n" "dup v7.4s, %w[beta]\n" // beta to vector "ld1 {v0.4s, v1.4s}, [%[c_ptr0]]\n" // load output r0 "ld1 {v2.4s, v3.4s}, [%[c_ptr1]]\n" // load output r1 "fmla v8.4s, v0.4s, v7.4s\n" // cr00 += beta * c_r00 "fmla v9.4s, v1.4s, v7.4s\n" // cr01 += beta * c_r01 "ld1 {v0.4s, v1.4s}, [%[c_ptr2]]\n" // load output r2 "fmla v10.4s, v2.4s, v7.4s\n" // cr10 += beta * c_r10 "fmla v11.4s, v3.4s, v7.4s\n" // cr11 += beta * c_r11 "ld1 {v2.4s, v3.4s}, [%[c_ptr3]]\n" // load output r3 "fmla v12.4s, v0.4s, v7.4s\n" // cr20 += beta * c_r20 "fmla v13.4s, v1.4s, v7.4s\n" // cr21 += beta * c_r21 "fmla v14.4s, v2.4s, v7.4s\n" // cr30 += beta * c_r30 "fmla v15.4s, v3.4s, v7.4s\n" // cr31 += beta * c_r31 "11: \n" // check loop count "ldp q0, q1, [%[a_ptr]], #32\n" // load a0~a3 to q0, q1 "ldp q4, q5, [%[b_ptr]], #32\n" // load b0~b3 to q4, q5 "cbz %w[k], 0f\n" // check loop count > 0 // main loop // Unroll 0 "1: \n" "fmla v8.4s, v4.4s, v0.s[0]\n" // out0 += b0 * a0[0] "fmla v10.4s, v4.4s, v0.s[1]\n" // out1 += b0 * a0[1] "ldp q6, q7, [%[b_ptr]], #32\n" // load next b1, b2 "fmla v12.4s, v4.4s, v0.s[2]\n" // out2 += b0 * a0[2] "fmla v14.4s, v4.4s, v0.s[3]\n" // out3 += b0 * a0[3] "ldp q2, q3, [%[a_ptr]], #32\n" // load next 2xa0~a3 "fmla v9.4s, v5.4s, v0.s[0]\n" // out4 += b1 * a0[0] "fmla v11.4s, v5.4s, v0.s[1]\n" // out5 += b1 * a0[1] "fmla v13.4s, v5.4s, v0.s[2]\n" // out6 += b1 * a0[2] "fmla v15.4s, v5.4s, v0.s[3]\n" // out7 += b1 * a0[3] "ldp q4, q5, [%[b_ptr]], #32\n" // load b0~b3 to q4, q5 // Unroll 1 "fmla v8.4s, v6.4s, v1.s[0]\n" // out0 += b0 * a0[0] "prfm pldl1keep, [%[b_ptr], #192]\n" "fmla v10.4s, v6.4s, v1.s[1]\n" // out1 += b0 * a0[1] "fmla v12.4s, v6.4s, v1.s[2]\n" // out1 += b0 * a0[2] "fmla v14.4s, v6.4s, v1.s[3]\n" // out1 += b0 * a0[3] "fmla v9.4s, v7.4s, v1.s[0]\n" // out4 += b1 * a0[0] "fmla v11.4s, v7.4s, v1.s[1]\n" // out5 += b1 * a0[1] "fmla v13.4s, v7.4s, v1.s[2]\n" // out6 += b1 * a0[2] "fmla v15.4s, v7.4s, v1.s[3]\n" // out7 += b1 * a0[3] "ldp q6, q7, [%[b_ptr]], #32\n" // load next b1, b2 // Unroll 2 "fmla v8.4s, v4.4s, v2.s[0]\n" // out0 += b0 * a0[0] "ldp q0, q1, [%[a_ptr]], #32\n" // load a0~a3 to q0, q1 "fmla v10.4s, v4.4s, v2.s[1]\n" // out1 += b0 * a0[1] "fmla v12.4s, v4.4s, v2.s[2]\n" // out1 += b0 * a0[2] "fmla v14.4s, v4.4s, v2.s[3]\n" // out1 += b0 * a0[3] "fmla v9.4s, v5.4s, v2.s[0]\n" // out4 += b1 * a0[0] "fmla v11.4s, v5.4s, v2.s[1]\n" // out5 += b1 * a0[1] "fmla v13.4s, v5.4s, v2.s[2]\n" // out6 += b1 * a0[2] "fmla v15.4s, v5.4s, v2.s[3]\n" // out7 += b1 * a0[3] "ldp q4, q5, [%[b_ptr]], #32\n" // load b0~b3 to q4, q5 // Unroll 3 "fmla v8.4s, v6.4s, v3.s[0]\n" // out0 += b0 * a0[0] "prfm pldl1keep, [%[a_ptr], #128]\n" "fmla v10.4s, v6.4s, v3.s[1]\n" // out1 += b0 * a0[1] "fmla v12.4s, v6.4s, v3.s[2]\n" // out1 += b0 * a0[2] "fmla v14.4s, v6.4s, v3.s[3]\n" // out1 += b0 * a0[3] "subs %w[k], %w[k], #1\n" // loop count - 1 "fmla v9.4s, v7.4s, v3.s[0]\n" // out4 += b1 * a0[0] "fmla v11.4s, v7.4s, v3.s[1]\n" // out5 += b1 * a0[1] "fmla v13.4s, v7.4s, v3.s[2]\n" // out6 += b1 * a0[2] "fmla v15.4s, v7.4s, v3.s[3]\n" // out7 += b1 * a0[3] "bne 1b\n" "0: \n" "subs %w[tail], %w[tail], #1\n" // tail-- "beq 3f\n" // jump to tail = 1 // Unroll 0 "ldp q6, q7, [%[b_ptr]], #32\n" // load next b1, b2 "fmla v8.4s, v4.4s, v0.s[0]\n" // out0 += b0 * a0[0] "fmla v10.4s, v4.4s, v0.s[1]\n" // out1 += b0 * a0[1] "subs %w[tail], %w[tail], #1\n" // tail-- "fmla v12.4s, v4.4s, v0.s[2]\n" // out2 += 
b0 * a0[2] "fmla v14.4s, v4.4s, v0.s[3]\n" // out3 += b0 * a0[3] "fmla v9.4s, v5.4s, v0.s[0]\n" // out4 += b1 * a0[0] "fmla v11.4s, v5.4s, v0.s[1]\n" // out5 += b1 * a0[1] "fmla v13.4s, v5.4s, v0.s[2]\n" // out6 += b1 * a0[2] "fmla v15.4s, v5.4s, v0.s[3]\n" // out7 += b1 * a0[3] "beq 4f\n" // jump to tail = 2 // Unroll 1 "ldp q4, q5, [%[b_ptr]], #32\n" // load b0~b3 to q4, q5 "fmla v8.4s, v6.4s, v1.s[0]\n" // out0 += b0 * a0[0] "ldp q2, q3, [%[a_ptr]], #32\n" // load next 2xa0~a3 "fmla v10.4s, v6.4s, v1.s[1]\n" // out1 += b0 * a0[1] "subs %w[tail], %w[tail], #1\n" // tail--*/ "fmla v12.4s, v6.4s, v1.s[2]\n" // out1 += b0 * a0[2] "fmla v14.4s, v6.4s, v1.s[3]\n" // out1 += b0 * a0[3] "fmla v9.4s, v7.4s, v1.s[0]\n" // out4 += b1 * a0[0] "fmla v11.4s, v7.4s, v1.s[1]\n" // out5 += b1 * a0[1] "fmla v13.4s, v7.4s, v1.s[2]\n" // out6 += b1 * a0[2] "fmla v15.4s, v7.4s, v1.s[3]\n" // out7 += b1 * a0[3] "beq 5f\n" // jump to tail = 3 // Unroll 2 "ldp q6, q7, [%[b_ptr]], #32\n" // load next b1, b2 "fmla v8.4s, v4.4s, v2.s[0]\n" // out0 += b0 * a0[0] "fmla v10.4s, v4.4s, v2.s[1]\n" // out1 += b0 * a0[1] "fmla v12.4s, v4.4s, v2.s[2]\n" // out2 += b0 * a0[2] "fmla v14.4s, v4.4s, v2.s[3]\n" // out3 += b0 * a0[3] "fmla v9.4s, v5.4s, v2.s[0]\n" // out4 += b1 * a0[0] "fmla v11.4s, v5.4s, v2.s[1]\n" // out5 += b1 * a0[1] "fmla v13.4s, v5.4s, v2.s[2]\n" // out6 += b1 * a0[2] "fmla v15.4s, v5.4s, v2.s[3]\n" // out7 += b1 * a0[3] // Unroll 3 "fmla v8.4s, v6.4s, v3.s[0]\n" // out0 += b0 * a0[0] "fmla v10.4s, v6.4s, v3.s[1]\n" // out1 += b0 * a0[1] "fmla v12.4s, v6.4s, v3.s[2]\n" // out1 += b0 * a0[2] "fmla v14.4s, v6.4s, v3.s[3]\n" // out1 += b0 * a0[3] "fmla v9.4s, v7.4s, v3.s[0]\n" // out4 += b1 * a0[0] "fmla v11.4s, v7.4s, v3.s[1]\n" // out5 += b1 * a0[1] "fmla v13.4s, v7.4s, v3.s[2]\n" // out6 += b1 * a0[2] "fmla v15.4s, v7.4s, v3.s[3]\n" // out7 += b1 * a0[3] "b 2f\n" // tails==1 final tail "3: \n" "fmla v8.4s, v4.4s, v0.s[0]\n" // out0 += b0 * a0[0] "fmla v10.4s, v4.4s, v0.s[1]\n" // out1 += b0 * a0[1] "fmla v12.4s, v4.4s, v0.s[2]\n" // out2 += b0 * a0[2] "fmla v14.4s, v4.4s, v0.s[3]\n" // out3 += b0 * a0[3] "fmla v9.4s, v5.4s, v0.s[0]\n" // out4 += b1 * a0[0] "fmla v11.4s, v5.4s, v0.s[1]\n" // out5 += b1 * a0[1] "fmla v13.4s, v5.4s, v0.s[2]\n" // out6 += b1 * a0[2] "fmla v15.4s, v5.4s, v0.s[3]\n" // out7 += b1 * a0[3] // aptr - 16 "sub %w[a_ptr], %w[a_ptr], #16\n" "b 2f\n" "4: \n" // tails==2 final tail "fmla v8.4s, v6.4s, v1.s[0]\n" // out0 += b0 * a0[0] "fmla v10.4s, v6.4s, v1.s[1]\n" // out1 += b0 * a0[1] "fmla v12.4s, v6.4s, v1.s[2]\n" // out1 += b0 * a0[2] "fmla v14.4s, v6.4s, v1.s[3]\n" // out1 += b0 * a0[3] "fmla v9.4s, v7.4s, v1.s[0]\n" // out4 += b1 * a0[0] "fmla v11.4s, v7.4s, v1.s[1]\n" // out5 += b1 * a0[1] "fmla v13.4s, v7.4s, v1.s[2]\n" // out6 += b1 * a0[2] "fmla v15.4s, v7.4s, v1.s[3]\n" // out7 += b1 * a0[3] "b 2f\n" // tails==3 final tail "5: \n" "fmla v8.4s, v4.4s, v2.s[0]\n" // out0 += b0 * a0[0] "fmla v10.4s, v4.4s, v2.s[1]\n" // out1 += b0 * a0[1] "fmla v12.4s, v4.4s, v2.s[2]\n" // out2 += b0 * a0[2] "fmla v14.4s, v4.4s, v2.s[3]\n" // out3 += b0 * a0[3] "fmla v9.4s, v5.4s, v2.s[0]\n" // out4 += b1 * a0[0] "fmla v11.4s, v5.4s, v2.s[1]\n" // out5 += b1 * a0[1] "fmla v13.4s, v5.4s, v2.s[2]\n" // out6 += b1 * a0[2] "fmla v15.4s, v5.4s, v2.s[3]\n" // out7 += b1 * a0[3] // aptr - 16 "sub %w[a_ptr], %w[a_ptr], #16\n" "2: \n" "cmp %w[flag_act], #0\n" // check if has act "beq 10f\n" "cmp %w[flag_act], #1\n" // check if has relu "bne 6f\n" "fmax v8.4s, v8.4s, %[vzero].4s\n" "fmax v9.4s, v9.4s, 
%[vzero].4s\n" "fmax v10.4s, v10.4s, %[vzero].4s\n" "fmax v11.4s, v11.4s, %[vzero].4s\n" "fmax v12.4s, v12.4s, %[vzero].4s\n" "fmax v13.4s, v13.4s, %[vzero].4s\n" "fmax v14.4s, v14.4s, %[vzero].4s\n" "fmax v15.4s, v15.4s, %[vzero].4s\n" "b 10f\n" // end "6: \n" "cmp %w[flag_act], #2\n" // check relu6 "bne 7f\n" "fmax v8.4s, v8.4s, %[vzero].4s\n" "fmax v9.4s, v9.4s, %[vzero].4s\n" "fmax v10.4s, v10.4s, %[vzero].4s\n" "fmax v11.4s, v11.4s, %[vzero].4s\n" "fmax v12.4s, v12.4s, %[vzero].4s\n" "fmax v13.4s, v13.4s, %[vzero].4s\n" "fmax v14.4s, v14.4s, %[vzero].4s\n" "fmax v15.4s, v15.4s, %[vzero].4s\n" "fmin v8.4s, v8.4s, %[valpha].4s\n" "fmin v9.4s, v9.4s, %[valpha].4s\n" "fmin v10.4s, v10.4s, %[valpha].4s\n" "fmin v11.4s, v11.4s, %[valpha].4s\n" "fmin v12.4s, v12.4s, %[valpha].4s\n" "fmin v13.4s, v13.4s, %[valpha].4s\n" "fmin v14.4s, v14.4s, %[valpha].4s\n" "fmin v15.4s, v15.4s, %[valpha].4s\n" "b 10f\n" "7: \n" "fcmge v2.4s, v8.4s, %[vzero].4s\n" "fmul v3.4s, v8.4s, %[valpha].4s\n" "fcmge v4.4s, v9.4s, %[vzero].4s\n" "fmul v5.4s, v9.4s, %[valpha].4s\n" "fcmge v6.4s, v10.4s, %[vzero].4s\n" "fmul v7.4s, v10.4s, %[valpha].4s\n" "fcmge v0.4s, v11.4s, %[vzero].4s\n" "fmul v1.4s, v11.4s, %[valpha].4s\n" "bif v8.16b, v3.16b, v2.16b \n" "bif v9.16b, v5.16b, v4.16b \n" "bif v10.16b, v7.16b, v6.16b \n" "bif v11.16b, v1.16b, v0.16b \n" "fcmge v2.4s, v12.4s, %[vzero].4s\n" "fmul v3.4s, v12.4s, %[valpha].4s\n" "fcmge v4.4s, v13.4s, %[vzero].4s\n" "fmul v5.4s, v13.4s, v1.4s \n" "fcmge v6.4s, v14.4s, %[vzero].4s\n" "fmul v7.4s, v14.4s, %[valpha].4s\n" "fcmge v0.4s, v15.4s, %[vzero].4s\n" "fmul v1.4s, v15.4s, %[valpha].4s\n" "bif v12.16b, v3.16b, v2.16b \n" "bif v13.16b, v5.16b, v4.16b \n" "bif v14.16b, v7.16b, v6.16b \n" "bif v15.16b, v1.16b, v0.16b \n" "10: \n" "st1 {v8.4s, v9.4s},[%[c_ptr0]], #32\n" "st1 {v10.4s, v11.4s},[%[c_ptr1]], #32\n" "st1 {v12.4s, v13.4s},[%[c_ptr2]], #32\n" "st1 {v14.4s, v15.4s},[%[c_ptr3]], #32\n" : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr0] "+r"(c_ptr0), [c_ptr1] "+r"(c_ptr1), [c_ptr2] "+r"(c_ptr2), [c_ptr3] "+r"(c_ptr3), [k] "+r"(k), [tail] "+r"(tails) : [bias_ptr] "r"(bias_local), [beta] "r"(beta), [alpha] "r"(alpha), [flag_act] "r"(flag_act), [vzero] "w"(vzero), [valpha] "w"(valpha) : "cc","memory", "v0","v1","v2","v3","v4","v5","v6","v7", "v8","v9","v10","v11","v12","v13", "v14","v15"); // clang-format on if (flag_p_remain && (xb == bblocks - 1)) { for (int i = 0; i < remain; ++i) { *pout0++ = cout0[i]; *pout1++ = cout1[i]; *pout2++ = cout2[i]; *pout3++ = cout3[i]; } } } } } } void sgemm_prepacked_4x4(bool is_transB, int M, int N, int K, const float *A_packed, const float *B, int ldb, float beta, float *C, int ldc, const float *bias, bool has_bias, const operators::ActivationParam act_param, ARMContext *ctx) { size_t l2_cache = ctx->llc_size() > 0 ? 
ctx->llc_size() : 512 * 1024; auto workspace = ctx->workspace_data(); int threads = ctx->threads(); auto act_type = act_param.active_type; float alpha[4] = {0.f, 0.f, 0.f, 0.f}; int flag_act = 0x00; // relu: 1, relu6: 2, leakey: 3 if (act_param.has_active) { if (act_type == lite_api::ActivationType::kRelu) { flag_act = 0x01; } else if (act_type == lite_api::ActivationType::kRelu6) { flag_act = 0x02; float local_alpha = act_param.Relu_clipped_coef; alpha[0] = local_alpha; alpha[1] = local_alpha; alpha[2] = local_alpha; alpha[3] = local_alpha; } else if (act_type == lite_api::ActivationType::kLeakyRelu) { flag_act = 0x03; float local_alpha = act_param.Leaky_relu_alpha; alpha[0] = local_alpha; alpha[1] = local_alpha; alpha[2] = local_alpha; alpha[3] = local_alpha; } } const int n_block = 4; const int m_block = 4; //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 int x_block = (l2_cache - (m_block * K)) / (sizeof(float) * (K + m_block)); x_block /= n_block; x_block *= n_block; int x_num = (N + (x_block - 1)) / x_block; x_block = (N + x_num - 1) / x_num; x_block = (x_block + n_block - 1) / n_block; x_block *= n_block; x_block = x_block < n_block ? n_block : x_block; // unroll 2 loop int tail_pre = (K & (KBLOCK - 1)); int k_pre = ((K + KBLOCK - 1) / KBLOCK) - 1; if (tail_pre == 0) { tail_pre = KBLOCK; } bool flag_p_remain = false; int remain = 0; int has_beta = fabsf(beta) > 1e-8f ? 1 : 0; //! apanel is pre_compute outside gemm for (unsigned int x0 = 0; x0 < N; x0 += x_block) { unsigned int xmax = x0 + x_block; if (xmax > N) { xmax = N; } int bblocks = (xmax - x0 + n_block - 1) / n_block; remain = xmax - x0 - (bblocks - 1) * n_block; if (remain > 0) { flag_p_remain = true; } //! load bpanel float *b_pannel = workspace; if (is_transB) { pack_m4(b_pannel, B, 1.0f, ldb, x0, xmax, 0, K); } else { pack_trans_m4(b_pannel, B, 1.0f, ldb, x0, xmax, 0, K); } #pragma omp parallel for num_threads(threads) for (unsigned int y = 0; y < M; y += m_block) { unsigned int ymax = y + m_block; if (ymax > M) { ymax = M; } float bias_local[4] = {0}; if (has_bias) { bias_local[0] = bias[y]; bias_local[1] = bias[y + 1]; bias_local[2] = bias[y + 2]; bias_local[3] = bias[y + 3]; } float cout0[n_block]; // NOLINT float cout1[n_block]; // NOLINT float cout2[n_block]; // NOLINT float cout3[n_block]; // NOLINT float *c_ptr0 = C + y * ldc + x0; float *c_ptr1 = c_ptr0 + ldc; float *c_ptr2 = c_ptr1 + ldc; float *c_ptr3 = c_ptr2 + ldc; float *pout0 = c_ptr0; float *pout1 = c_ptr1; float *pout2 = c_ptr2; float *pout3 = c_ptr3; const float *a_ptr_l = A_packed + y * K; const float *b_ptr_l = b_pannel; for (int xb = 0; xb < bblocks; xb++) { if ((y + 3) >= ymax) { switch ((y + 3) - ymax) { case 2: c_ptr1 = cout1; case 1: c_ptr2 = cout2; case 0: c_ptr3 = cout3; default: break; } } if (flag_p_remain && (xb == bblocks - 1)) { pout0 = c_ptr0; pout1 = c_ptr1; pout2 = c_ptr2; pout3 = c_ptr3; c_ptr0 = cout0; c_ptr1 = cout1; c_ptr2 = cout2; c_ptr3 = cout3; if (has_beta) { for (int i = 0; i < remain; ++i) { cout0[i] = pout0[i]; cout1[i] = pout1[i]; cout2[i] = pout2[i]; cout3[i] = pout3[i]; } } } const float *a_ptr = a_ptr_l; const float *b_ptr = b_ptr_l + xb * K * 4; int tail = tail_pre; int k = k_pre; // clang-format off asm volatile( "prfm pldl1keep, [%[a_ptr]]\n" /* preload a*/ "ld1 {v2.4s}, [%[bias_ptr]]\n" /* load bias to q2, q3*/ "dup v8.4s, v2.s[0]\n" /* out0 = 0 */ "prfm pldl1keep, [%[b_ptr]]\n" /* preload b*/ "dup v9.4s, v2.s[1]\n" /* out1 = 0*/ "prfm pldl1keep, [%[a_ptr], #64]\n" /* preload a*/ "dup v10.4s, v2.s[2]\n" /* out2 
= 0*/ "prfm pldl1keep, [%[b_ptr], #64]\n" /* preload b*/ "dup v11.4s, v2.s[3]\n" /* out3 = 0*/ "cbz %w[has_beta], 0f\n" /* check beta == 0? */ /* process beta */ "dup v7.4s, %w[beta]\n" /* beta to vector */ "ld1 {v0.4s}, [%[c_ptr0]]\n" /* load output r0 */ "ld1 {v1.4s}, [%[c_ptr1]]\n" /* load output r1 */ "fmla v8.4s, v0.4s, v7.4s\n" /* cr00 += beta * c_r00*/ "fmla v9.4s, v1.4s, v7.4s\n" /* cr10 += beta * c_r10*/ "ld1 {v2.4s}, [%[c_ptr2]]\n" "ld1 {v3.4s}, [%[c_ptr3]]\n" "fmla v10.4s, v2.4s, v7.4s\n" /* cr20 += beta * c_r20*/ "fmla v11.4s, v3.4s, v7.4s\n" /* cr30 += beta * c_r30*/ "0: \n" /* check loop count */ "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00,a10 to q0, q1*/ "ldp q4, q5, [%[b_ptr]], #32\n" /* load b0, b1 to q4, q5*/ "cbz %w[k], 2f\n" /* check loop count > 0 */ /* main loop */ /* unrool 0*/ "1:\n" /* main loop */ "fmla v8.4s, v4.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 =q4 */ "fmla v9.4s, v4.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 =q4 */ "ldp q6, q7, [%[b_ptr]], #32\n" /* load b2, b3 to q6, q7 */ "fmla v10.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 =q4 */ "fmla v11.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 =q4 */ "ldp q2, q3, [%[a_ptr]], #32\n" /* load a20, a30 to q2, q3 */ "fmla v8.4s, v5.4s, v1.s[0]\n" /* out0 = b1 * a10[0], b1 =q5 */ "fmla v9.4s, v5.4s, v1.s[1]\n" /* out1 = b1 * a10[1], b1 =q5 */ "fmla v10.4s, v5.4s, v1.s[2]\n" /* out2 = b1 * a10[2], b1 =q5 */ "fmla v11.4s, v5.4s, v1.s[3]\n" /* out3 = b1 * a10[3], b1 =q5 */ "ldp q4, q5, [%[b_ptr]], #32\n" /* load b0, b1 to q4, q5*/ "fmla v8.4s, v6.4s, v2.s[0]\n" /* out0 = b2 * a20[0], b2 =q6 */ "fmla v9.4s, v6.4s, v2.s[1]\n" /* out1 = b2 * a20[1], b2 =q6 */ "fmla v10.4s, v6.4s, v2.s[2]\n" /* out2 = b2 * a20[2], b2 =q6*/ "fmla v11.4s, v6.4s, v2.s[3]\n" /* out3 = b2 * a20[3], b2 =q6*/ "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a10 to q0, q1 */ "fmla v8.4s, v7.4s, v3.s[0]\n" /* out0 = b3 * a30[0], b3 =q7*/ "fmla v9.4s, v7.4s, v3.s[1]\n" /* out1 = b3 * a30[1], b3 =q7*/ "subs %w[k], %w[k], #1\n" /* loop count - 1*/ "fmla v10.4s, v7.4s, v3.s[2]\n" /* out2 = b3 * a30[2], b3 =q7*/ "fmla v11.4s, v7.4s, v3.s[3]\n" /* out3 = b3 * a30[3], b3 =q7*/ "bne 1b\n" "2:\n" /* process tail*/ "subs %w[tail], %w[tail], #1\n" /* tail--*/ "beq 3f\n" /*jump to tail = 1*/ /* final unrool 0*/ /* unrool 0, tail > 1*/ "fmla v8.4s, v4.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 =q4 */ "fmla v9.4s, v4.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 =q4 */ "subs %w[tail], %w[tail], #1\n" /* tail--*/ "fmla v10.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 =q4 */ "fmla v11.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 =q4 */ "beq 4f\n" /*jump to tail = 2*/ /* unrool 1, tail > 2*/ "ldp q6, q7, [%[b_ptr]], #32\n" /* load b2, b3 to q6, q7 */ "fmla v8.4s, v5.4s, v1.s[0]\n" /* out0 = b1 * a10[0], b1 =q5 */ "fmla v9.4s, v5.4s, v1.s[1]\n" /* out1 = b1 * a10[1], b1 =q5*/ "subs %w[tail], %w[tail], #1\n" /* tail--*/ "fmla v10.4s, v5.4s, v1.s[2]\n" /* out2 = b1 * a10[2], b1 =q5 */ "fmla v11.4s, v5.4s, v1.s[3]\n" /* out3 = b1 * a10[3], b1 =q5 */ "ldp q2, q3, [%[a_ptr]], #32\n" /* load a20, a30 to q2, q3 */ "beq 5f\n" /*jump to tail = 3*/ /* unrool 2, tail = 4*/ "fmla v8.4s, v6.4s, v2.s[0]\n" /* out0 = b2 * a20[0], b1 =q6 */ "fmla v9.4s, v6.4s, v2.s[1]\n" /* out1 = b2 * a20[1], b1 =q6 */ "fmla v10.4s, v6.4s, v2.s[2]\n" /* out2 = b2 * a20[2], b1 =q6*/ "fmla v11.4s, v6.4s, v2.s[3]\n" /* out3 = b2 * a20[3], b1 =q6*/ /* unrool 3, tail = 4*/ "fmla v8.4s, v7.4s, v3.s[0]\n" /* out0 = b3 * a30[0], b3 =q7*/ "fmla v9.4s, v7.4s, v3.s[1]\n" /* out1 = b3 * a30[1], b3 =q7*/ "fmla 
v10.4s, v7.4s, v3.s[2]\n" /* out2 = b3 * a30[2], b3 =q7*/ "fmla v11.4s, v7.4s, v3.s[3]\n" /* out3 = b3 * a30[3], b3 =q7*/ "b 11f\n" /* tails==1 final tail*/ "3: \n" /* tail=1*/ "fmla v8.4s, v4.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 =q4 */ "fmla v9.4s, v4.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 =q4 */ "fmla v10.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 =q4 */ "fmla v11.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 =q4 */ "b 11f\n" /* tails==2 final tail*/ "4:\n" /* tail = 2*/ "fmla v8.4s, v5.4s, v1.s[0]\n" /* out0 = b1 * a10[0], b1 =q5 */ "fmla v9.4s, v5.4s, v1.s[1]\n" /* out1 = b1 * a10[1], b1 =q5*/ "fmla v10.4s, v5.4s, v1.s[2]\n" /* out2 = b1 * a10[2], b1 =q5 */ "fmla v11.4s, v5.4s, v1.s[3]\n" /* out3 = b1 * a10[3], b1 =q5 */ "b 11f\n" /* tails==3 final tail*/ "5:\n" /* tail = 3*/ "fmla v8.4s, v6.4s, v2.s[0]\n" /* out0 = b2 * a20[0], b1 =q6 */ "fmla v9.4s, v6.4s, v2.s[1]\n" /* out1 = b2 * a20[1], b1 =q6 */ "fmla v10.4s, v6.4s, v2.s[2]\n" /* out2 = b2 * a20[2], b1 =q6*/ "fmla v11.4s, v6.4s, v2.s[3]\n" /* out3 = b2 * a20[3], b1 =q6*/ "11: \n" /* check activation */ "cmp %w[flag_act], #1 \n" /* check if has relu */ "bne 12f \n" /* jump if no relu */ "movi v0.4s, #0 \n" /* for relu*/ "fmax v8.4s, v8.4s, v0.4s \n" /* relu*/ "fmax v9.4s, v9.4s, v0.4s \n" /* relu*/ "fmax v10.4s, v10.4s, v0.4s \n" /* relu*/ "fmax v11.4s, v11.4s, v0.4s \n" /* relu*/ "b 20f \n" /* relu end */ //! no act "12: \n" /* no relu */ "cmp %w[flag_act], #0 \n" /* check no act */ "beq 20f \n" /* no act end */ //! relu6 "cmp %w[flag_act], #2 \n" /* check if has relu6 */ "bne 13f \n" /* jump if no relu6 */ "movi v0.4s, #0 \n" /* for relu6 */ "ld1 {v1.4s}, [%[alpha]] \n" /* relu6 alpha */ "fmax v8.4s, v8.4s, v0.4s \n" /* relu6 */ "fmax v9.4s, v9.4s, v0.4s \n" /* relu6 */ "fmax v10.4s, v10.4s, v0.4s \n" /* relu6 */ "fmax v11.4s, v11.4s, v0.4s \n" /* relu6 */ "fmin v8.4s, v8.4s, v1.4s \n" /* relu6*/ "fmin v9.4s, v9.4s, v1.4s \n" /* relu6*/ "fmin v10.4s, v10.4s, v1.4s \n" /* relu6*/ "fmin v11.4s, v11.4s, v1.4s \n" /* relu6*/ "b 20f \n" /* relu6 end */ //! 
leakey relu "13: \n" /* otherwise is leakey relu */ "movi v0.4s, #0 \n" /* for leakey relu */ "ld1 {v1.4s}, [%[alpha]] \n" /* leakey relu alpha */ "fcmge v2.4s, v8.4s, v0.4s \n" /* vcgeq_f32 */ "fmul v3.4s, v8.4s, v1.4s \n" /* vmulq_f32 */ "fcmge v4.4s, v9.4s, v0.4s \n" /* vcgeq_f32 */ "fmul v5.4s, v9.4s, v1.4s \n" /* vmulq_f32 */ "fcmge v6.4s, v10.4s, v0.4s \n" /* vcgeq_f32 */ "fmul v7.4s, v10.4s, v1.4s \n" /* vmulq_f32 */ "fcmge v12.4s, v11.4s, v0.4s \n" /* vcgeq_f32 */ "fmul v13.4s, v11.4s, v1.4s \n" /* vmulq_f32 */ "bif v8.16b, v3.16b, v2.16b \n" /* choose*/ "bif v9.16b, v5.16b, v4.16b \n" /* choose*/ "bif v10.16b, v7.16b, v6.16b \n" /* choose*/ "bif v11.16b, v13.16b, v12.16b \n" /* choose*/ "20: \n" /* act end */ "st1 {v8.4s}, [%[c_ptr0]], #16\n" /* store r0 */ "st1 {v9.4s}, [%[c_ptr1]], #16\n" /* store r1 */ "st1 {v10.4s}, [%[c_ptr2]], #16\n" /* store r2 */ "st1 {v11.4s}, [%[c_ptr3]], #16\n" /* store r3 */ : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [k] "+r"(k), [tail] "+r"(tail), [c_ptr0] "+r"(c_ptr0), [c_ptr1] "+r"(c_ptr1), [c_ptr2] "+r"(c_ptr2), [c_ptr3] "+r"(c_ptr3) : [bias_ptr] "r"(bias_local), [has_beta] "r"(has_beta), [beta] "r"(beta), [alpha] "r"(alpha), [flag_act] "r"(flag_act) : "cc","memory", "v0","v1","v2","v3","v4","v5","v6","v7", "v8","v9","v10","v11"); // clang-format on if (flag_p_remain && (xb == bblocks - 1)) { for (int i = 0; i < remain; ++i) { *pout0++ = cout0[i]; *pout1++ = cout1[i]; *pout2++ = cout2[i]; *pout3++ = cout3[i]; } } } } } } #else // __aarch64__ /** * \brief gemm with ablock = 6, bblock = 8, output 6x8 * @param A * @param B * @param C * @param M * @param N * @param K * @param threads * @param workspace */ void sgemm_prepacked_6x8(bool is_transB, int M, int N, int K, const float* A_packed, const float* B, int ldb, float beta, float* C, int ldc, const float* bias, bool has_bias, const operators::ActivationParam act_param, ARMContext* ctx) { size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024; auto* workspace = ctx->workspace_data(); int threads = ctx->threads(); auto act_type = act_param.active_type; float alpha[4] = {0.f, 0.f, 0.f, 0.f}; int flag_act = 0x00; // relu: 1, relu6: 2, leakey: 3 if (act_param.has_active) { if (act_type == lite_api::ActivationType::kRelu) { flag_act = 0x01; } else if (act_type == lite_api::ActivationType::kRelu6) { flag_act = 0x02; float local_alpha = act_param.Relu_clipped_coef; alpha[0] = local_alpha; alpha[1] = local_alpha; alpha[2] = local_alpha; alpha[3] = local_alpha; } else if (act_type == lite_api::ActivationType::kLeakyRelu) { flag_act = 0x03; float local_alpha = act_param.Leaky_relu_alpha; alpha[0] = local_alpha; alpha[1] = local_alpha; alpha[2] = local_alpha; alpha[3] = local_alpha; } } //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 int x_block = (l2_cache - (MBLOCK_OTH * K)) / (sizeof(float) * (K + MBLOCK_OTH)); x_block /= NBLOCK; x_block *= NBLOCK; int x_num = (N + (x_block - 1)) / x_block; x_block = (N + x_num - 1) / x_num; x_block = (x_block + NBLOCK - 1) / NBLOCK; x_block *= NBLOCK; x_block = x_block < NBLOCK ? NBLOCK : x_block; int k_pre = ((K + KBLOCK - 1) / KBLOCK) - 1; int tail_pre = (K & (KBLOCK - 1)); if (tail_pre == 0) { tail_pre = KBLOCK; } //! merge tail_pre and flag_act tail_pre = (tail_pre << 2 | flag_act); bool flag_p_remain = false; int remain = 0; int has_beta = fabsf(beta) > 1e-8f ? 1 : 0; //! 
apanel is pre_compute outside gemm for (unsigned int x0 = 0; x0 < N; x0 += x_block) { unsigned int xmax = x0 + x_block; if (xmax > N) { xmax = N; } int bblocks = (xmax - x0 + NBLOCK - 1) / NBLOCK; remain = xmax - x0 - (bblocks - 1) * NBLOCK; if (remain > 0) { flag_p_remain = true; } //! load bpanel auto b_pannel = static_cast(workspace); if (is_transB) { loadb_trans(b_pannel, B, ldb, 0, K, x0, xmax); } else { loadb(b_pannel, B, ldb, 0, K, x0, xmax); } #pragma omp parallel for num_threads(threads) for (unsigned int y = 0; y < M; y += MBLOCK_OTH) { unsigned int ymax = y + MBLOCK_OTH; if (ymax > M) { ymax = M; } float* c_ptr0 = C + y * ldc + x0; float* c_ptr1 = c_ptr0 + ldc; float* c_ptr2 = c_ptr1 + ldc; float* c_ptr3 = c_ptr2 + ldc; float* c_ptr4 = c_ptr3 + ldc; float* c_ptr5 = c_ptr4 + ldc; float* pout0 = c_ptr0; float* pout1 = c_ptr1; float* pout2 = c_ptr2; float* pout3 = c_ptr3; float* pout4 = c_ptr4; float* pout5 = c_ptr5; float bias_local[6] = {0}; if (has_bias) { bias_local[0] = bias[y]; bias_local[1] = bias[y + 1]; bias_local[2] = bias[y + 2]; bias_local[3] = bias[y + 3]; bias_local[4] = bias[y + 4]; bias_local[5] = bias[y + 5]; } float cout0[NBLOCK]; float cout1[NBLOCK]; float cout2[NBLOCK]; float cout3[NBLOCK]; float cout4[NBLOCK]; float cout5[NBLOCK]; const float* a_ptr_l = A_packed + y * K; const float* b_ptr = b_pannel; for (int xb = 0; xb < bblocks; xb++) { if ((y + 5) >= ymax) { switch ((y + 5) - ymax) { case 4: c_ptr1 = cout1; case 3: c_ptr2 = cout2; case 2: c_ptr3 = cout3; case 1: c_ptr4 = cout4; case 0: c_ptr5 = cout5; default: break; } } if (flag_p_remain && (xb == bblocks - 1)) { pout0 = c_ptr0; pout1 = c_ptr1; pout2 = c_ptr2; pout3 = c_ptr3; pout4 = c_ptr4; pout5 = c_ptr5; c_ptr0 = cout0; c_ptr1 = cout1; c_ptr2 = cout2; c_ptr3 = cout3; c_ptr4 = cout4; c_ptr5 = cout5; if (has_beta) { for (int i = 0; i < remain; ++i) { cout0[i] = pout0[i]; cout1[i] = pout1[i]; cout2[i] = pout2[i]; cout3[i] = pout3[i]; cout4[i] = pout4[i]; cout5[i] = pout5[i]; } } } const float* a_ptr = a_ptr_l; int tails = tail_pre; int k = k_pre; // clang-format off asm volatile( // sgemm 6x8 "vld1.32 {d2-d4}, [%[bias_ptr]] @ load bias 6 elements\n" "pld [%[a_ptr]] @ preload a\n" "vdup.i32 q12,d4[0] @ out40=0\n" "pld [%[b_ptr]] @ preload b\n" "vdup.i32 q13,d4[0] @ out41=0\n" "pld [%[a_ptr], #64] @ preload a\n" "vdup.i32 q14,d4[1] @ out50=0\n" "pld [%[b_ptr], #64] @ preload b\n" "vdup.i32 q15,d4[1] @ out51=0\n" "pld [%[a_ptr], #128] @ preload a\n" "vdup.i32 q4, d2[0] @ out00=0\n" "pld [%[b_ptr], #128] @ preload b\n" "vdup.i32 q5, d2[0] @ out01=0\n" "vdup.i32 q6, d2[1] @ out10=0\n" "pld [%[a_ptr], #192] @ preload a\n" "vdup.i32 q7, d2[1] @ out11=0\n" "pld [%[b_ptr], #192] @ preload a\n" "vdup.i32 q8, d3[0] @ out20=0\n" "pld [%[a_ptr], #256] @ preload a\n" "vdup.i32 q9, d3[0] @ out21=0\n" "pld [%[b_ptr], #256] @ preload a\n" "vdup.i32 q10,d3[1] @ out30=0\n" "pld [%[b_ptr], #320] @ preload b\n" "vdup.i32 q11,d3[1] @ out31=0\n" "pld [%[b_ptr], #384] @ preload b\n" "cmp %[beta], #0\n" "beq 11f\n" /* check beta == 0? 
*/ /* process beta */ "vdup.32 q3, %[beta]\n" /* beta to vector */ "vld1.32 {d0-d3}, [%[c_ptr0]]\n" /* load output r0 */ "vmla.f32 q4, q0, q3\n" /* cr00 += beta * c_r00 */ "vmla.f32 q5, q1, q3\n" /* cr01 += beta * c_r01 */ "vld1.32 {d0-d3}, [%[c_ptr1]]\n" /* load output r1 */ "vmla.f32 q6, q0, q3\n" /* cr10 += beta * c_r10 */ "vmla.f32 q7, q1, q3\n" /* cr11 += beta * c_r11 */ "vld1.32 {d0-d3}, [%[c_ptr2]]\n" /* load output r2 */ "vmla.f32 q8, q0, q3\n" /* cr20 += beta * c_r20 */ "vmla.f32 q9, q1, q3\n" /* cr21 += beta * c_r21 */ "vld1.32 {d0-d3}, [%[c_ptr3]]\n" /* load output r3 */ "vmla.f32 q10, q0, q3\n" /* cr30 += beta * c_r30 */ "vmla.f32 q11, q1, q3\n" /* cr31 += beta * c_r31 */ "vld1.32 {d0-d3}, [%[c_ptr4]]\n" /* load output r4 */ "vmla.f32 q12, q0, q3\n" /* cr40 += beta * c_r40 */ "vmla.f32 q13, q1, q3\n" /* cr41 += beta * c_r41 */ "vld1.32 {d0-d3}, [%[c_ptr5]]\n" /* load output r5 */ "vmla.f32 q14, q0, q3\n" /* cr50 += beta * c_r50 */ "vmla.f32 q15, q1, q3\n" /* cr51 += beta * c_r51 */ "11: \n" /* check loop count */ "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a0~a3\n" "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" "cmp %[k], #0 @ check weather k is bigger than " "0\n" "beq 0f @ jump to tail\n" "1: @ main loop for k\n" /* Unroll 0*/ "vld1.32 {d2-d3}, [%[a_ptr] :64]! @ load a4, a5, and next a0, " "a1\n" "vmla.f32 q4, q2, d0[0] @ out0 += b1 * a0\n" "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" "vmla.f32 q6, q2, d0[1] @ out1 += b1 * a1\n" "vmla.f32 q8, q2, d1[0] @ out2 += b1 * a2\n" "vmla.f32 q10, q2, d1[1] @ out3 += b1 * a3\n" "vmla.f32 q12, q2, d2[0] @ out4 += b1 * a4\n" "vmla.f32 q14, q2, d2[1] @ out5 += b1 * a5\n" "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" "vmla.f32 q5, q3, d0[0] @ out6 += b2 * a0\n" "vmla.f32 q7, q3, d0[1] @ out7 += b2 * a1\n" "vmla.f32 q9, q3, d1[0] @ out8 += b2 * a2\n" "vmla.f32 q11, q3, d1[1] @ out9 += b2 * a3\n" "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a2~a5\n" "vmla.f32 q13, q3, d2[0] @ out10 += b2 * a4\n" "vmla.f32 q15, q3, d2[1] @ out11 += b2 * a5\n" "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" /* Unroll 1 */ "vmla.f32 q4, q2, d3[0] @ out0 += b1 * a0\n" "vmla.f32 q6, q2, d3[1] @ out1 += b1 * a1\n" /*"pld [%[a_ptr], #64] @ preload a\n"*/ "vmla.f32 q8, q2, d0[0] @ out2 += b1 * a2\n" "vmla.f32 q10, q2, d0[1] @ out3 += b1 * a3\n" /*"pld [%[b_ptr], #192]\n"*/ "vmla.f32 q12, q2, d1[0] @ out4 += b1 * a4\n" "vmla.f32 q14, q2, d1[1] @ out5 += b1 * a5\n" "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" "vmla.f32 q5, q3, d3[0] @ out6 += b2 * a0\n" "vmla.f32 q7, q3, d3[1] @ out7 += b2 * a1\n" "vld1.32 {d2-d3}, [%[a_ptr] :64]! @ load a0~a3\n" "vmla.f32 q9, q3, d0[0] @ out8 += b2 * a2\n" "vmla.f32 q11, q3, d0[1] @ out9 += b2 * a3\n" "vmla.f32 q13, q3, d1[0] @ out10 += b2 * a4\n" "vmla.f32 q15, q3, d1[1] @ out11 += b2 * a5\n" "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a4, a5, a0, a1\n" /* Unroll 2 */ "vmla.f32 q4, q2, d2[0] @ out0 += b1 * a0\n" "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" "vmla.f32 q6, q2, d2[1] @ out1 += b1 * a1\n" "vmla.f32 q8, q2, d3[0] @ out2 += b1 * a2\n" "vmla.f32 q10, q2, d3[1] @ out3 += b1 * a3\n" /*"pld [%[a_ptr], #240] @ preload\n"*/ "vmla.f32 q12, q2, d0[0] @ out4 += b1 * a4\n" "vmla.f32 q14, q2, d0[1] @ out5 += b1 * a5\n" "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" "vmla.f32 q5, q3, d2[0] @ out6 += b2 * a0\n" "vmla.f32 q7, q3, d2[1] @ out7 += b2 * a1\n" /*"pld [%[b_ptr], #208]\n"*/ "vmla.f32 q9, q3, d3[0] @ out8 += b2 * a2\n" "vmla.f32 q11, q3, d3[1] @ out9 += b2 * a3\n" "vld1.32 {d2-d3}, [%[a_ptr] :64]! 
@ load a2~a5\n" "vmla.f32 q13, q3, d0[0] @ out10 += b2 * a4\n" "vmla.f32 q15, q3, d0[1] @ out11 += b2 * a5\n" "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" /* Unroll 3 */ "vmla.f32 q4, q2, d1[0] @ out0 += b1 * a0\n" "vmla.f32 q6, q2, d1[1] @ out1 += b1 * a1\n" "vmla.f32 q8, q2, d2[0] @ out2 += b1 * a2\n" "vmla.f32 q10, q2, d2[1] @ out3 += b1 * a3\n" "vmla.f32 q12, q2, d3[0] @ out4 += b1 * a4\n" "vmla.f32 q14, q2, d3[1] @ out5 += b1 * a5\n" "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" "vmla.f32 q5, q3, d1[0] @ out6 += b2 * a0\n" "vmla.f32 q7, q3, d1[1] @ out7 += b2 * a1\n" "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a0~a3\n" "vmla.f32 q9, q3, d2[0] @ out8 += b2 * a2\n" "vmla.f32 q11, q3, d2[1] @ out9 += b2 * a3\n" "subs %[k], %[k], #1 @ k--\n" "vmla.f32 q13, q3, d3[0] @ out10 += b2 * a4\n" "vmla.f32 q15, q3, d3[1] @ out11 += b2 * a5\n" "bne 1b @ jump to main loop\n" "0: @ process tail\n" "sub %[tails], %[tails], #4 @ tail--\n" "cmp %[tails], #4 @ cmp with act bits\n" "blt 3f @ jump to tail = 1\n" /* Unroll 0*/ "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" "vmla.f32 q4, q2, d0[0] @ out0 += b1 * a0\n" "vld1.32 {d2-d3}, [%[a_ptr] :64]! @ load a4,5, a0, a1\n" "vmla.f32 q6, q2, d0[1] @ out1 += b1 * a1\n" "vmla.f32 q8, q2, d1[0] @ out2 += b1 * a2\n" "vmla.f32 q10, q2, d1[1] @ out3 += b1 * a3\n" "vmla.f32 q12, q2, d2[0] @ out4 += b1 * a4\n" "sub %[tails], %[tails], #4 @ tail--\n" "vmla.f32 q14, q2, d2[1] @ out5 += b1 * a5\n" "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" "cmp %[tails], #4 @ cmp with act bits\n" "vmla.f32 q5, q3, d0[0] @ out6 += b2 * a0\n" "vmla.f32 q7, q3, d0[1] @ out7 += b2 * a1\n" "vmla.f32 q9, q3, d1[0] @ out8 += b2 * a2\n" "vmla.f32 q11, q3, d1[1] @ out9 += b2 * a3\n" "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a2~a5\n" "vmla.f32 q13, q3, d2[0] @ out10 += b2 * a4\n" "vmla.f32 q15, q3, d2[1] @ out11 += b2 * a5\n" "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" "blt 4f @ jump to tail==2\n" /* Unroll 1*/ "vmla.f32 q4, q2, d3[0] @ out0 += b1 * a0\n" "vmla.f32 q6, q2, d3[1] @ out1 += b1 * a1\n" "sub %[tails], %[tails], #4 @ tail--\n" "vmla.f32 q8, q2, d0[0] @ out2 += b1 * a2\n" "vmla.f32 q10, q2, d0[1] @ out3 += b1 * a3\n" "vmla.f32 q12, q2, d1[0] @ out4 += b1 * a4\n" "vmla.f32 q14, q2, d1[1] @ out5 += b1 * a5\n" "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" "cmp %[tails], #4 @ cmp with act bits\n" "vmla.f32 q5, q3, d3[0] @ out6 += b2 * a0\n" "vmla.f32 q7, q3, d3[1] @ out7 += b2 * a1\n" "vld1.32 {d2-d3}, [%[a_ptr] :64]! @ load a0~a3\n" "vmla.f32 q9, q3, d0[0] @ out8 += b2 * a2\n" "vmla.f32 q11, q3, d0[1] @ out9 += b2 * a3\n" "vmla.f32 q13, q3, d1[0] @ out10 += b2 * a4\n" "vmla.f32 q15, q3, d1[1] @ out11 += b2 * a5\n" "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" "blt 5f @ jump to tail==3\n" /* Unroll 2 */ "sub %[tails], %[tails], #4 @ tail--\n" "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a4,a5, a0,a1\n" "vmla.f32 q4, q2, d2[0] @ out0 += b1 * a0\n" "vmla.f32 q6, q2, d2[1] @ out1 += b1 * a1\n" "vmla.f32 q8, q2, d3[0] @ out2 += b1 * a2\n" "vmla.f32 q10, q2, d3[1] @ out3 += b1 * a3\n" "vmla.f32 q12, q2, d0[0] @ out4 += b1 * a4\n" "vmla.f32 q14, q2, d0[1] @ out5 += b1 * a5\n" "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" "vmla.f32 q5, q3, d2[0] @ out6 += b2 * a0\n" "vmla.f32 q7, q3, d2[1] @ out7 += b2 * a1\n" "vmla.f32 q9, q3, d3[0] @ out8 += b2 * a2\n" "vmla.f32 q11, q3, d3[1] @ out9 += b2 * a3\n" "vld1.32 {d2-d3}, [%[a_ptr] :64]! @ load a2~a5\n" "vmla.f32 q13, q3, d0[0] @ out10 += b2 * a4\n" "vmla.f32 q15, q3, d0[1] @ out11 += b2 * a5\n" "vld1.32 {d6-d7}, [%[b_ptr] :128]! 
@ load b2\n" /* Unroll 3*/ "vmla.f32 q4, q2, d1[0] @ out0 += b1 * a0\n" "vmla.f32 q6, q2, d1[1] @ out1 += b1 * a1\n" "vmla.f32 q8, q2, d2[0] @ out2 += b1 * a2\n" "vmla.f32 q10, q2, d2[1] @ out3 += b1 * a3\n" "vmla.f32 q12, q2, d3[0] @ out4 += b1 * a4\n" "vmla.f32 q14, q2, d3[1] @ out5 += b1 * a5\n" "vmla.f32 q5, q3, d1[0] @ out6 += b2 * a0\n" "vmla.f32 q7, q3, d1[1] @ out7 += b2 * a1\n" "vmla.f32 q9, q3, d2[0] @ out8 += b2 * a2\n" "vmla.f32 q11, q3, d2[1] @ out9 += b2 * a3\n" "vmla.f32 q13, q3, d3[0] @ out10 += b2 * a4\n" "vmla.f32 q15, q3, d3[1] @ out11 += b2 * a5\n" "b 2f\n" /* tails==1 final tail*/ "3: @ tail=1\n" "vmla.f32 q4, q2, d0[0] @ out0 += b1 * a0\n" "vld1.32 {d2}, [%[a_ptr] :64]! @ load a4,a5\n" "vmla.f32 q6, q2, d0[1] @ out1 += b1 * a1\n" "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" "vmla.f32 q8, q2, d1[0] @ out2 += b1 * a2\n" "vmla.f32 q10, q2, d1[1] @ out3 += b1 * a3\n" "vmla.f32 q12, q2, d2[0] @ out4 += b1 * a4\n" "vmla.f32 q14, q2, d2[1] @ out5 += b1 * a5\n" "vmla.f32 q5, q3, d0[0] @ out6 += b2 * a0\n" "vmla.f32 q7, q3, d0[1] @ out7 += b2 * a1\n" "vmla.f32 q9, q3, d1[0] @ out8 += b2 * a2\n" "vmla.f32 q11, q3, d1[1] @ out9 += b2 * a3\n" "vmla.f32 q13, q3, d2[0] @ out10 += b2 * a4\n" "vmla.f32 q15, q3, d2[1] @ out11 += b2 * a5\n" "b 2f @ jump to end\n" /* tails==2 final tail*/ "4: @ tail == 2\n" "vmla.f32 q4, q2, d3[0] @ out0 += b1 * a0\n" "vmla.f32 q6, q2, d3[1] @ out1 += b1 * a1\n" "vmla.f32 q8, q2, d0[0] @ out2 += b1 * a2\n" "vmla.f32 q10, q2, d0[1] @ out3 += b1 * a3\n" "vmla.f32 q12, q2, d1[0] @ out4 += b1 * a4\n" "vmla.f32 q14, q2, d1[1] @ out5 += b1 * a5\n" "vmla.f32 q5, q3, d3[0] @ out6 += b2 * a0\n" "vmla.f32 q7, q3, d3[1] @ out7 += b2 * a1\n" "vmla.f32 q9, q3, d0[0] @ out8 += b2 * a2\n" "vmla.f32 q11, q3, d0[1] @ out9 += b2 * a3\n" "vmla.f32 q13, q3, d1[0] @ out10 += b2 * a4\n" "vmla.f32 q15, q3, d1[1] @ out11 += b2 * a5\n" "b 2f @ jump to end\n" /* tails==3 final tail*/ "5: @ tail=3\n" "vmla.f32 q4, q2, d2[0] @ out0 += b1 * a0\n" "vld1.32 {d0}, [%[a_ptr] :64]! @ load a4,a5\n" "vmla.f32 q6, q2, d2[1] @ out1 += b1 * a1\n" "vmla.f32 q8, q2, d3[0] @ out2 += b1 * a2\n" "vmla.f32 q10, q2, d3[1] @ out3 += b1 * a3\n" "vmla.f32 q12, q2, d0[0] @ out4 += b1 * a4\n" "vmla.f32 q14, q2, d0[1] @ out5 += b1 * a5\n" "vmla.f32 q5, q3, d2[0] @ out6 += b2 * a0\n" "vmla.f32 q7, q3, d2[1] @ out7 += b2 * a1\n" "vmla.f32 q9, q3, d3[0] @ out8 += b2 * a2\n" "vmla.f32 q11, q3, d3[1] @ out9 += b2 * a3\n" "vmla.f32 q13, q3, d0[0] @ out10 += b2 * a4\n" "vmla.f32 q15, q3, d0[1] @ out11 += b2 * a5\n" "2: @ check activation\n" //! relu "cmp %[tails], #1 @ check if has relu\n" "bne 6f @ jump if not relu \n" "vmov.u32 q0, #0 @ for relu\n" "vmax.f32 q4, q4, q0 @ for relu\n" "vmax.f32 q5, q5, q0 @ for relu\n" "vmax.f32 q6, q6, q0 @ for relu\n" "vmax.f32 q7, q7, q0 @ for relu\n" "vmax.f32 q8, q8, q0 @ for relu\n" "vmax.f32 q9, q9, q0 @ for relu\n" "vmax.f32 q10, q10, q0 @ for relu\n" "vmax.f32 q11, q11, q0 @ for relu\n" "vmax.f32 q12, q12, q0 @ for relu\n" "vmax.f32 q13, q13, q0 @ for relu\n" "vmax.f32 q14, q14, q0 @ for relu\n" "vmax.f32 q15, q15, q0 @ for relu\n" "b 10f @ relu end\n" "6: @ no relu \n" "cmp %[tails], #0 @ check no act\n" "beq 10f @ no act end \n" //! 
relu6 "cmp %[tails], #2 @ check if has relu6\n" "bne 7f @ jump if no relu6 \n" "vmov.u32 q0, #0 @ for relu6\n" "vmax.f32 q4, q4, q0 @ for relu6\n" "vmax.f32 q5, q5, q0 @ for relu6\n" "vmax.f32 q6, q6, q0 @ for relu6\n" "vmax.f32 q7, q7, q0 @ for relu6\n" "vmax.f32 q8, q8, q0 @ for relu6\n" "vmax.f32 q9, q9, q0 @ for relu6\n" "vld1.f32 {d2-d3}, [%[alpha]] @ load relu6 alpha\n" "vmax.f32 q10, q10, q0 @ for relu6\n" "vmax.f32 q11, q11, q0 @ for relu6\n" "vmax.f32 q12, q12, q0 @ for relu6\n" "vmax.f32 q13, q13, q0 @ for relu6\n" "vmax.f32 q14, q14, q0 @ for relu6\n" "vmax.f32 q15, q15, q0 @ for relu6\n" "vmin.f32 q4, q4, q1 @ for relu6\n" "vmin.f32 q5, q5, q1 @ for relu6\n" "vmin.f32 q6, q6, q1 @ for relu6\n" "vmin.f32 q7, q7, q1 @ for relu6\n" "vmin.f32 q8, q8, q1 @ for relu6\n" "vmin.f32 q9, q9, q1 @ for relu6\n" "vmin.f32 q10, q10, q1 @ for relu6\n" "vmin.f32 q11, q11, q1 @ for relu6\n" "vmin.f32 q12, q12, q1 @ for relu6\n" "vmin.f32 q13, q13, q1 @ for relu6\n" "vmin.f32 q14, q14, q1 @ for relu6\n" "vmin.f32 q15, q15, q1 @ for relu6\n" "b 10f @ relu6 end \n" //! leakey relu "7: @ otherwise is leakey relu\n" "vmov.u32 q0, #0 @ for leakey relu \n" "vld1.f32 {d2-d3}, [%[alpha]] @ load leakey relu alpha\n" "vcge.f32 q2, q4, q0 @ vcgeq_u32 \n" "vmul.f32 q3, q4, q1 @ vmulq_f32 \n" "vbif q4, q3, q2 @ choose \n" "vcge.f32 q2, q5, q0 @ vcgeq_u32 \n" "vmul.f32 q3, q5, q1 @ vmulq_f32 \n" "vbif q5, q3, q2 @ choose \n" "vcge.f32 q2, q6, q0 @ vcgeq_u32 \n" "vmul.f32 q3, q6, q1 @ vmulq_f32 \n" "vbif q6, q3, q2 @ choose \n" "vcge.f32 q2, q7, q0 @ vcgeq_u32 \n" "vmul.f32 q3, q7, q1 @ vmulq_f32 \n" "vbif q7, q3, q2 @ choose \n" "vcge.f32 q2, q8, q0 @ vcgeq_u32 \n" "vmul.f32 q3, q8, q1 @ vmulq_f32 \n" "vbif q8, q3, q2 @ choose \n" "vcge.f32 q2, q9, q0 @ vcgeq_u32 \n" "vmul.f32 q3, q9, q1 @ vmulq_f32 \n" "vbif q9, q3, q2 @ choose \n" "vcge.f32 q2, q10, q0 @ vcgeq_u32 \n" "vmul.f32 q3, q10, q1 @ vmulq_f32 \n" "vbif q10, q3, q2 @ choose \n" "vcge.f32 q2, q11, q0 @ vcgeq_u32 \n" "vmul.f32 q3, q11, q1 @ vmulq_f32 \n" "vbif q11, q3, q2 @ choose \n" "vcge.f32 q2, q12, q0 @ vcgeq_u32 \n" "vmul.f32 q3, q12, q1 @ vmulq_f32 \n" "vbif q12, q3, q2 @ choose \n" "vcge.f32 q2, q13, q0 @ vcgeq_u32 \n" "vmul.f32 q3, q13, q1 @ vmulq_f32 \n" "vbif q13, q3, q2 @ choose \n" "vcge.f32 q2, q14, q0 @ vcgeq_u32 \n" "vmul.f32 q3, q14, q1 @ vmulq_f32 \n" "vbif q14, q3, q2 @ choose \n" "vcge.f32 q2, q15, q0 @ vcgeq_u32 \n" "vmul.f32 q3, q15, q1 @ vmulq_f32 \n" "vbif q15, q3, q2 @ choose \n" "10: @ act end \n" "vst1.32 {d8-d11}, [%[c_ptr0]]! @ store r0\n" "vst1.32 {d12-d15}, [%[c_ptr1]]! @ store r1\n" "vst1.32 {d16-d19}, [%[c_ptr2]]! @ store r2\n" "vst1.32 {d20-d23}, [%[c_ptr3]]! @ store r3\n" "vst1.32 {d24-d27}, [%[c_ptr4]]! @ store r4\n" "vst1.32 {d28-d31}, [%[c_ptr5]]! 
@ store r5\n" : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr0] "+r"(c_ptr0), [c_ptr1] "+r"(c_ptr1), [c_ptr2] "+r"(c_ptr2), [c_ptr3] "+r"(c_ptr3), [c_ptr4] "+r"(c_ptr4), [c_ptr5] "+r"(c_ptr5), [k] "+r"(k), [tails] "+r"(tails) : [bias_ptr] "r"(bias_local), [beta] "r"(beta), [alpha] "r" (alpha) : "q0","q1","q2","q3","q4", "q5","q6","q7","q8","q9","q10","q11", "q12","q13","q14","q15","cc","memory"); // clang-format on if (flag_p_remain && (xb == bblocks - 1)) { for (int i = 0; i < remain; ++i) { *pout0++ = cout0[i]; *pout1++ = cout1[i]; *pout2++ = cout2[i]; *pout3++ = cout3[i]; *pout4++ = cout4[i]; *pout5++ = cout5[i]; } } } } } } /** * \brief gemm with ablock = 6, bblock = 8, output 6x8, optimize for a53 arch * @param A * @param B * @param C * @param M * @param N * @param K * @param threads * @param workspace */ void sgemm_prepacked_6x8_a53(bool is_transB, int M, int N, int K, const float* A_packed, const float* B, int ldb, float* C, int ldc, const float* bias, bool has_bias, int is_relu, ARMContext* ctx) { size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024; auto* workspace = ctx->workspace_data(); int threads = ctx->threads(); //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 int x_block = (l2_cache - (MBLOCK_OTH * K)) / (sizeof(float) * (K + MBLOCK_OTH)); x_block /= NBLOCK; x_block *= NBLOCK; int x_num = (N + (x_block - 1)) / x_block; x_block = (N + x_num - 1) / x_num; x_block = (x_block + NBLOCK - 1) / NBLOCK; x_block *= NBLOCK; x_block = x_block < NBLOCK ? NBLOCK : x_block; int k_pre = ((K + KBLOCK - 1) / KBLOCK) - 1; int tail_pre = (K & (KBLOCK - 1)); if (tail_pre == 0) { tail_pre = KBLOCK; } //! merge tail_pre and flag_act tail_pre = (tail_pre << 2 | is_relu); bool flag_p_remain = false; int remain = 0; //! apanel is pre_compute outside gemm for (unsigned int x0 = 0; x0 < N; x0 += x_block) { unsigned int xmax = x0 + x_block; if (xmax > N) { xmax = N; } int bblocks = (xmax - x0 + NBLOCK - 1) / NBLOCK; remain = xmax - x0 - (bblocks - 1) * NBLOCK; if (remain > 0) { flag_p_remain = true; } //! 
load bpanel auto b_pannel = static_cast(workspace); if (is_transB) { loadb_trans(b_pannel, B, ldb, 0, K, x0, xmax); } else { loadb(b_pannel, B, ldb, 0, K, x0, xmax); } #pragma omp parallel for num_threads(threads) for (unsigned int y = 0; y < M; y += MBLOCK_OTH) { unsigned int ymax = y + MBLOCK_OTH; if (ymax > M) { ymax = M; } float* c_ptr0 = C + y * ldc + x0; float* c_ptr1 = c_ptr0 + ldc; float* c_ptr2 = c_ptr1 + ldc; float* c_ptr3 = c_ptr2 + ldc; float* c_ptr4 = c_ptr3 + ldc; float* c_ptr5 = c_ptr4 + ldc; float* pout0 = c_ptr0; float* pout1 = c_ptr1; float* pout2 = c_ptr2; float* pout3 = c_ptr3; float* pout4 = c_ptr4; float* pout5 = c_ptr5; float bias_local[6] = {0}; if (has_bias) { bias_local[0] = bias[y]; bias_local[1] = bias[y + 1]; bias_local[2] = bias[y + 2]; bias_local[3] = bias[y + 3]; bias_local[4] = bias[y + 4]; bias_local[5] = bias[y + 5]; } float cout0[NBLOCK]; float cout1[NBLOCK]; float cout2[NBLOCK]; float cout3[NBLOCK]; float cout4[NBLOCK]; float cout5[NBLOCK]; const float* a_ptr_l = A_packed + y * K; const float* b_ptr = b_pannel; for (int xb = 0; xb < bblocks; xb++) { if ((y + 5) >= ymax) { switch ((y + 5) - ymax) { case 4: c_ptr1 = cout1; case 3: c_ptr2 = cout2; case 2: c_ptr3 = cout3; case 1: c_ptr4 = cout4; case 0: c_ptr5 = cout5; default: break; } } if (flag_p_remain && (xb == bblocks - 1)) { pout0 = c_ptr0; pout1 = c_ptr1; pout2 = c_ptr2; pout3 = c_ptr3; pout4 = c_ptr4; pout5 = c_ptr5; c_ptr0 = cout0; c_ptr1 = cout1; c_ptr2 = cout2; c_ptr3 = cout3; c_ptr4 = cout4; c_ptr5 = cout5; } const float* a_ptr = a_ptr_l; int tails = tail_pre; int k = k_pre; // clang-format off asm volatile( // sgemm 6x8 for a53 "vld1.32 {d2-d3}, [%[bias_ptr]] \n" /* load bias0-3 to d2,d3 */ "vdup.i32 q4, d2[0] \n" /* set out00 to bias0 */ "vld1.32 {d0-d1}, [%[a_ptr] :64] \n" /* load a00-a30 to d0,d1 */ "vdup.i32 q5, d2[0] \n" /* set out01 to bias0 */ "vld1.32 {d4-d5}, [%[b_ptr] :128] \n" /* load b00-b03 to d4,d5 */ "vdup.i32 q6, d2[1] \n" /* set out10 to bias1 */ "ldr r0, [%[a_ptr], #0x10] \n" /* load a40 to r0 */ "vdup.i32 q7, d2[1] \n" /* set out11 to bias1 */ "ldr r1, [%[a_ptr], #0x14] \n" /* load a50 to r1 */ "vdup.i32 q8, d3[0] \n" /* set out20 to bias2 */ "vldr d6, [%[bias_ptr], #0x10] \n" /* load bias 4,5 to d6 */ "pld [%[a_ptr], #0x40] \n" /* pre load apanel */ "vdup.i32 q9, d3[0] \n" /* set out21 to bias2 */ "pld [%[b_ptr], #0x40] \n" /* pre load bpanel */ "vdup.i32 q10, d3[1] \n" /* set out30 to bias3 */ "pld [%[a_ptr], #0x80] \n" /* pre load apanel */ "vdup.i32 q11, d3[1] \n" /* set out31 to bias3 */ "pld [%[b_ptr], #0x80] \n" /* pre load bpanel */ "vdup.i32 q12, d6[0] \n" /* set out40 to bias4 */ "vdup.i32 q13, d6[0] \n" /* set out41 to bias4 */ "pld [%[a_ptr], #0xC0] \n" /* pre load apanel */ "vdup.i32 q14, d6[1] \n" /* set out50 to bias5 */ "pld [%[b_ptr], #0XC0] \n" /* pre load bpanel */ "vdup.i32 q15, d6[1] \n" /* set out51 to bias5 */ "cmp %[k], #0 \n" /* check k loop */ "beq 6f \n" /* k==0, branch to 6 */ "1:\n" /* Unroll 0 */ "vldr d6, [%[b_ptr], #0x10] \n" /* load b04, b05 to d6 */ "vmov d2, r0, r1 \n" /* mov a40, a50 to d2 */ "vmla.f32 q4, q2, d0[0] \n" /* out00 += a00 * b0l */ "ldr r0, [%[b_ptr], #0x18] \n" /* load b06 to r0 */ "vmla.f32 q6, q2, d0[1] \n" /* out10 += a10 * b0l */ "ldr r1, [%[b_ptr], #0x1C] \n" /* load b07 to r1 */ "vmla.f32 q8, q2, d1[0] \n" /* out20 += a20 * b0l */ "vldr d3, [%[a_ptr], #0x18] \n" /* load a01, a11 to d3 */ "vmov d7, r0, r1 \n" /* mov b06, b07 to d7 */ "vmla.f32 q10, q2, d1[1] \n" /* out30 += a30 * b0l */ "pld [%[a_ptr], #0x100] \n" /* pre 
load apanel */ "vmla.f32 q12, q2, d2[0] \n" /* out40 += a40 * b0l */ "vmla.f32 q14, q2, d2[1] \n" /* out50 += a50 * b0l */ "vldr d4, [%[b_ptr], #0x20] \n" /* load b10, b11 to d4 */ "vmla.f32 q5, q3, d0[0] \n" /* out01 += a00 * b0h */ "ldr r0, [%[b_ptr], #0x28] \n" /* load b12 to r0 */ "vmla.f32 q7, q3, d0[1] \n" /* out11 += a10 * b0h */ "ldr r1, [%[b_ptr], #0x2C] \n" /* load b13 to r1 */ "vmla.f32 q9, q3, d1[0] \n" /* out21 += a20 * b0h */ "vldr d0, [%[a_ptr], #0x20] \n" /* load a21, a31 to d0 */ "vmov d5, r0, r1 \n" /* mov b12, b13 to d5 */ "vmla.f32 q11, q3, d1[1] \n" /* out31 += a30 * b0h */ "ldr r0, [%[a_ptr], #0x28] \n" /* load a41 to r0 */ "vmla.f32 q13, q3, d2[0] \n" /* out41 += a40 * b0h */ "ldr r1, [%[a_ptr], #0x2C] \n" /* load a51 to r1 */ "vmla.f32 q15, q3, d2[1] \n" /* out51 += a50 * b0h */ /* Unroll 1 */ "vldr d6, [%[b_ptr], #0x30] \n" /* load b14, b15 to d6 */ "vmov d1, r0, r1 \n" /* mov a41, a51 to d1 */ "vmla.f32 q4, q2, d3[0] \n" /* out00 += a01 * b1l */ "ldr r0, [%[b_ptr], #0x38] \n" /* load b16 to r0 */ "vmla.f32 q6, q2, d3[1] \n" /* out10 += a11 * b1l */ "ldr r1, [%[b_ptr], #0x3C] \n" /* load b17 to r1 */ "vmla.f32 q8, q2, d0[0] \n" /* out20 += a21 * b1l */ "vldr d2, [%[a_ptr], #0x30] \n" /* load a02, a12 to d0 */ "vmov d7, r0, r1 \n" /* mov b16, b17 to d7 */ "vmla.f32 q10, q2, d0[1] \n" /* out30 += a31 * b1l */ "pld [%[b_ptr], #0x100] \n" /* pre load apanel */ "vmla.f32 q12, q2, d1[0] \n" /* out40 += a41 * b1l */ "vmla.f32 q14, q2, d1[1] \n" /* out50 += a51 * b1l */ "vldr d4, [%[b_ptr], #0x40] \n" /* load b20, b21 to d4 */ "vmla.f32 q5, q3, d3[0] \n" /* out01 += a01 * b1h */ "ldr r0, [%[b_ptr], #0x48] \n" /* load b22 to r0 */ "vmla.f32 q7, q3, d3[1] \n" /* out11 += a11 * b1h */ "ldr r1, [%[b_ptr], #0x4C] \n" /* load b23 to r1 */ "vmla.f32 q9, q3, d0[0] \n" /* out21 += a21 * b1h */ "vldr d3, [%[a_ptr], #0x38] \n" /* load a22, a32 to d3 */ "vmov d5, r0, r1 \n" /* mov b22, b23 to d5 */ "vmla.f32 q11, q3, d0[1] \n" /* out31 += a31 * b1h */ "ldr r0, [%[a_ptr], #0x40] \n" /* load a42 to r0 */ "vmla.f32 q13, q3, d1[0] \n" /* out41 += a41 * b1h */ "ldr r1, [%[a_ptr], #0x44] \n" /* load a52 to r1 */ "vmla.f32 q15, q3, d1[1] \n" /* out51 += a51 * b1h */ /* Unroll 2 */ "vldr d6, [%[b_ptr], #0x50] \n" /* load b24, b25 to d6 */ "vmov d0, r0, r1 \n" /* mov a42, a52 to d0 */ "vmla.f32 q4, q2, d2[0] \n" /* out00 += a02 * b2l */ "ldr r0, [%[b_ptr], #0x58] \n" /* load b26 to r0 */ "vmla.f32 q6, q2, d2[1] \n" /* out10 += a12 * b2l */ "ldr r1, [%[b_ptr], #0x5C] \n" /* load b27 to r1 */ "vmla.f32 q8, q2, d3[0] \n" /* out20 += a22 * b2l */ "vldr d1, [%[a_ptr], #0x48] \n" /* load a03, a13 to d1 */ "vmov d7, r0, r1 \n" /* mov b26, b27 to d7 */ "vmla.f32 q10, q2, d3[1] \n" /* out30 += a32 * b2l */ "pld [%[a_ptr], #0x140] \n" /* pre load apanel */ "vmla.f32 q12, q2, d0[0] \n" /* out40 += a42 * b2l */ "vmla.f32 q14, q2, d0[1] \n" /* out50 += a52 * b2l */ "vldr d4, [%[b_ptr], #0x60] \n" /* load b30, b31 to d4 */ "vmla.f32 q5, q3, d2[0] \n" /* out01 += a02 * b2h */ "ldr r0, [%[b_ptr], #0x68] \n" /* load b32 to r0 */ "vmla.f32 q7, q3, d2[1] \n" /* out11 += a12 * b2h */ "ldr r1, [%[b_ptr], #0x6C] \n" /* load b33 to r1 */ "vmla.f32 q9, q3, d3[0] \n" /* out21 += a22 * b2h */ "vldr d2, [%[a_ptr], #0x50] \n" /* load a23, a33 to d2 */ "vmov d5, r0, r1 \n" /* mov b32, b33 to d5 */ "vmla.f32 q11, q3, d3[1] \n" /* out31 += a32 * b2h */ "ldr r0, [%[a_ptr], #0x58] \n" /* load a43 to r0 */ "vmla.f32 q13, q3, d0[0] \n" /* out41 += a42 * b2h */ "ldr r1, [%[a_ptr], #0x5C] \n" /* load a53 to r1 */ "vmla.f32 q15, 
q3, d0[1] \n" /* out51 += a52 * b2h */ "add %[a_ptr], %[a_ptr], #0x60 \n" /* aptr += 96 */ /* Unroll 3 */ "vldr d6, [%[b_ptr], #0x70] \n" /* load b34, b35 to d6 */ "vmov d3, r0, r1 \n" /* mov a43, a53 to d3 */ "vmla.f32 q4, q2, d1[0] \n" /* out00 += a03 * b3l */ "ldr r0, [%[b_ptr], #0x78] \n" /* load b36 to r0 */ "vmla.f32 q6, q2, d1[1] \n" /* out10 += a13 * b3l */ "ldr r1, [%[b_ptr], #0x7C] \n" /* load b37 to r1 */ "vmla.f32 q8, q2, d2[0] \n" /* out20 += a23 * b3l */ "add %[b_ptr], %[b_ptr], #0x80 \n" /* bptr += 108 */ "vldr d0, [%[a_ptr], #0x00] \n" /* load a00, a10 to d0 */ "vmov d7, r0, r1 \n" /* mov b36, b37 to d7 */ "vmla.f32 q10, q2, d2[1] \n" /* out30 += a33 * b3l */ "pld [%[b_ptr], #0xC0] \n" /* pre load bpanel */ "vmla.f32 q12, q2, d3[0] \n" /* out40 += a43 * b3l */ "vmla.f32 q14, q2, d3[1] \n" /* out50 += a53 * b3l */ "vldr d4, [%[b_ptr], #0x00] \n" /* load b00, b01 to d4 */ "vmla.f32 q5, q3, d1[0] \n" /* out01 += a03 * b3h */ "ldr r0, [%[b_ptr], #0x08] \n" /* load b02 to r0 */ "vmla.f32 q7, q3, d1[1] \n" /* out11 += a13 * b3h */ "ldr r1, [%[b_ptr], #0x0C] \n" /* load b03 to r1 */ "vmla.f32 q9, q3, d2[0] \n" /* out21 += a23 * b3h */ "subs %[k], %[k], #1 \n" /* loop k -= 1 */ "vldr d1, [%[a_ptr], #0x08] \n" /* load a20, a30 to d1 */ "vmov d5, r0, r1 \n" /* mov b02, b03 to d5 */ "vmla.f32 q11, q3, d2[1] \n" /* out31 += a33 * b3h */ "ldr r0, [%[a_ptr], #0x10] \n" /* load a40 to r0 */ "vmla.f32 q13, q3, d3[0] \n" /* out41 += a43 * b3h */ "ldr r1, [%[a_ptr], #0x14] \n" /* load a50 to r1 */ "vmla.f32 q15, q3, d3[1] \n" /* out51 += a53 * b3h */ "bne 1b \n" /* branch to k loop */ "6:\n" "sub %[tails], %[tails], #4 \n" /* tail -= 4 */ "cmp %[tails], #4 \n" /* cmp tail with 4 */ "blt 3f \n" /* branch to tail == 1 */ /* Tail Unroll 0 */ "vmov d2, r0, r1 \n" /* mov b02, b03 to d2 */ "add %[a_ptr], %[a_ptr], #0x18 \n" /* aptr += 24 */ "vmla.f32 q4, q2, d0[0] \n" /* out00 += a00 * b0l */ "vld1.32 {d3}, [%[a_ptr] :64]! \n" /* load a01, a11 to d3 */ "vmla.f32 q6, q2, d0[1] \n" /* out10 += a10 * b0l */ "add %[b_ptr], %[b_ptr], #0x10 \n" /* bptr += 16 */ "vmla.f32 q8, q2, d1[0] \n" /* out20 += a20 * b0l */ "vld1.32 {d6-d7}, [%[b_ptr] :128]! \n" /* load b04-b07 to d6,d7 */ "vmla.f32 q10, q2, d1[1] \n" /* out30 += a30 * b0l */ "vmla.f32 q12, q2, d2[0] \n" /* out40 += a40 * b0l */ "sub %[tails], %[tails], #4 \n" /* tail -= 4 */ "vmla.f32 q14, q2, d2[1] \n" /* out50 += a50 * b0l */ "vld1.32 {d4-d5}, [%[b_ptr] :128]! \n" /* load b10-b13 to d4,d5 */ "vmla.f32 q5, q3, d0[0] \n" /* out01 += a00 * b0h */ "vmla.f32 q7, q3, d0[1] \n" /* out11 += a10 * b0h */ "vmla.f32 q9, q3, d1[0] \n" /* out21 += a20 * b0h */ "vmla.f32 q11, q3, d1[1] \n" /* out31 += a30 * b0h */ "vld1.32 {d0-d1}, [%[a_ptr] :64]! \n" /* load a21-a51 to d0,d1 */ "cmp %[tails], #4 \n" /* cmp tail with 4 */ "vmla.f32 q13, q3, d2[0] \n" /* out41 += a40 * b0h */ "vmla.f32 q15, q3, d2[1] \n" /* out51 += a50 * b0h */ "vld1.32 {d6-d7}, [%[b_ptr] :128]! \n" /* load b14-b17 to d6,d7 */ "blt 4f \n" /* branch to tail == 2 */ /* Tail Unroll 1 */ "vmla.f32 q4, q2, d3[0] \n" /* out00 += a01 * b1l */ "vmla.f32 q6, q2, d3[1] \n" /* out10 += a11 * b1l */ "sub %[tails], %[tails], #4 \n" /* tail -= 4 */ "vmla.f32 q8, q2, d0[0] \n" /* out20 += a21 * b1l */ "vmla.f32 q10, q2, d0[1] \n" /* out30 += a31 * b1l */ "vmla.f32 q12, q2, d1[0] \n" /* out40 += a41 * b1l */ "vmla.f32 q14, q2, d1[1] \n" /* out50 += a51 * b1l */ "vld1.32 {d4-d5}, [%[b_ptr] :128]! 
\n" /* load b20-b23 to d4,d5 */ "vmla.f32 q5, q3, d3[0] \n" /* out01 += a01 * b1h */ "vmla.f32 q7, q3, d3[1] \n" /* out11 += a11 * b1h */ "cmp %[tails], #4 \n" /* cmp tail with 4 */ "vld1.32 {d2-d3}, [%[a_ptr] :64]! \n" /* load a02-a32 to d2,d3 */ "vmla.f32 q9, q3, d0[0] \n" /* out21 += a21 * b1h */ "vmla.f32 q11, q3, d0[1] \n" /* out31 += a31 * b1h */ "vmla.f32 q13, q3, d1[0] \n" /* out41 += a41 * b1h */ "vmla.f32 q15, q3, d1[1] \n" /* out51 += a51 * b1h */ "vld1.32 {d6-d7}, [%[b_ptr] :128]! \n" /* load b24-b27 to d6,d7 */ "blt 5f \n" /* branch to tail == 3 */ /* Tail Unroll 2 */ "sub %[tails], %[tails], #4 \n" /* tail -= 4 */ "vld1.32 {d0-d1}, [%[a_ptr] :64]! \n" /* a42a52a03a13 to d0,d1 */ "vmla.f32 q4, q2, d2[0] \n" /* out00 += a02 * b2l */ "vmla.f32 q6, q2, d2[1] \n" /* out10 += a12 * b2l */ "vmla.f32 q8, q2, d3[0] \n" /* out20 += a22 * b2l */ "vmla.f32 q10, q2, d3[1] \n" /* out30 += a32 * b2l */ "vmla.f32 q12, q2, d0[0] \n" /* out40 += a42 * b2l */ "vmla.f32 q14, q2, d0[1] \n" /* out50 += a52 * b2l */ "vld1.32 {d4-d5}, [%[b_ptr] :128]! \n" /* load b30-b33 to d4,d5 */ "vmla.f32 q5, q3, d2[0] \n" /* out01 += a02 * b2h */ "vmla.f32 q7, q3, d2[1] \n" /* out11 += a12 * b2h */ "vmla.f32 q9, q3, d3[0] \n" /* out21 += a22 * b2h */ "vmla.f32 q11, q3, d3[1] \n" /* out31 += a32 * b2h */ "vld1.32 {d2-d3}, [%[a_ptr] :64]! \n" /* load a23-a53 to d2,d3 */ "vmla.f32 q13, q3, d0[0] \n" /* out41 += a42 * b2h */ "vmla.f32 q15, q3, d0[1] \n" /* out51 += a52 * b2h */ "vld1.32 {d6-d7}, [%[b_ptr] :128]! \n" /* load b34-b37 to d6,d7 */ /* Tail Unroll 3 */ "vmla.f32 q4, q2, d1[0] \n" /* out00 += a03 * b3l */ "vmla.f32 q5, q3, d1[0] \n" /* out01 += a03 * b3h */ "vmla.f32 q6, q2, d1[1] \n" /* out10 += a13 * b3l */ "vmla.f32 q7, q3, d1[1] \n" /* out11 += a13 * b3h */ "vmla.f32 q8, q2, d2[0] \n" /* out20 += a23 * b3l */ "vmla.f32 q9, q3, d2[0] \n" /* out21 += a23 * b3h */ "vmla.f32 q10, q2, d2[1] \n" /* out30 += a33 * b3l */ "vmla.f32 q11, q3, d2[1] \n" /* out31 += a33 * b3h */ "vmla.f32 q12, q2, d3[0] \n" /* out40 += a43 * b3l */ "vmla.f32 q13, q3, d3[0] \n" /* out41 += a43 * b3h */ "vmla.f32 q14, q2, d3[1] \n" /* out50 += a53 * b3l */ "vmla.f32 q15, q3, d3[1] \n" /* out51 += a53 * b3h */ "b 2f \n" /* branch to check relu */ /* tails==1 final tail */ "3:\n" "vmov d2, r0, r1 \n" /* mov b02, b03 to d2 */ "add %[b_ptr], %[b_ptr], #0x10 \n" /* bptr += 16 */ "vmla.f32 q4, q2, d0[0] \n" /* out00 += a00 * b0l */ "add %[a_ptr], %[a_ptr], #0x18 \n" /* aptr += 24 */ "vmla.f32 q6, q2, d0[1] \n" /* out10 += a10 * b0l */ "vld1.32 {d6-d7}, [%[b_ptr] :128]! 
\n" /* load b04-b07 to d6,d7 */ "vmla.f32 q8, q2, d1[0] \n" /* out20 += a20 * b0l */ "vmla.f32 q10, q2, d1[1] \n" /* out30 += a30 * b0l */ "vmla.f32 q12, q2, d2[0] \n" /* out40 += a40 * b0l */ "vmla.f32 q14, q2, d2[1] \n" /* out50 += a50 * b0l */ "vmla.f32 q5, q3, d0[0] \n" /* out01 += a00 * b0h */ "vmla.f32 q7, q3, d0[1] \n" /* out11 += a10 * b0h */ "vmla.f32 q9, q3, d1[0] \n" /* out21 += a20 * b0h */ "vmla.f32 q11, q3, d1[1] \n" /* out31 += a30 * b0h */ "vmla.f32 q13, q3, d2[0] \n" /* out41 += a40 * b0h */ "vmla.f32 q15, q3, d2[1] \n" /* out51 += a50 * b0h */ "b 2f \n" /* branch to check relu */ /* tails==2 final tail */ "4:\n" "vmla.f32 q4, q2, d3[0] \n" /* out00 += a01 * b1l */ "vmla.f32 q5, q3, d3[0] \n" /* out01 += a01 * b1h */ "vmla.f32 q6, q2, d3[1] \n" /* out10 += a11 * b1l */ "vmla.f32 q7, q3, d3[1] \n" /* out11 += a11 * b1h */ "vmla.f32 q8, q2, d0[0] \n" /* out20 += a21 * b1l */ "vmla.f32 q9, q3, d0[0] \n" /* out21 += a21 * b1h */ "vmla.f32 q10, q2, d0[1] \n" /* out30 += a31 * b1l */ "vmla.f32 q11, q3, d0[1] \n" /* out31 += a31 * b1h */ "vmla.f32 q12, q2, d1[0] \n" /* out40 += a41 * b1l */ "vmla.f32 q13, q3, d1[0] \n" /* out41 += a41 * b1h */ "vmla.f32 q14, q2, d1[1] \n" /* out50 += a51 * b1l */ "vmla.f32 q15, q3, d1[1] \n" /* out51 += a51 * b1h */ "b 2f \n" /* branch to check relu */ /* tails==3 final tail */ "5:\n" "vmla.f32 q4, q2, d2[0] \n" /* out00 += a02 * b2l */ "vld1.32 {d0}, [%[a_ptr] :64]! \n" /* load a42, a52 to d0 */ "vmla.f32 q6, q2, d2[1] \n" /* out10 += a12 * b2l */ "vmla.f32 q8, q2, d3[0] \n" /* out20 += a22 * b2l */ "vmla.f32 q5, q3, d2[0] \n" /* out01 += a02 * b2h */ "vmla.f32 q7, q3, d2[1] \n" /* out11 += a12 * b2h */ "vmla.f32 q9, q3, d3[0] \n" /* out21 += a22 * b2h */ "vmla.f32 q10, q2, d3[1] \n" /* out30 += a32 * b2l */ "vmla.f32 q11, q3, d3[1] \n" /* out31 += a32 * b2h */ "vmla.f32 q12, q2, d0[0] \n" /* out40 += a42 * b2l */ "vmla.f32 q13, q3, d0[0] \n" /* out41 += a42 * b2h */ "vmla.f32 q14, q2, d0[1] \n" /* out50 += a52 * b2l */ "vmla.f32 q15, q3, d0[1] \n" /* out51 += a52 * b2h */ /* relu */ "2:\n" "cmp %[tails], #1 \n" /* cmp tail is relu */ "bne 0f \n" /* no relu branch to end */ "vmov.i32 q0, #0 \n" /* mov 0.f to q0 */ "vmax.f32 q4, q4, q0 \n" /* out00 relu */ "vmax.f32 q5, q5, q0 \n" /* out01 relu */ "vmax.f32 q6, q6, q0 \n" /* out10 relu */ "vmax.f32 q7, q7, q0 \n" /* out11 relu */ "vmax.f32 q8, q8, q0 \n" /* out20 relu */ "vmax.f32 q9, q9, q0 \n" /* out21 relu */ "vmax.f32 q10, q10, q0 \n" /* out30 relu */ "vmax.f32 q11, q11, q0 \n" /* out31 relu */ "vmax.f32 q12, q12, q0 \n" /* out40 relu */ "vmax.f32 q13, q13, q0 \n" /* out41 relu */ "vmax.f32 q14, q14, q0 \n" /* out50 relu */ "vmax.f32 q15, q15, q0 \n" /* out51 relu */ "0:\n" "vst1.32 {d8-d11}, [%[c_ptr0]]! \n" /* store out0 to cptr0 */ "vst1.32 {d12-d15}, [%[c_ptr1]]! \n" /* store out1 to cptr1 */ "vst1.32 {d16-d19}, [%[c_ptr2]]! \n" /* store out2 to cptr2 */ "vst1.32 {d20-d23}, [%[c_ptr3]]! \n" /* store out3 to cptr3 */ "vst1.32 {d24-d27}, [%[c_ptr4]]! \n" /* store out4 to cptr4 */ "vst1.32 {d28-d31}, [%[c_ptr5]]! 
\n" /* store out5 to cptr5 */ : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr0] "+r"(c_ptr0), [c_ptr1] "+r"(c_ptr1), [c_ptr2] "+r"(c_ptr2), [c_ptr3] "+r"(c_ptr3), [c_ptr4] "+r"(c_ptr4), [c_ptr5] "+r"(c_ptr5), [k] "+r"(k), [tails] "+r"(tails) : [bias_ptr] "r"(bias_local) : "r0", "r1", "q0","q1","q2","q3","q4", "q5","q6","q7","q8","q9","q10","q11", "q12","q13","q14","q15","cc","memory"); // clang-format on if (flag_p_remain && (xb == bblocks - 1)) { for (int i = 0; i < remain; ++i) { *pout0++ = cout0[i]; *pout1++ = cout1[i]; *pout2++ = cout2[i]; *pout3++ = cout3[i]; *pout4++ = cout4[i]; *pout5++ = cout5[i]; } } } } } } void sgemm_prepacked_4x8(bool is_transB, int M, int N, int K, const float* A_packed, const float* B, int ldb, float beta, float* C, int ldc, const float* bias, bool has_bias, const operators::ActivationParam act_param, ARMContext* ctx) { size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024; auto* workspace = ctx->workspace_data(); int threads = ctx->threads(); auto act_type = act_param.active_type; float alpha[4] = {0.f, 0.f, 0.f, 0.f}; int flag_act = 0x00; // relu: 1, relu6: 2, leakey: 3 if (act_param.has_active) { if (act_type == lite_api::ActivationType::kRelu) { flag_act = 0x01; } else if (act_type == lite_api::ActivationType::kRelu6) { flag_act = 0x02; float local_alpha = act_param.Relu_clipped_coef; alpha[0] = local_alpha; alpha[1] = local_alpha; alpha[2] = local_alpha; alpha[3] = local_alpha; } else if (act_type == lite_api::ActivationType::kLeakyRelu) { flag_act = 0x03; float local_alpha = act_param.Leaky_relu_alpha; alpha[0] = local_alpha; alpha[1] = local_alpha; alpha[2] = local_alpha; alpha[3] = local_alpha; } } //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 int x_block = (l2_cache - (MBLOCK_A73 * K)) / (sizeof(float) * (K + MBLOCK_A73)); x_block /= NBLOCK; x_block *= NBLOCK; int x_num = (N + (x_block - 1)) / x_block; x_block = (N + x_num - 1) / x_num; x_block = (x_block + NBLOCK - 1) / NBLOCK; x_block *= NBLOCK; x_block = x_block < NBLOCK ? NBLOCK : x_block; int k_pre = ((K + KBLOCK - 1) / KBLOCK) - 1; int tail_pre = (K & (KBLOCK - 1)); if (tail_pre == 0) { tail_pre = KBLOCK; } bool flag_p_remain = false; int remain = 0; int has_beta = fabsf(beta) > 1e-8f ? 1 : 0; //! apanel is pre_compute outside gemm for (unsigned int x0 = 0; x0 < N; x0 += x_block) { unsigned int xmax = x0 + x_block; if (xmax > N) { xmax = N; } int bblocks = (xmax - x0 + NBLOCK - 1) / NBLOCK; remain = xmax - x0 - (bblocks - 1) * NBLOCK; if (remain > 0) { flag_p_remain = true; } //! 
    //! load bpanel
    auto b_pannel = static_cast<float*>(workspace);
    if (is_transB) {
      loadb_trans(b_pannel, B, ldb, 0, K, x0, xmax);
    } else {
      loadb(b_pannel, B, ldb, 0, K, x0, xmax);
    }
#pragma omp parallel for num_threads(threads)
    for (unsigned int y = 0; y < M; y += MBLOCK_A73) {
      unsigned int ymax = y + MBLOCK_A73;
      if (ymax > M) {
        ymax = M;
      }
      float cout0[NBLOCK];
      float cout1[NBLOCK];
      float cout2[NBLOCK];
      float cout3[NBLOCK];
      float bias_local[4] = {0};
      if (has_bias) {
        bias_local[0] = bias[y];
        bias_local[1] = bias[y + 1];
        bias_local[2] = bias[y + 2];
        bias_local[3] = bias[y + 3];
      }
      float* c_ptr0 = C + y * ldc + x0;
      float* c_ptr1 = c_ptr0 + ldc;
      float* c_ptr2 = c_ptr1 + ldc;
      float* c_ptr3 = c_ptr2 + ldc;
      float* pout0 = c_ptr0;
      float* pout1 = c_ptr1;
      float* pout2 = c_ptr2;
      float* pout3 = c_ptr3;
      const float* a_ptr_l = A_packed + y * K;
      const float* b_ptr = b_pannel;
      for (int xb = 0; xb < bblocks; xb++) {
        if ((y + 3) >= ymax) {
          switch ((y + 3) - ymax) {
            case 2:
              c_ptr1 = cout1;
            case 1:
              c_ptr2 = cout1;
            case 0:
              c_ptr3 = cout1;
            default:
              break;
          }
        }
        if (flag_p_remain && (xb == bblocks - 1)) {
          pout0 = c_ptr0;
          pout1 = c_ptr1;
          pout2 = c_ptr2;
          pout3 = c_ptr3;
          c_ptr0 = cout0;
          c_ptr1 = cout1;
          c_ptr2 = cout2;
          c_ptr3 = cout3;
          if (has_beta) {
            for (int i = 0; i < remain; ++i) {
              cout0[i] = pout0[i];
              cout1[i] = pout1[i];
              cout2[i] = pout2[i];
              cout3[i] = pout3[i];
            }
          }
        }
        const float* a_ptr = a_ptr_l;
        int tails = tail_pre;
        int k = k_pre;
        // clang-format off
        asm volatile(
            "vld1.32 {d4-d5}, [%[bias_ptr]] @ load bias\n"
            "vdup.32 q8, d4[0] @ add bias to out00\n"
            "pld [%[a_ptr]] @ preload a, 64byte\n"
            "vdup.32 q9, d4[0] @ add bias to out01\n"
            "pld [%[b_ptr]] @ preload b\n"
            "vdup.32 q10, d4[1] @ add bias to out10\n"
            "pld [%[a_ptr], #64] @ preload a\n"
            "vdup.32 q11, d4[1] @ add bias to out11\n"
            "vdup.32 q12, d5[0] @ add bias to out20\n"
            "pld [%[b_ptr], #64] @ preload b\n"
            "vdup.32 q13, d5[0] @ add bias to out21\n"
            "pld [%[a_ptr], #128] @ preload a\n"
            "vdup.32 q14, d5[1] @ add bias to out30\n"
            "pld [%[b_ptr], #128] @ preload b\n"
            "vdup.32 q15, d5[1] @ add bias to out31\n"
            "pld [%[b_ptr], #192] @ preload b\n"
            "cmp %[beta], #0\n"
            "beq 11f\n" /* check beta == 0? */
            /* process beta */
            "vdup.32 q4, %[beta]\n" /* beta to vector */
            "vld1.32 {d0-d3}, [%[c_ptr0]]\n" /* load output r0 */
            "vld1.32 {d4-d7}, [%[c_ptr1]]\n" /* load output r1 */
            "vmla.f32 q8, q0, q4\n" /* cr00 += beta * c_r00 */
            "vmla.f32 q9, q1, q4\n" /* cr01 += beta * c_r01 */
            "vld1.32 {d0-d3}, [%[c_ptr2]]\n" /* load output r2 */
            "vmla.f32 q10, q2, q4\n" /* cr10 += beta * c_r10 */
            "vmla.f32 q11, q3, q4\n" /* cr11 += beta * c_r11 */
            "vld1.32 {d4-d7}, [%[c_ptr3]]\n" /* load output r3 */
            "vmla.f32 q12, q0, q4\n" /* cr20 += beta * c_r20 */
            "vmla.f32 q13, q1, q4\n" /* cr21 += beta * c_r21 */
            "vmla.f32 q14, q2, q4\n" /* cr30 += beta * c_r30 */
            "vmla.f32 q15, q3, q4\n" /* cr31 += beta * c_r31 */
            "11: \n" /* check loop count */
            "vld1.32 {d0-d3}, [%[a_ptr] :128]! @ load a0~a3\n"
            "vld1.32 {d8-d11}, [%[b_ptr] :128]! @ load b1\n"
            "cmp %[k], #0 @ check whether k is bigger than "
            "0\n"
            "beq 0f @ jump to tail\n"
            "1: @ main loop for k\n"
            /* Unroll 0*/
            "vld1.32 {d12-d15}, [%[b_ptr] :128]! @ load next b1, b2\n"
            "vmla.f32 q8, q4, d0[0] @ out0 += b1 * a0\n"
            "vld1.32 {d4-d7}, [%[a_ptr] :128]!
@ load next 2xa0~a3\n" "vmla.f32 q10, q4, d0[1] @ out1 += b1 * a1\n" "vmla.f32 q12, q4, d1[0] @ out2 += b1 * a2\n" "vmla.f32 q14, q4, d1[1] @ out3 += b1 * a3\n" "vmla.f32 q9, q5, d0[0] @ out4 += b2 * a0\n" "vmla.f32 q11, q5, d0[1] @ out5 += b2 * a1\n" "vmla.f32 q13, q5, d1[0] @ out6 += b2 * a2\n" "vmla.f32 q15, q5, d1[1] @ out7 += b2 * a3\n" "vld1.32 {d8-d11}, [%[b_ptr] :128]! @ load next b1, b2\n" /* Unroll 1 */ "vmla.f32 q8, q6, d2[0] @ out0 += b1 * a0\n" "pld [%[b_ptr], #64] @ preload b\n" "vmla.f32 q10, q6, d2[1] @ out1 += b1 * a1\n" "vmla.f32 q12, q6, d3[0] @ out2 += b1 * a2\n" "vmla.f32 q14, q6, d3[1] @ out3 += b1 * a3\n" "vmla.f32 q9, q7, d2[0] @ out6 += b2 * a0\n" "vmla.f32 q11, q7, d2[1] @ out7 += b2 * a1\n" "vmla.f32 q13, q7, d3[0] @ out8 += b2 * a2\n" "vmla.f32 q15, q7, d3[1] @ out9 += b2 * a3\n" "vld1.32 {d12-d15}, [%[b_ptr] :128]! @ load next b1,b2\n" /* Unroll 2 */ "vmla.f32 q8, q4, d4[0] @ out0 += b1 * a0\n" "vld1.32 {d0-d3}, [%[a_ptr] :128]! @ load next a0~a3\n" "vmla.f32 q10, q4, d4[1] @ out1 += b1 * a1\n" "vmla.f32 q12, q4, d5[0] @ out2 += b1 * a2\n" "vmla.f32 q14, q4, d5[1] @ out3 += b1 * a3\n" "vmla.f32 q9, q5, d4[0] @ out4 += b2 * a0\n" "vmla.f32 q11, q5, d4[1] @ out5 += b2 * a1\n" "vmla.f32 q13, q5, d5[0] @ out6 += b2 * a2\n" "vmla.f32 q15, q5, d5[1] @ out7 += b2 * a3\n" "vld1.32 {d8-d11}, [%[b_ptr] :128]! @ load next b1, b2\n" /* Unroll 3 */ "vmla.f32 q8, q6, d6[0] @ out0 += b1 * a0\n" "pld [%[a_ptr], #64] @ preload a\n" "vmla.f32 q10, q6, d6[1] @ out1 += b1 * a1\n" "vmla.f32 q12, q6, d7[0] @ out2 += b1 * a2\n" "vmla.f32 q14, q6, d7[1] @ out3 += b1 * a3\n" "vmla.f32 q9, q7, d6[0] @ out4 += b2 * a0\n" "vmla.f32 q11, q7, d6[1] @ out5 += b2 * a1\n" "vmla.f32 q13, q7, d7[0] @ out6 += b2 * a2\n" "vmla.f32 q15, q7, d7[1] @ out7 += b2 * a3\n" "subs %[k], %[k], #1 @ k--\n" "bne 1b @ jump to main loop\n" "0: @ process tail\n" "subs %[tails], %[tails], #1 @ tail--\n" "beq 3f @ jump to tail = 1\n" /* Unroll 0*/ "vld1.32 {d12-d15}, [%[b_ptr] :128]! @ load next b1, b2\n" "vmla.f32 q8, q4, d0[0] @ out0 += b1 * a0\n" "vmla.f32 q10, q4, d0[1] @ out1 += b1 * a1\n" "subs %[tails], %[tails], #1 @ tail--\n" "vmla.f32 q12, q4, d1[0] @ out2 += b1 * a2\n" "vmla.f32 q14, q4, d1[1] @ out3 += b1 * a3\n" "vmla.f32 q9, q5, d0[0] @ out4 += b2 * a0\n" "vmla.f32 q11, q5, d0[1] @ out5 += b2 * a1\n" "vmla.f32 q13, q5, d1[0] @ out6 += b2 * a2\n" "vmla.f32 q15, q5, d1[1] @ out7 += b2 * a3\n" "beq 4f @ jump to tail==2\n" /* Unroll 1 */ "vld1.32 {d8-d11}, [%[b_ptr] :128]! @ load next b1, b2\n" "vmla.f32 q8, q6, d2[0] @ out0 += b1 * a0\n" "vld1.32 {d4-d7}, [%[a_ptr] :128]! @ load next 2xa0~a3\n" "vmla.f32 q10, q6, d2[1] @ out1 += b1 * a1\n" "subs %[tails], %[tails], #1 @ tail--\n" "vmla.f32 q12, q6, d3[0] @ out2 += b1 * a2\n" "vmla.f32 q14, q6, d3[1] @ out3 += b1 * a3\n" "vmla.f32 q9, q7, d2[0] @ out6 += b2 * a0\n" "vmla.f32 q11, q7, d2[1] @ out7 += b2 * a1\n" "vmla.f32 q13, q7, d3[0] @ out8 += b2 * a2\n" "vmla.f32 q15, q7, d3[1] @ out9 += b2 * a3\n" "beq 5f @ jump to tail==3\n" /* Unroll 2 */ "vld1.32 {d12-d15}, [%[b_ptr] :128]! 
@ load next b1,b2\n" "vmla.f32 q8, q4, d4[0] @ out0 += b1 * a0\n" "vmla.f32 q10, q4, d4[1] @ out1 += b1 * a1\n" "vmla.f32 q12, q4, d5[0] @ out2 += b1 * a2\n" "vmla.f32 q14, q4, d5[1] @ out3 += b1 * a3\n" "vmla.f32 q9, q5, d4[0] @ out4 += b2 * a0\n" "vmla.f32 q11, q5, d4[1] @ out5 += b2 * a1\n" "vmla.f32 q13, q5, d5[0] @ out6 += b2 * a2\n" "vmla.f32 q15, q5, d5[1] @ out7 += b2 * a3\n" /* Unroll 3 */ "vmla.f32 q8, q6, d6[0] @ out0 += b1 * a0\n" "vmla.f32 q10, q6, d6[1] @ out1 += b1 * a1\n" "vmla.f32 q12, q6, d7[0] @ out2 += b1 * a2\n" "vmla.f32 q14, q6, d7[1] @ out3 += b1 * a3\n" "vmla.f32 q9, q7, d6[0] @ out4 += b2 * a0\n" "vmla.f32 q11, q7, d6[1] @ out5 += b2 * a1\n" "vmla.f32 q13, q7, d7[0] @ out6 += b2 * a2\n" "vmla.f32 q15, q7, d7[1] @ out7 += b2 * a3\n" "b 2f\n" /* tails==1 final tail */ "3: @ tail=1\n" "vmla.f32 q8, q4, d0[0] @ out0 += b1 * a0\n" "vmla.f32 q10, q4, d0[1] @ out1 += b1 * a1\n" "vmla.f32 q12, q4, d1[0] @ out2 += b1 * a2\n" "vmla.f32 q14, q4, d1[1] @ out3 += b1 * a3\n" "vmla.f32 q9, q5, d0[0] @ out4 += b2 * a0\n" "vmla.f32 q11, q5, d0[1] @ out5 += b2 * a1\n" "vmla.f32 q13, q5, d1[0] @ out6 += b2 * a2\n" "vmla.f32 q15, q5, d1[1] @ out7 += b2 * a3\n" /*aptr - 16 */ "sub %[a_ptr], %[a_ptr], #16 @ tail--\n" "b 2f @ jump to end\n" /* tails==2 final tail*/ "4: @ tail == 2\n" "vmla.f32 q8, q6, d2[0] @ out0 += b1 * a0\n" "vmla.f32 q10, q6, d2[1] @ out1 += b1 * a1\n" "vmla.f32 q12, q6, d3[0] @ out2 += b1 * a2\n" "vmla.f32 q14, q6, d3[1] @ out3 += b1 * a3\n" "vmla.f32 q9, q7, d2[0] @ out4 += b2 * a0\n" "vmla.f32 q11, q7, d2[1] @ out5 += b2 * a1\n" "vmla.f32 q13, q7, d3[0] @ out6 += b2 * a2\n" "vmla.f32 q15, q7, d3[1] @ out7 += b2 * a3\n" "b 2f @ jump to end\n" /* tails==3 final tail*/ "5: @ tail=3\n" "vmla.f32 q8, q4, d4[0] @ out0 += b1 * a0\n" "vmla.f32 q10, q4, d4[1] @ out1 += b1 * a1\n" "vmla.f32 q12, q4, d5[0] @ out2 += b1 * a2\n" "vmla.f32 q14, q4, d5[1] @ out3 += b1 * a3\n" "vmla.f32 q9, q5, d4[0] @ out4 += b2 * a0\n" "vmla.f32 q11, q5, d4[1] @ out5 += b2 * a1\n" "vmla.f32 q13, q5, d5[0] @ out6 += b2 * a2\n" "vmla.f32 q15, q5, d5[1] @ out7 += b2 * a3\n" /*aptr - 16*/ "sub %[a_ptr], %[a_ptr], #16 @ tail--\n" "2: @ check relu\n" //! relu "cmp %[flag_act], #1 @ check if has relu\n" "bne 6f @ jump if not relu \n" "vmov.u32 q0, #0 @ for relu\n" "vmax.f32 q8, q8, q0 @ for relu\n" "vmax.f32 q9, q9, q0 @ for relu\n" "vmax.f32 q10, q10, q0 @ for relu\n" "vmax.f32 q11, q11, q0 @ for relu\n" "vmax.f32 q12, q12, q0 @ for relu\n" "vmax.f32 q13, q13, q0 @ for relu\n" "vmax.f32 q14, q14, q0 @ for relu\n" "vmax.f32 q15, q15, q0 @ for relu\n" "b 10f @ relu end\n" "6: @ no relu \n" "cmp %[flag_act], #0 @ check no act\n" "beq 10f @ no act end \n" //! relu6 "cmp %[flag_act], #2 @ check if has relu6\n" "bne 7f @ jump if no relu6 \n" "vmov.u32 q0, #0 @ for relu6\n" "vld1.f32 {d2-d3}, [%[alpha]] @ load relu6 alpha\n" "vmax.f32 q8, q8, q0 @ for relu6\n" "vmax.f32 q9, q9, q0 @ for relu6\n" "vmax.f32 q10, q10, q0 @ for relu6\n" "vmax.f32 q11, q11, q0 @ for relu6\n" "vmax.f32 q12, q12, q0 @ for relu6\n" "vmax.f32 q13, q13, q0 @ for relu6\n" "vmax.f32 q14, q14, q0 @ for relu6\n" "vmax.f32 q15, q15, q0 @ for relu6\n" "vmin.f32 q8, q8, q1 @ for relu6\n" "vmin.f32 q9, q9, q1 @ for relu6\n" "vmin.f32 q10, q10, q1 @ for relu6\n" "vmin.f32 q11, q11, q1 @ for relu6\n" "vmin.f32 q12, q12, q1 @ for relu6\n" "vmin.f32 q13, q13, q1 @ for relu6\n" "vmin.f32 q14, q14, q1 @ for relu6\n" "vmin.f32 q15, q15, q1 @ for relu6\n" "b 10f @ relu6 end \n" //! 
leakey relu "7: @ otherwise is leakey relu\n" "vmov.u32 q0, #0 @ for leakey relu \n" "vld1.f32 {d2-d3}, [%[alpha]] @ load leakey relu alpha\n" "vcge.f32 q2, q8, q0 @ vcgeq_u32 \n" "vmul.f32 q3, q8, q1 @ vmulq_f32 \n" "vbif q8, q3, q2 @ choose \n" "vcge.f32 q2, q9, q0 @ vcgeq_u32 \n" "vmul.f32 q3, q9, q1 @ vmulq_f32 \n" "vbif q9, q3, q2 @ choose \n" "vcge.f32 q2, q10, q0 @ vcgeq_u32 \n" "vmul.f32 q3, q10, q1 @ vmulq_f32 \n" "vbif q10, q3, q2 @ choose \n" "vcge.f32 q2, q11, q0 @ vcgeq_u32 \n" "vmul.f32 q3, q11, q1 @ vmulq_f32 \n" "vbif q11, q3, q2 @ choose \n" "vcge.f32 q2, q12, q0 @ vcgeq_u32 \n" "vmul.f32 q3, q12, q1 @ vmulq_f32 \n" "vbif q12, q3, q2 @ choose \n" "vcge.f32 q2, q13, q0 @ vcgeq_u32 \n" "vmul.f32 q3, q13, q1 @ vmulq_f32 \n" "vbif q13, q3, q2 @ choose \n" "vcge.f32 q2, q14, q0 @ vcgeq_u32 \n" "vmul.f32 q3, q14, q1 @ vmulq_f32 \n" "vbif q14, q3, q2 @ choose \n" "vcge.f32 q2, q15, q0 @ vcgeq_u32 \n" "vmul.f32 q3, q15, q1 @ vmulq_f32 \n" "vbif q15, q3, q2 @ choose \n" "10: @ act end \n" "vst1.32 {d16-d19}, [%[c_ptr0]]! @ store r0\n" "vst1.32 {d20-d23}, [%[c_ptr1]]! @ store r1\n" "vst1.32 {d24-d27}, [%[c_ptr2]]! @ store r2\n" "vst1.32 {d28-d31}, [%[c_ptr3]]! @ store r3\n" : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr0] "+r"(c_ptr0), [c_ptr1] "+r"(c_ptr1), [c_ptr2] "+r"(c_ptr2), [c_ptr3] "+r"(c_ptr3), [k] "+r"(k), [tails] "+r"(tails) : [bias_ptr] "r"(bias_local), [beta] "r"(beta), [alpha] "r"(alpha), [flag_act] "r"(flag_act) : "q0","q1","q2","q3", "q4","q5","q6","q7","q8","q9","q10", "q11","q12","q13","q14","q15","cc","memory"); // clang-format on if (flag_p_remain && (xb == bblocks - 1)) { for (int i = 0; i < remain; ++i) { *pout0++ = cout0[i]; *pout1++ = cout1[i]; *pout2++ = cout2[i]; *pout3++ = cout3[i]; } } } } } } #endif // __aarch64__ } // namespace math } // namespace arm } // namespace lite } // namespace paddle