wjd2002 / ncnn
Compare revisions: e8645e9117cd926530c405b103b7afb984c7173b...55709708e998f21962763a40db1099630ef36458
Commits (5)

a9a7be0e  c_api: expose Mat border processing api (#4855)
          Mek101 <mek101-dev.inv@slmail.me>, 2023-07-15T19:13:18+08:00
          https://gitcode.net/wjd2002/ncnn/-/commit/a9a7be0e0a1938d6fca8c51c410f5867fdefb297

411a098d  Expose layer_to_index in c-api (#4860)
          Mek101 <mek101-dev.inv@slmail.me>, 2023-07-16T22:08:01+08:00
          https://gitcode.net/wjd2002/ncnn/-/commit/411a098d5e009a1ef5ab2791b63b6f92ddd9a05c

9f29a173  c_api return null on null layer (#4865)
          Mek101 <mek101-dev.inv@slmail.me>, 2023-07-18T13:03:29+08:00
          https://gitcode.net/wjd2002/ncnn/-/commit/9f29a1737c07ad797c9ecf62140bdc966cd18a5f

2303b77a  Update how-to-build.md (#4872)
          ฅ'ω'ฅ <1152383857@qq.com>, 2023-07-21T19:35:36+08:00
          https://gitcode.net/wjd2002/ncnn/-/commit/2303b77ac17ac880860252152f26f8d058abc1ee

55709708  x86 optimization for convolution int8 packed unified elempack (#4861)
          nihui <nihuini@tencent.com>, 2023-07-22T22:01:37+08:00
          https://gitcode.net/wjd2002/ncnn/-/commit/55709708e998f21962763a40db1099630ef36458
Showing 21 changed files with 5647 additions and 463 deletions (+5647, -463)
.github/workflows/linux-aarch64-cpu-gcc.yml       +2     -2
docs/how-to-build/how-to-build.md                 +4     -2
src/c_api.cpp                                     +43    -2
src/c_api.h                                       +11    -0
src/layer/arm/convolution_3x3_winograd.h          +6     -6
src/layer/arm/convolution_3x3_winograd_bf16s.h    +6     -6
src/layer/x86/convolution_int8.h                  +0     -82
src/layer/x86/convolution_pack1to4_int8.h         +0     -89
src/layer/x86/convolution_pack8to1_int8.h         +0     -96
src/layer/x86/convolution_pack8to4_int8.h         +0     -130
src/layer/x86/convolution_packed_int8.h           +5386  -0
src/layer/x86/convolution_x86.cpp                 +10    -48
src/layer/x86/convolution_x86_avx2.cpp            +12    -0
src/layer/x86/convolution_x86_avx512vnni.cpp      +7     -0
src/layer/x86/convolution_x86_avxvnni.cpp         +7     -0
src/layer/x86/convolution_x86_xop.cpp             +7     -0
src/layer/x86/x86_usability.h                     +54    -0
tests/test_convolution.cpp                        +4     -0
tests/test_convolution_1.cpp                      +4     -0
tests/test_convolution_2.cpp                      +4     -0
tests/test_convolution_3.cpp                      +80    -0
.github/workflows/linux-aarch64-cpu-gcc.yml  (view file @ 55709708)

@@ -155,7 +155,7 @@ jobs:
         uses: actions/cache@v3
         with:
           path: qemu-install
-          key: qemu-aarch64-install-20220502-2
+          key: qemu-aarch64-install-20230717
     - name: install-qemu-build-deps
       if: steps.cache-qemu.outputs.cache-hit != 'true'
       run: |
@@ -167,7 +167,7 @@ jobs:
         with:
           repository: qemu/qemu
           path: qemu
-          ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65
+          ref: ed8ad9728a9c0eec34db9dff61dfa2f1dd625637
     - name: qemu
       if: steps.cache-qemu.outputs.cache-hit != 'true'
       run: |
docs/how-to-build/how-to-build.md  (view file @ 55709708)

@@ -147,14 +147,16 @@ Download and Install Visual Studio Community 2017 from https://visualstudio.micr
 Start the command prompt: `Start → Programs → Visual Studio 2017 → Visual Studio Tools → x64 Native Tools Command Prompt for VS 2017`
+> You can also search `x64 Native Tools Command Prompt for VS 2017` directly.

 Download protobuf-3.11.2 from https://github.com/google/protobuf/archive/v3.11.2.zip

 Build protobuf library:

 ```shell
 cd <protobuf-root-dir>
-mkdir build
-cd build
+mkdir protobuf_build
+cd protobuf_build
 cmake -A x64 -DCMAKE_INSTALL_PREFIX=%cd%/install -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
 cmake --build . --config Release -j 2
 cmake --build . --config Release --target install
 ```
src/c_api.cpp  (view file @ 55709708)

@@ -1028,8 +1028,14 @@ ncnn_layer_t ncnn_layer_create()
 ncnn_layer_t ncnn_layer_create_by_typeindex(int typeindex)
 {
+    void* pthis = (void*)(ncnn::create_layer(typeindex));
+    if (!pthis)
+    {
+        return 0;
+    }
+
     ncnn_layer_t layer = (ncnn_layer_t)malloc(sizeof(__ncnn_layer_t));
-    layer->pthis = (void*)(ncnn::create_layer(typeindex));
+    layer->pthis = pthis;
     layer->load_param = __ncnn_layer_load_param;
     layer->load_model = __ncnn_layer_load_model;
     layer->create_pipeline = __ncnn_layer_create_pipeline;
@@ -1044,8 +1050,14 @@ ncnn_layer_t ncnn_layer_create_by_typeindex(int typeindex)
 #if NCNN_STRING
 ncnn_layer_t ncnn_layer_create_by_type(const char* type)
 {
+    void* pthis = (void*)(ncnn::create_layer(type));
+    if (!pthis)
+    {
+        return 0;
+    }
+
     ncnn_layer_t layer = (ncnn_layer_t)malloc(sizeof(__ncnn_layer_t));
-    layer->pthis = (void*)(ncnn::create_layer(type));
+    layer->pthis = pthis;
     layer->load_param = __ncnn_layer_load_param;
     layer->load_model = __ncnn_layer_load_model;
     layer->create_pipeline = __ncnn_layer_create_pipeline;
@@ -1056,6 +1068,11 @@ ncnn_layer_t ncnn_layer_create_by_type(const char* type)
     layer->forward_inplace_n = __ncnn_layer_forward_inplace_n;
     return layer;
 }
+
+int ncnn_layer_type_to_index(const char* type)
+{
+    return ncnn::layer_to_index(type);
+}
 #endif /* NCNN_STRING */

 void ncnn_layer_destroy(ncnn_layer_t layer)
@@ -1417,6 +1434,30 @@ int ncnn_extractor_extract_index(ncnn_extractor_t ex, int index, ncnn_mat_t* mat
     return ret;
 }

+void ncnn_copy_make_border(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int type, float v, const ncnn_option_t opt)
+{
+    const Option _opt = opt ? *((const Option*)opt) : Option();
+    copy_make_border(*(const Mat*)src, *(Mat*)dst, top, bottom, left, right, type, v, _opt);
+}
+
+void ncnn_copy_make_border_3d(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int front, int behind, int type, float v, const ncnn_option_t opt)
+{
+    const Option _opt = opt ? *((const Option*)opt) : Option();
+    copy_make_border_3d(*(const Mat*)src, *(Mat*)dst, top, bottom, left, right, front, behind, type, v, _opt);
+}
+
+void ncnn_copy_cut_border(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, const ncnn_option_t opt)
+{
+    const Option _opt = opt ? *((const Option*)opt) : Option();
+    copy_cut_border(*(const Mat*)src, *(Mat*)dst, top, bottom, left, right, _opt);
+}
+
+void ncnn_copy_cut_border_3d(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int front, int behind, const ncnn_option_t opt)
+{
+    const Option _opt = opt ? *((const Option*)opt) : Option();
+    copy_cut_border_3d(*(const Mat*)src, *(Mat*)dst, top, bottom, left, right, front, behind, _opt);
+}
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
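Taken together with #4860 and #4865, layer creation through the C API now fails cleanly: ncnn_layer_create_by_type / ncnn_layer_create_by_typeindex return 0 when ncnn::create_layer cannot produce the layer, and ncnn_layer_type_to_index exposes the string-to-index lookup. A minimal usage sketch of the changed entry points; the "Convolution" type string is only an example, and the negative-return convention for an unknown type is assumed from ncnn::layer_to_index:

```cpp
#include <stdio.h>
#include "c_api.h"

int create_conv_layer_example(void)
{
    /* new in this diff: resolve a layer type string to its registered index */
    int typeindex = ncnn_layer_type_to_index("Convolution");
    if (typeindex < 0) /* assumed: lookup failure reports a negative index */
    {
        fprintf(stderr, "unknown layer type\n");
        return -1;
    }

    /* now returns 0 (NULL) instead of a half-initialized handle on failure */
    ncnn_layer_t layer = ncnn_layer_create_by_typeindex(typeindex);
    if (!layer)
    {
        fprintf(stderr, "layer creation failed\n");
        return -1;
    }

    ncnn_layer_destroy(layer);
    return 0;
}
```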
src/c_api.h  (view file @ 55709708)

@@ -210,6 +210,7 @@ NCNN_EXPORT ncnn_layer_t ncnn_layer_create();
 NCNN_EXPORT ncnn_layer_t ncnn_layer_create_by_typeindex(int typeindex);
 #if NCNN_STRING
 NCNN_EXPORT ncnn_layer_t ncnn_layer_create_by_type(const char* type);
+NCNN_EXPORT int ncnn_layer_type_to_index(const char* type);
 #endif /* NCNN_STRING */
 NCNN_EXPORT void ncnn_layer_destroy(ncnn_layer_t layer);
@@ -327,6 +328,16 @@ NCNN_EXPORT int ncnn_extractor_extract(ncnn_extractor_t ex, const char* name, nc
 NCNN_EXPORT int ncnn_extractor_input_index(ncnn_extractor_t ex, int index, const ncnn_mat_t mat);
 NCNN_EXPORT int ncnn_extractor_extract_index(ncnn_extractor_t ex, int index, ncnn_mat_t* mat);

+/* mat process api */
+#define NCNN_BORDER_CONSTANT 0
+#define NCNN_BORDER_REPLICATE 1
+#define NCNN_BORDER_REFLECT 2
+#define NCNN_BORDER_TRANSPARENT -233
+NCNN_EXPORT void ncnn_copy_make_border(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int type, float v, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_copy_make_border_3d(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int front, int behind, int type, float v, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_copy_cut_border(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_copy_cut_border_3d(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int front, int behind, const ncnn_option_t opt);
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
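The new declarations are the C-side mirror of ncnn's C++ copy_make_border / copy_cut_border helpers. A minimal sketch of padding a Mat through the new C API; it assumes, as in the C++ helpers, that the padded output is allocated into the destination handle, so an empty handle from ncnn_mat_create() serves as the out-parameter:

```cpp
#include "c_api.h"

/* pad src by 2 pixels on every side with constant zeros (illustrative use of the new API) */
static ncnn_mat_t pad2(const ncnn_mat_t src)
{
    ncnn_option_t opt = ncnn_option_create();
    ncnn_mat_t dst = ncnn_mat_create(); /* assumed to be (re)allocated by ncnn_copy_make_border */

    ncnn_copy_make_border(src, dst, 2, 2, 2, 2, NCNN_BORDER_CONSTANT, 0.f, opt);

    ncnn_option_destroy(opt);
    return dst; /* caller releases with ncnn_mat_destroy */
}
```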
src/layer/arm/convolution_3x3_winograd.h  (view file @ 55709708)

@@ -6302,9 +6302,9 @@ static inline void conv3x3s1_winograd43_transform_input_tile(const Mat& bottom_b
                     float32x4x2_t _t01 = vzipq_f32(_t0, _t1);

                     _r0 = vget_low_f32(_t01.val[0]);
-                    if (tj * 2 + 1 < w) _r1 = vget_high_f32(_t01.val[0]);
-                    if (tj * 2 + 2 < w) _r2 = vget_low_f32(_t01.val[1]);
-                    if (tj * 2 + 3 < w) _r3 = vget_high_f32(_t01.val[1]);
+                    if (tj * 4 + 1 < w) _r1 = vget_high_f32(_t01.val[0]);
+                    if (tj * 4 + 2 < w) _r2 = vget_low_f32(_t01.val[1]);
+                    if (tj * 4 + 3 < w) _r3 = vget_high_f32(_t01.val[1]);
                     if (tj * 4 + 4 < w)
                     {
                         float tmp[2] = {r0[4], r1[4]};
@@ -8081,9 +8081,9 @@ static inline void conv3x3s1_winograd63_transform_input_tile(const Mat& bottom_b
                     float32x4x2_t _t01 = vzipq_f32(_t0, _t1);

                     _r0 = vget_low_f32(_t01.val[0]);
-                    if (tj * 2 + 1 < w) _r1 = vget_high_f32(_t01.val[0]);
-                    if (tj * 2 + 2 < w) _r2 = vget_low_f32(_t01.val[1]);
-                    if (tj * 2 + 3 < w) _r3 = vget_high_f32(_t01.val[1]);
+                    if (tj * 6 + 1 < w) _r1 = vget_high_f32(_t01.val[0]);
+                    if (tj * 6 + 2 < w) _r2 = vget_low_f32(_t01.val[1]);
+                    if (tj * 6 + 3 < w) _r3 = vget_high_f32(_t01.val[1]);
                     if (tj * 6 + 4 < w)
                     {
                         _t0 = vld1q_f32(r0 + 4);
src/layer/arm/convolution_3x3_winograd_bf16s.h  (view file @ 55709708)

@@ -1540,9 +1540,9 @@ static inline void conv3x3s1_winograd43_transform_input_tile_bf16s(const Mat& bo
                     float32x4_t _t1_fp32 = bfloat2float(_t01.val[1]);

                     _r0 = vget_low_f32(_t0_fp32);
-                    if (tj * 2 + 1 < w) _r1 = vget_high_f32(_t0_fp32);
-                    if (tj * 2 + 2 < w) _r2 = vget_low_f32(_t1_fp32);
-                    if (tj * 2 + 3 < w) _r3 = vget_high_f32(_t1_fp32);
+                    if (tj * 4 + 1 < w) _r1 = vget_high_f32(_t0_fp32);
+                    if (tj * 4 + 2 < w) _r2 = vget_low_f32(_t1_fp32);
+                    if (tj * 4 + 3 < w) _r3 = vget_high_f32(_t1_fp32);
                     if (tj * 4 + 4 < w)
                     {
                         float tmp[2] = {bfloat16_to_float32(r0[4]), bfloat16_to_float32(r1[4])};
@@ -3211,9 +3211,9 @@ static inline void conv3x3s1_winograd63_transform_input_tile_bf16s(const Mat& bo
                     float32x4_t _t1_fp32 = bfloat2float(_t01.val[1]);

                     _r0 = vget_low_f32(_t0_fp32);
-                    if (tj * 2 + 1 < w) _r1 = vget_high_f32(_t0_fp32);
-                    if (tj * 2 + 2 < w) _r2 = vget_low_f32(_t1_fp32);
-                    if (tj * 2 + 3 < w) _r3 = vget_high_f32(_t1_fp32);
+                    if (tj * 6 + 1 < w) _r1 = vget_high_f32(_t0_fp32);
+                    if (tj * 6 + 2 < w) _r2 = vget_low_f32(_t1_fp32);
+                    if (tj * 6 + 3 < w) _r3 = vget_high_f32(_t1_fp32);
                     if (tj * 6 + 4 < w)
                     {
                         _t0 = vld1_u16(r0 + 4);
src/layer/x86/convolution_int8.h  deleted (100644 → 0, view file @ e8645e91)

// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void convolution_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        int* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                int sum = 0;

                // const signed char* kptr = weight_data_int8.channel(p);
                const signed char* kptr = (const signed char*)weight_data_int8 + maxk * channels * p;

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const signed char* sptr = m.row<signed char>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        signed char val = sptr[space_ofs[k]];
                        signed char w = kptr[k];
                        sum += val * w;
                    }

                    kptr += maxk;
                }

                outptr[j] = sum;
            }

            outptr += outw;
        }
    }
}
src/layer/x86/convolution_pack1to4_int8.h  deleted (100644 → 0, view file @ e8645e91)

// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void convolution_pack1to4_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        int* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                __m128i _sum0 = _mm_setzero_si128();

                const signed char* kptr = weight_data_int8.channel(p);

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        __m128i _val = _mm_set1_epi16((short)sptr[space_ofs[k]]);

                        // TODO use _mm_cvtepi8_epi16 on sse4.1
                        __m128i _w = _mm_loadl_epi64((const __m128i*)kptr);
                        _w = _mm_unpacklo_epi8(_w, _mm_cmpgt_epi8(_mm_setzero_si128(), _w));

                        __m128i _sl = _mm_mullo_epi16(_val, _w);
                        __m128i _sh = _mm_mulhi_epi16(_val, _w);
                        __m128i _s0 = _mm_unpacklo_epi16(_sl, _sh);

                        _sum0 = _mm_add_epi32(_sum0, _s0);

                        kptr += 4;
                    }
                }

                _mm_storeu_si128((__m128i*)(outptr + j * 4), _sum0);
            }

            outptr += outw * 4;
        }
    }
}
src/layer/x86/convolution_pack8to1_int8.h  deleted (100644 → 0, view file @ e8645e91)

// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void convolution_pack8to1_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        int* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                int sum = 0;

                const signed char* kptr = weight_data_int8.channel(p);

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w * 8;

                    for (int k = 0; k < maxk; k++)
                    {
                        // TODO use _mm_cvtepi8_epi16 on sse4.1
                        __m128i _val = _mm_loadl_epi64((const __m128i*)(sptr + space_ofs[k] * 8));
                        _val = _mm_unpacklo_epi8(_val, _mm_cmpgt_epi8(_mm_setzero_si128(), _val));

                        __m128i _w = _mm_loadl_epi64((const __m128i*)kptr);
                        _w = _mm_unpacklo_epi8(_w, _mm_cmpgt_epi8(_mm_setzero_si128(), _w));

                        __m128i _sl = _mm_mullo_epi16(_val, _w);
                        __m128i _sh = _mm_mulhi_epi16(_val, _w);
                        __m128i _s0 = _mm_unpacklo_epi16(_sl, _sh);
                        __m128i _s1 = _mm_unpackhi_epi16(_sl, _sh);
                        __m128i _s4 = _mm_add_epi32(_s0, _s1);

                        // TODO use _mm_hadd_epi32 on ssse3
                        int s4[4];
                        _mm_storeu_si128((__m128i*)s4, _s4);
                        sum += s4[0] + s4[1] + s4[2] + s4[3]; // dot

                        kptr += 8;
                    }
                }

                outptr[j] = sum;
            }

            outptr += outw;
        }
    }
}
src/layer/x86/convolution_pack8to4_int8.h  deleted (100644 → 0, view file @ e8645e91)

// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void convolution_pack8to4_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        int* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                __m128i _sum0 = _mm_setzero_si128();
                __m128i _sum1 = _mm_setzero_si128();
                __m128i _sum2 = _mm_setzero_si128();
                __m128i _sum3 = _mm_setzero_si128();

                const signed char* kptr = weight_data_int8.channel(p);

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const signed char* sptr = m.row<signed char>(i * stride_h) + j * stride_w * 8;

                    for (int k = 0; k < maxk; k++)
                    {
                        // TODO use _mm_cvtepi8_epi16 on sse4.1
                        __m128i _val = _mm_loadl_epi64((const __m128i*)(sptr + space_ofs[k] * 8));
                        _val = _mm_unpacklo_epi8(_val, _mm_cmpgt_epi8(_mm_setzero_si128(), _val));

                        // TODO use _mm_cvtepi8_epi16 on sse4.1
                        __m128i _w01 = _mm_loadu_si128((const __m128i*)kptr);
                        __m128i _w23 = _mm_loadu_si128((const __m128i*)(kptr + 16));
                        __m128i _extw01 = _mm_cmpgt_epi8(_mm_setzero_si128(), _w01);
                        __m128i _extw23 = _mm_cmpgt_epi8(_mm_setzero_si128(), _w23);
                        __m128i _w0 = _mm_unpacklo_epi8(_w01, _extw01);
                        __m128i _w1 = _mm_unpackhi_epi8(_w01, _extw01);
                        __m128i _w2 = _mm_unpacklo_epi8(_w23, _extw23);
                        __m128i _w3 = _mm_unpackhi_epi8(_w23, _extw23);

                        __m128i _sl0 = _mm_mullo_epi16(_val, _w0);
                        __m128i _sh0 = _mm_mulhi_epi16(_val, _w0);
                        __m128i _sl1 = _mm_mullo_epi16(_val, _w1);
                        __m128i _sh1 = _mm_mulhi_epi16(_val, _w1);
                        __m128i _sl2 = _mm_mullo_epi16(_val, _w2);
                        __m128i _sh2 = _mm_mulhi_epi16(_val, _w2);
                        __m128i _sl3 = _mm_mullo_epi16(_val, _w3);
                        __m128i _sh3 = _mm_mulhi_epi16(_val, _w3);

                        _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl0, _sh0));
                        _sum1 = _mm_add_epi32(_sum1, _mm_unpacklo_epi16(_sl1, _sh1));
                        _sum2 = _mm_add_epi32(_sum2, _mm_unpacklo_epi16(_sl2, _sh2));
                        _sum3 = _mm_add_epi32(_sum3, _mm_unpacklo_epi16(_sl3, _sh3));
                        _sum0 = _mm_add_epi32(_sum0, _mm_unpackhi_epi16(_sl0, _sh0));
                        _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl1, _sh1));
                        _sum2 = _mm_add_epi32(_sum2, _mm_unpackhi_epi16(_sl2, _sh2));
                        _sum3 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl3, _sh3));

                        kptr += 32;
                    }
                }

                // transpose 4x4
                {
                    __m128i _tmp0, _tmp1, _tmp2, _tmp3;
                    _tmp0 = _mm_unpacklo_epi32(_sum0, _sum1);
                    _tmp1 = _mm_unpacklo_epi32(_sum2, _sum3);
                    _tmp2 = _mm_unpackhi_epi32(_sum0, _sum1);
                    _tmp3 = _mm_unpackhi_epi32(_sum2, _sum3);
                    _sum0 = _mm_unpacklo_epi64(_tmp0, _tmp1);
                    _sum1 = _mm_unpackhi_epi64(_tmp0, _tmp1);
                    _sum2 = _mm_unpacklo_epi64(_tmp2, _tmp3);
                    _sum3 = _mm_unpackhi_epi64(_tmp2, _tmp3);
                }

                _sum0 = _mm_add_epi32(_sum0, _sum1);
                _sum2 = _mm_add_epi32(_sum2, _sum3);
                _sum0 = _mm_add_epi32(_sum0, _sum2);

                _mm_storeu_si128((__m128i*)(outptr + j * 4), _sum0);
            }

            outptr += outw * 4;
        }
    }
}
src/layer/x86/convolution_packed_int8.h  new file (0 → 100644, view file @ 55709708)

This diff is collapsed (+5386 lines not shown).
src/layer/x86/convolution_x86.cpp  (view file @ 55709708)

@@ -45,16 +45,14 @@ namespace ncnn {
 #include "convolution_sgemm_int8.h"
 #include "convolution_1x1_int8.h"
 #include "convolution_3x3_int8.h"
-#include "convolution_int8.h"
+#include "convolution_packed_int8.h"
 #endif // NCNN_INT8

 #if __SSE2__
 #include "convolution_3x3_pack1to4.h"

 #if NCNN_INT8
-#include "convolution_pack8to4_int8.h"
-#include "convolution_pack1to4_int8.h"
-#include "convolution_pack8to1_int8.h"
 #include "convolution_sgemm_pack8to4_int8.h"
 #include "convolution_sgemm_pack1to4_int8.h"
 #include "convolution_sgemm_pack8to1_int8.h"
@@ -1237,42 +1235,6 @@ int Convolution_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<M
 }

 #if NCNN_INT8
-static void convolution_transform_kernel_packed_int8_sse(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack)
-{
-    const int maxk = kernel_w * kernel_h;
-
-    // src = kw-kh-inch-outch
-    // dst = pa-pb-kw-kh-inch/pa-outch/pb
-    {
-        Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
-
-        weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)elempack * out_elempack, elempack * out_elempack);
-
-        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
-        {
-            signed char* g00 = weight_data_tm.channel(q / out_elempack);
-
-            for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
-            {
-                for (int k = 0; k < maxk; k++)
-                {
-                    for (int i = 0; i < out_elempack; i++)
-                    {
-                        for (int j = 0; j < elempack; j++)
-                        {
-                            const signed char* k00 = weight_data_r2.channel(q + i).row<const signed char>(p + j);
-                            g00[0] = k00[k];
-                            g00++;
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
 int Convolution_x86::create_pipeline_int8_x86(const Option& opt)
 {
     const int maxk = kernel_w * kernel_h;
@@ -1309,7 +1271,7 @@ int Convolution_x86::create_pipeline_int8_x86(const Option& opt)
     }
     else
     {
-        convolution_transform_kernel_packed_int8_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
+        convolution_transform_kernel_packed_int8(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
     }
 }
@@ -1341,7 +1303,7 @@ int Convolution_x86::create_pipeline_int8_x86(const Option& opt)
     }
     else
     {
-        convolution_transform_kernel_packed_int8_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
+        convolution_transform_kernel_packed_int8(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
     }
 }
@@ -1365,7 +1327,7 @@ int Convolution_x86::create_pipeline_int8_x86(const Option& opt)
     }
     else
    {
-        convolution_transform_kernel_packed_int8_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
+        convolution_transform_kernel_packed_int8(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
     }
 }
 #endif // __SSE2__
@@ -1391,7 +1353,7 @@ int Convolution_x86::create_pipeline_int8_x86(const Option& opt)
     }
     else
     {
-        weight_data_tm = weight_data;
+        convolution_transform_kernel_packed_int8(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
     }
 }
@@ -1501,7 +1463,7 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con
     }
     else
     {
-        convolution_pack8to4_int8_sse(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
+        convolution_packed_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
     }
 }
@@ -1533,7 +1495,7 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con
     }
     else
     {
-        convolution_pack1to4_int8_sse(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
+        convolution_packed_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
     }
 }
@@ -1557,7 +1519,7 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con
     }
     else
     {
-        convolution_pack8to1_int8_sse(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
+        convolution_packed_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
     }
 }
 #endif // __SSE2__
@@ -1583,7 +1545,7 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con
     }
     else
     {
-        convolution_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
+        convolution_packed_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
     }
 }
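The structural change here is that the three pack-specific int8 kernels and the local repack helper are replaced by a single elempack-agnostic pair, convolution_transform_kernel_packed_int8 / convolution_packed_int8, from the new (collapsed) header. As rough orientation only, below is a scalar sketch of how a kernel can consume the unified weight layout described in the deleted repack helper's comment (pa-pb-kw-kh-inch/pa-outch/pb); the function name, the plain CHW input layout, and the omission of dilation are all assumptions for readability, not ncnn's implementation:

```cpp
#include <vector>

// Illustrative scalar reference: weights are assumed repacked as
// [outch/out_elempack][inch/elempack][maxk][out_elempack][elempack],
// matching the repack loop this PR removed from convolution_x86.cpp.
static void convolution_packed_int8_ref(const signed char* bottom, int w, int h, int inch,
                                        int* top, int outw, int outh, int outch,
                                        const signed char* weights,
                                        int kernel_w, int kernel_h,
                                        int stride_w, int stride_h,
                                        int elempack, int out_elempack)
{
    const int maxk = kernel_w * kernel_h;

    for (int op = 0; op < outch; op += out_elempack)
    {
        // one packed weight block per group of out_elempack output channels
        const signed char* kptr0 = weights + (op / out_elempack) * (inch / elempack) * maxk * out_elempack * elempack;

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                std::vector<int> sum(out_elempack, 0);
                const signed char* kptr = kptr0;

                for (int q = 0; q < inch; q += elempack)
                {
                    for (int k = 0; k < maxk; k++)
                    {
                        const int ky = k / kernel_w;
                        const int kx = k % kernel_w;

                        // per tap, weights are stored out_elempack-major, elempack-minor
                        for (int oi = 0; oi < out_elempack; oi++)
                        {
                            for (int pj = 0; pj < elempack; pj++)
                            {
                                const signed char val = bottom[(q + pj) * w * h + (i * stride_h + ky) * w + (j * stride_w + kx)];
                                sum[oi] += val * *kptr++;
                            }
                        }
                    }
                }

                for (int oi = 0; oi < out_elempack; oi++)
                    top[(op + oi) * outw * outh + i * outw + j] = sum[oi];
            }
        }
    }
}
```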
src/layer/x86/convolution_x86_avx2.cpp  (view file @ 55709708)

@@ -18,6 +18,7 @@
 namespace ncnn {

+#include "convolution_packed_int8.h"
 #include "convolution_sgemm_int8.h"
 #include "convolution_sgemm_pack1to4_int8.h"
 #include "convolution_sgemm_pack8to1_int8.h"
@@ -25,6 +26,17 @@ namespace ncnn {
 #include "convolution_3x3_pack8to1_int8.h"
 #include "convolution_3x3_pack8to4_int8.h"

+// packed
+void convolution_transform_kernel_packed_int8_avx2(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
+{
+    convolution_transform_kernel_packed_int8(kernel, kernel_tm, inch, outch, kernel_w, kernel_h);
+}
+
+void convolution_packed_int8_avx2(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
+{
+    convolution_packed_int8(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
+}
+
 // pack1
 void im2col_sgemm_int8_sse_avx2(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
src/layer/x86/convolution_x86_avx512vnni.cpp  (view file @ 55709708)

@@ -18,6 +18,7 @@
 namespace ncnn {

+#include "convolution_packed_int8.h"
 #include "convolution_sgemm_int8.h"
 #include "convolution_sgemm_pack1to4_int8.h"
 #include "convolution_sgemm_pack8to1_int8.h"
@@ -25,6 +26,12 @@ namespace ncnn {
 #include "convolution_3x3_pack8to1_int8.h"
 #include "convolution_3x3_pack8to4_int8.h"

+// packed
+void convolution_packed_int8_avx512vnni(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
+{
+    convolution_packed_int8(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
+}
+
 // pack1
 void im2col_sgemm_int8_sse_avx512vnni(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
src/layer/x86/convolution_x86_avxvnni.cpp  (view file @ 55709708)

@@ -18,6 +18,7 @@
 namespace ncnn {

+#include "convolution_packed_int8.h"
 #include "convolution_sgemm_int8.h"
 #include "convolution_sgemm_pack1to4_int8.h"
 #include "convolution_sgemm_pack8to1_int8.h"
@@ -25,6 +26,12 @@ namespace ncnn {
 #include "convolution_3x3_pack8to1_int8.h"
 #include "convolution_3x3_pack8to4_int8.h"

+// packed
+void convolution_packed_int8_avxvnni(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
+{
+    convolution_packed_int8(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
+}
+
 // pack1
 void im2col_sgemm_int8_sse_avxvnni(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
src/layer/x86/convolution_x86_xop.cpp  (view file @ 55709708)

@@ -18,6 +18,7 @@
 namespace ncnn {

+#include "convolution_packed_int8.h"
 #include "convolution_sgemm_int8.h"
 #include "convolution_sgemm_pack1to4_int8.h"
 #include "convolution_sgemm_pack8to1_int8.h"
@@ -25,6 +26,12 @@ namespace ncnn {
 #include "convolution_3x3_pack8to1_int8.h"
 #include "convolution_3x3_pack8to4_int8.h"

+// packed
+void convolution_packed_int8_xop(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
+{
+    convolution_packed_int8(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
+}
+
 // pack1
 void im2col_sgemm_int8_sse_xop(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
src/layer/x86/x86_usability.h  (view file @ 55709708)

@@ -967,6 +967,60 @@ static NCNN_FORCEINLINE void transpose16x8_epi16(__m256i& _r0, __m256i& _r1, __m
     _r7 = _mm256_permute2x128_si256(_tmp6, _tmp7, _MM_SHUFFLE(0, 3, 0, 1));
 }

+static NCNN_FORCEINLINE void transpose8x16_epi16(__m128i& _r0, __m128i& _r1, __m128i& _r2, __m128i& _r3, __m128i& _r4, __m128i& _r5, __m128i& _r6, __m128i& _r7, __m128i& _r8, __m128i& _r9, __m128i& _ra, __m128i& _rb, __m128i& _rc, __m128i& _rd, __m128i& _re, __m128i& _rf)
+{
+    __m128i _tmp0 = _mm_unpacklo_epi16(_r0, _r1);
+    __m128i _tmp1 = _mm_unpackhi_epi16(_r0, _r1);
+    __m128i _tmp2 = _mm_unpacklo_epi16(_r2, _r3);
+    __m128i _tmp3 = _mm_unpackhi_epi16(_r2, _r3);
+    __m128i _tmp4 = _mm_unpacklo_epi16(_r4, _r5);
+    __m128i _tmp5 = _mm_unpackhi_epi16(_r4, _r5);
+    __m128i _tmp6 = _mm_unpacklo_epi16(_r6, _r7);
+    __m128i _tmp7 = _mm_unpackhi_epi16(_r6, _r7);
+    __m128i _tmp8 = _mm_unpacklo_epi16(_r8, _r9);
+    __m128i _tmp9 = _mm_unpackhi_epi16(_r8, _r9);
+    __m128i _tmpa = _mm_unpacklo_epi16(_ra, _rb);
+    __m128i _tmpb = _mm_unpackhi_epi16(_ra, _rb);
+    __m128i _tmpc = _mm_unpacklo_epi16(_rc, _rd);
+    __m128i _tmpd = _mm_unpackhi_epi16(_rc, _rd);
+    __m128i _tmpe = _mm_unpacklo_epi16(_re, _rf);
+    __m128i _tmpf = _mm_unpackhi_epi16(_re, _rf);
+
+    __m128i _tmpg = _mm_unpacklo_epi32(_tmp0, _tmp2);
+    __m128i _tmph = _mm_unpackhi_epi32(_tmp0, _tmp2);
+    __m128i _tmpi = _mm_unpacklo_epi32(_tmp1, _tmp3);
+    __m128i _tmpj = _mm_unpackhi_epi32(_tmp1, _tmp3);
+    __m128i _tmpk = _mm_unpacklo_epi32(_tmp4, _tmp6);
+    __m128i _tmpl = _mm_unpackhi_epi32(_tmp4, _tmp6);
+    __m128i _tmpm = _mm_unpacklo_epi32(_tmp5, _tmp7);
+    __m128i _tmpn = _mm_unpackhi_epi32(_tmp5, _tmp7);
+    __m128i _tmpo = _mm_unpacklo_epi32(_tmp8, _tmpa);
+    __m128i _tmpp = _mm_unpackhi_epi32(_tmp8, _tmpa);
+    __m128i _tmpq = _mm_unpacklo_epi32(_tmp9, _tmpb);
+    __m128i _tmpr = _mm_unpackhi_epi32(_tmp9, _tmpb);
+    __m128i _tmps = _mm_unpacklo_epi32(_tmpc, _tmpe);
+    __m128i _tmpt = _mm_unpackhi_epi32(_tmpc, _tmpe);
+    __m128i _tmpu = _mm_unpacklo_epi32(_tmpd, _tmpf);
+    __m128i _tmpv = _mm_unpackhi_epi32(_tmpd, _tmpf);
+
+    _r0 = _mm_unpacklo_epi64(_tmpg, _tmpk);
+    _r1 = _mm_unpacklo_epi64(_tmpo, _tmps);
+    _r2 = _mm_unpackhi_epi64(_tmpg, _tmpk);
+    _r3 = _mm_unpackhi_epi64(_tmpo, _tmps);
+    _r4 = _mm_unpacklo_epi64(_tmph, _tmpl);
+    _r5 = _mm_unpacklo_epi64(_tmpp, _tmpt);
+    _r6 = _mm_unpackhi_epi64(_tmph, _tmpl);
+    _r7 = _mm_unpackhi_epi64(_tmpp, _tmpt);
+    _r8 = _mm_unpacklo_epi64(_tmpi, _tmpm);
+    _r9 = _mm_unpacklo_epi64(_tmpq, _tmpu);
+    _ra = _mm_unpackhi_epi64(_tmpi, _tmpm);
+    _rb = _mm_unpackhi_epi64(_tmpq, _tmpu);
+    _rc = _mm_unpacklo_epi64(_tmpj, _tmpn);
+    _rd = _mm_unpacklo_epi64(_tmpr, _tmpv);
+    _re = _mm_unpackhi_epi64(_tmpj, _tmpn);
+    _rf = _mm_unpackhi_epi64(_tmpr, _tmpv);
+}
+
 static NCNN_FORCEINLINE float _mm512_comp_reduce_add_ps(__m512 x)
 {
     const __m256 x256 = _mm256_add_ps(_mm512_castps512_ps256(x), _mm512_extractf32x8_ps(x, 1));
tests/test_convolution.cpp  (view file @ 55709708)

@@ -46,6 +46,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
         return ret;
     }

     {
@@ -65,6 +66,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
         return ret;
     }
     }
@@ -85,6 +87,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
         return ret;
     }
     }
@@ -98,6 +101,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
         return ret;
     }
     }
 #endif // __aarch64__
tests/test_convolution_1.cpp  (view file @ 55709708)

@@ -46,6 +46,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
         return ret;
     }

     {
@@ -65,6 +66,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
         return ret;
     }
     }
@@ -85,6 +87,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
         return ret;
     }
     }
@@ -98,6 +101,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
         return ret;
     }
     }
 #endif // __aarch64__
tests/test_convolution_2.cpp  (view file @ 55709708)

@@ -48,6 +48,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
         return ret;
     }

     {
@@ -67,6 +68,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
         return ret;
     }
     }
@@ -87,6 +89,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
         return ret;
     }
     }
@@ -99,6 +102,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
         return ret;
     }
     }
tests/test_convolution_3.cpp  (view file @ 55709708)

@@ -187,6 +187,7 @@ static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant, activation_type, activation_params[0], activation_params[1]);
         return ret;
     }

     {
@@ -206,6 +207,7 @@ static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant, activation_type, activation_params[0], activation_params[1]);
         return ret;
     }
     }
@@ -226,6 +228,7 @@ static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant, activation_type, activation_params[0], activation_params[1]);
         return ret;
     }
     }
@@ -309,6 +312,82 @@ static int test_convolution_1()
            || test_convolution_int8(25, 33, 16, 15, 3, 1, 1, 1, 0)
            || test_convolution_int8(7, 7, 15, 12, 3, 1, 1, 1, 0);
 }

+static int test_convolution_1_2()
+{
+    return 0
+           || test_convolution_int8(19, 17, 1, 1, 3, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 2, 1, 3, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 7, 1, 3, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 8, 1, 3, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 15, 1, 3, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 16, 1, 3, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 31, 1, 3, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 32, 1, 3, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 1, 2, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 2, 2, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 7, 2, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 8, 2, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 15, 2, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 16, 2, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 31, 2, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 32, 2, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 1, 7, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 2, 7, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 7, 7, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 8, 7, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 15, 7, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 16, 7, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 31, 7, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 32, 7, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 1, 8, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 2, 8, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 7, 8, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 8, 8, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 15, 8, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 16, 8, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 31, 8, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 32, 8, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 1, 15, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 2, 15, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 7, 15, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 8, 15, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 15, 15, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 16, 15, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 31, 15, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 32, 15, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 1, 16, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 2, 16, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 7, 16, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 8, 16, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 15, 16, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 16, 16, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 31, 16, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 32, 16, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 1, 31, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 2, 31, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 7, 31, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 8, 31, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 15, 31, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 16, 31, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 31, 31, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 32, 31, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 1, 32, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 2, 32, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 7, 32, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 8, 32, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 15, 32, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 16, 32, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 31, 32, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 32, 32, 5, 2, 2, 0, 0);
+}
+
 #endif // NCNN_INT8

 int main()
@@ -318,6 +397,7 @@ int main()
 #if NCNN_INT8
     return 0
            || test_convolution_1()
+           || test_convolution_1_2()
            || test_convolution_2()
            || test_convolution_3();
 #else