shadowed variables for fewer openmp task args (#4744)

7883f4d0 · nihui · GitHub · 1d6bfdca · 7883f4d0 · 7883f4d0
14 changed files
--- a/src/layer/arm/convolution1d_packed.h
+++ b/src/layer/arm/convolution1d_packed.h
@@ -525,6 +525,12 @@ static void convolution1d_packed(const Mat& bottom_blob, Mat& top_blob, const Ma
    {
        const int p = remain_outh_start + pp * 8;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inh = bottom_blob.h * elempack;
+        const int outw = top_blob.w;
+        const int out_elempack = top_blob.elempack;
+
        float* outptr = top_blob.row(p / out_elempack);

        for (int j = 0; j < outw; j++)
@@ -743,6 +749,12 @@ static void convolution1d_packed(const Mat& bottom_blob, Mat& top_blob, const Ma
    {
        const int p = remain_outh_start + pp * 4;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inh = bottom_blob.h * elempack;
+        const int outw = top_blob.w;
+        const int out_elempack = top_blob.elempack;
+
        float* outptr = top_blob.row(p / out_elempack);

        for (int j = 0; j < outw; j++)
@@ -939,6 +951,11 @@ static void convolution1d_packed(const Mat& bottom_blob, Mat& top_blob, const Ma
    {
        const int p = remain_outh_start + pp * 2;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inh = bottom_blob.h * elempack;
+        const int outw = top_blob.w;
+
        float* outptr0 = top_blob.row(p);
        float* outptr1 = top_blob.row(p + 1);


--- a/src/layer/arm/convolution1d_packed_bf16s.h
+++ b/src/layer/arm/convolution1d_packed_bf16s.h
@@ -525,6 +525,12 @@ static void convolution1d_packed_bf16s(const Mat& bottom_blob, Mat& top_blob, co
    {
        const int p = remain_outh_start + pp * 8;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inh = bottom_blob.h * elempack;
+        const int outw = top_blob.w;
+        const int out_elempack = top_blob.elempack;
+
        unsigned short* outptr = top_blob.row<unsigned short>(p / out_elempack);

        for (int j = 0; j < outw; j++)
@@ -762,6 +768,12 @@ static void convolution1d_packed_bf16s(const Mat& bottom_blob, Mat& top_blob, co
    {
        const int p = remain_outh_start + pp * 4;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inh = bottom_blob.h * elempack;
+        const int outw = top_blob.w;
+        const int out_elempack = top_blob.elempack;
+
        unsigned short* outptr = top_blob.row<unsigned short>(p / out_elempack);

        for (int j = 0; j < outw; j++)
@@ -968,6 +980,11 @@ static void convolution1d_packed_bf16s(const Mat& bottom_blob, Mat& top_blob, co
    {
        const int p = remain_outh_start + pp * 2;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inh = bottom_blob.h * elempack;
+        const int outw = top_blob.w;
+
        unsigned short* outptr0 = top_blob.row<unsigned short>(p);
        unsigned short* outptr1 = top_blob.row<unsigned short>(p + 1);


--- a/src/layer/arm/convolution1d_packed_fp16s.h
+++ b/src/layer/arm/convolution1d_packed_fp16s.h
@@ -474,6 +474,12 @@ static void convolution1d_packed_fp16s(const Mat& bottom_blob, Mat& top_blob, co
    {
        const int p = remain_outh_start + pp * 8;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inh = bottom_blob.h * elempack;
+        const int outw = top_blob.w;
+        const int out_elempack = top_blob.elempack;
+
        __fp16* outptr = top_blob.row<__fp16>(p / out_elempack);

        for (int j = 0; j < outw; j++)
@@ -707,6 +713,12 @@ static void convolution1d_packed_fp16s(const Mat& bottom_blob, Mat& top_blob, co
    {
        const int p = remain_outh_start + pp * 4;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inh = bottom_blob.h * elempack;
+        const int outw = top_blob.w;
+        const int out_elempack = top_blob.elempack;
+
        __fp16* outptr = top_blob.row<__fp16>(p / out_elempack);

        for (int j = 0; j < outw; j++)
@@ -887,6 +899,11 @@ static void convolution1d_packed_fp16s(const Mat& bottom_blob, Mat& top_blob, co
    {
        const int p = remain_outh_start + pp * 2;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inh = bottom_blob.h * elempack;
+        const int outw = top_blob.w;
+
        __fp16* outptr0 = top_blob.row<__fp16>(p);
        __fp16* outptr1 = top_blob.row<__fp16>(p + 1);

@@ -1206,6 +1223,12 @@ static void convolution1d_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
    {
        const int p = remain_outh_start + pp * 8;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inh = bottom_blob.h * elempack;
+        const int outw = top_blob.w;
+        const int out_elempack = top_blob.elempack;
+
        __fp16* outptr = top_blob.row<__fp16>(p / out_elempack);

        for (int j = 0; j < outw; j++)
@@ -1388,6 +1411,12 @@ static void convolution1d_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
    {
        const int p = remain_outh_start + pp * 4;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inh = bottom_blob.h * elempack;
+        const int outw = top_blob.w;
+        const int out_elempack = top_blob.elempack;
+
        __fp16* outptr = top_blob.row<__fp16>(p / out_elempack);

        for (int j = 0; j < outw; j++)
@@ -1565,6 +1594,11 @@ static void convolution1d_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
    {
        const int p = remain_outh_start + pp * 2;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inh = bottom_blob.h * elempack;
+        const int outw = top_blob.w;
+
        __fp16* outptr0 = top_blob.row<__fp16>(p);
        __fp16* outptr1 = top_blob.row<__fp16>(p + 1);


--- a/src/layer/arm/convolution_packed.h
+++ b/src/layer/arm/convolution_packed.h
@@ -550,6 +550,13 @@ static void convolution_packed(const Mat& bottom_blob, Mat& top_blob, const Mat&
    {
        const int p = remain_outch_start + pp * 8;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inch = bottom_blob.c * elempack;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+        const int out_elempack = top_blob.elempack;
+
        float* outptr = top_blob.channel(p / out_elempack);

        for (int i = 0; i < outh; i++)
@@ -768,6 +775,13 @@ static void convolution_packed(const Mat& bottom_blob, Mat& top_blob, const Mat&
    {
        const int p = remain_outch_start + pp * 4;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inch = bottom_blob.c * elempack;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+        const int out_elempack = top_blob.elempack;
+
        float* outptr = top_blob.channel(p / out_elempack);

        for (int i = 0; i < outh; i++)
@@ -964,6 +978,12 @@ static void convolution_packed(const Mat& bottom_blob, Mat& top_blob, const Mat&
    {
        const int p = remain_outch_start + pp * 2;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inch = bottom_blob.c * elempack;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+
        float* outptr0 = top_blob.channel(p);
        float* outptr1 = top_blob.channel(p + 1);


--- a/src/layer/arm/convolution_packed_bf16s.h
+++ b/src/layer/arm/convolution_packed_bf16s.h
@@ -550,6 +550,13 @@ static void convolution_packed_bf16s(const Mat& bottom_blob, Mat& top_blob, cons
    {
        const int p = remain_outch_start + pp * 8;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inch = bottom_blob.c * elempack;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+        const int out_elempack = top_blob.elempack;
+
        unsigned short* outptr = top_blob.channel(p / out_elempack);

        for (int i = 0; i < outh; i++)
@@ -787,6 +794,13 @@ static void convolution_packed_bf16s(const Mat& bottom_blob, Mat& top_blob, cons
    {
        const int p = remain_outch_start + pp * 4;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inch = bottom_blob.c * elempack;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+        const int out_elempack = top_blob.elempack;
+
        unsigned short* outptr = top_blob.channel(p / out_elempack);

        for (int i = 0; i < outh; i++)
@@ -993,6 +1007,12 @@ static void convolution_packed_bf16s(const Mat& bottom_blob, Mat& top_blob, cons
    {
        const int p = remain_outch_start + pp * 2;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inch = bottom_blob.c * elempack;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+
        unsigned short* outptr0 = top_blob.channel(p);
        unsigned short* outptr1 = top_blob.channel(p + 1);


--- a/src/layer/arm/convolution_packed_fp16s.h
+++ b/src/layer/arm/convolution_packed_fp16s.h
@@ -499,6 +499,13 @@ static void convolution_packed_fp16s(const Mat& bottom_blob, Mat& top_blob, cons
    {
        const int p = remain_outch_start + pp * 8;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inch = bottom_blob.c * elempack;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+        const int out_elempack = top_blob.elempack;
+
        __fp16* outptr = top_blob.channel(p / out_elempack);

        for (int i = 0; i < outh; i++)
@@ -732,6 +739,13 @@ static void convolution_packed_fp16s(const Mat& bottom_blob, Mat& top_blob, cons
    {
        const int p = remain_outch_start + pp * 4;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inch = bottom_blob.c * elempack;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+        const int out_elempack = top_blob.elempack;
+
        __fp16* outptr = top_blob.channel(p / out_elempack);

        for (int i = 0; i < outh; i++)
@@ -912,6 +926,12 @@ static void convolution_packed_fp16s(const Mat& bottom_blob, Mat& top_blob, cons
    {
        const int p = remain_outch_start + pp * 2;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inch = bottom_blob.c * elempack;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+
        __fp16* outptr0 = top_blob.channel(p);
        __fp16* outptr1 = top_blob.channel(p + 1);

@@ -1254,6 +1274,13 @@ static void convolution_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
    {
        const int p = remain_outch_start + pp * 8;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inch = bottom_blob.c * elempack;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+        const int out_elempack = top_blob.elempack;
+
        __fp16* outptr = top_blob.channel(p / out_elempack);

        for (int i = 0; i < outh; i++)
@@ -1435,6 +1462,13 @@ static void convolution_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
    {
        const int p = remain_outch_start + pp * 4;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inch = bottom_blob.c * elempack;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+        const int out_elempack = top_blob.elempack;
+
        __fp16* outptr = top_blob.channel(p / out_elempack);

        for (int i = 0; i < outh; i++)
@@ -1611,6 +1645,12 @@ static void convolution_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
    {
        const int p = remain_outch_start + pp * 2;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inch = bottom_blob.c * elempack;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+
        __fp16* outptr0 = top_blob.channel(p);
        __fp16* outptr1 = top_blob.channel(p + 1);


--- a/src/layer/arm/gemm_arm.cpp
+++ b/src/layer/arm/gemm_arm.cpp
@@ -3839,6 +3839,10 @@ static int gemm_arm(const Mat& A, const Mat& B, const Mat& C, Mat& top_blob, int
    {
        const int i = ppi * TILE_M;

+        // shadowed variable for less openmp task args
+        const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
+        const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;
+
        const int max_ii = std::min((M - i), TILE_M);

        Mat topT_tile;
@@ -4013,6 +4017,10 @@ static int gemm_BT_arm(const Mat& A, const Mat& BT, const Mat& C, Mat& top_blob,
    {
        const int i = ppi * TILE_M;

+        // shadowed variable for less openmp task args
+        const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
+        const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;
+
        const int max_ii = std::min((M - i), TILE_M);

        Mat topT_tile;
@@ -4548,6 +4556,10 @@ static int gemm_arm_bf16s(const Mat& A, const Mat& B, const Mat& C, Mat& top_blo
    {
        const int i = ppi * TILE_M;

+        // shadowed variable for less openmp task args
+        const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
+        const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;
+
        const int max_ii = std::min((M - i), TILE_M);

        Mat topT_tile;
@@ -4724,6 +4736,10 @@ static int gemm_BT_arm_bf16s(const Mat& A, const Mat& BT, const Mat& C, Mat& top
    {
        const int i = ppi * TILE_M;

+        // shadowed variable for less openmp task args
+        const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
+        const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;
+
        const int max_ii = std::min((M - i), TILE_M);

        Mat topT_tile;

--- a/src/layer/arm/gemm_arm_asimdhp.cpp
+++ b/src/layer/arm/gemm_arm_asimdhp.cpp
@@ -2398,6 +2398,10 @@ static int gemm_arm_fp16sa(const Mat& A, const Mat& B, const Mat& C, Mat& top_bl
    {
        const int i = ppi * TILE_M;

+        // shadowed variable for less openmp task args
+        const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
+        const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;
+
        const int max_ii = std::min((M - i), TILE_M);

        Mat topT_tile;
@@ -2572,6 +2576,10 @@ static int gemm_BT_arm_fp16sa(const Mat& A, const Mat& BT, const Mat& C, Mat& to
    {
        const int i = ppi * TILE_M;

+        // shadowed variable for less openmp task args
+        const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
+        const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;
+
        const int max_ii = std::min((M - i), TILE_M);

        Mat topT_tile;

--- a/src/layer/arm/gemm_arm_vfpv4.cpp
+++ b/src/layer/arm/gemm_arm_vfpv4.cpp
@@ -85,6 +85,10 @@ static int gemm_arm_fp16s(const Mat& A, const Mat& B, const Mat& C, Mat& top_blo
    {
        const int i = ppi * TILE_M;

+        // shadowed variable for less openmp task args
+        const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
+        const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;
+
        const int max_ii = std::min((M - i), TILE_M);

        Mat topT_tile;
@@ -261,6 +265,10 @@ static int gemm_BT_arm_fp16s(const Mat& A, const Mat& BT, const Mat& C, Mat& top
    {
        const int i = ppi * TILE_M;

+        // shadowed variable for less openmp task args
+        const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
+        const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;
+
        const int max_ii = std::min((M - i), TILE_M);

        Mat topT_tile;

--- a/src/layer/arm/gemm_fp16s.h
+++ b/src/layer/arm/gemm_fp16s.h
@@ -268,6 +268,7 @@ static void pack_B_tile_fp32_to_fp16(const Mat& B, Mat& BT, int j, int max_jj, i
    unsigned short* pp = BT;

    int jj = 0;
+#if __aarch64__
    for (; jj + 11 < max_jj; jj += 12)
    {
        const float* p0 = (const float*)B + (j + jj) * B_hstep + k;
@@ -358,6 +359,7 @@ static void pack_B_tile_fp32_to_fp16(const Mat& B, Mat& BT, int j, int max_jj, i
            pb++;
        }
    }
+#endif // __aarch64__
    for (; jj + 7 < max_jj; jj += 8)
    {
        const float* p0 = (const float*)B + (j + jj) * B_hstep + k;
@@ -571,6 +573,7 @@ static void transpose_pack_B_tile_fp32_to_fp16(const Mat& B, Mat& BT, int j, int
    unsigned short* pp = BT;

    int jj = 0;
+#if __aarch64__
    for (; jj + 11 < max_jj; jj += 12)
    {
        const float* p0 = (const float*)B + k * B_hstep + (j + jj);
@@ -585,6 +588,7 @@ static void transpose_pack_B_tile_fp32_to_fp16(const Mat& B, Mat& BT, int j, int
            p0 += B_hstep;
        }
    }
+#endif // __aarch64__
    for (; jj + 7 < max_jj; jj += 8)
    {
        const float* p0 = (const float*)B + k * B_hstep + (j + jj);
@@ -1986,6 +1990,7 @@ static void gemm_transB_packed_tile_fp16s(const Mat& AT_tile, const Mat& BT_tile
        }

        int jj = 0;
+#if __aarch64__
        for (; jj + 11 < max_jj; jj += 12)
        {
            float32x4_t _sum0;
@@ -2301,6 +2306,7 @@ static void gemm_transB_packed_tile_fp16s(const Mat& AT_tile, const Mat& BT_tile

            outptr += 48;
        }
+#endif // __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            float32x4_t _sum0;
@@ -2871,6 +2877,7 @@ static void gemm_transB_packed_tile_fp16s(const Mat& AT_tile, const Mat& BT_tile
        }

        int jj = 0;
+#if __aarch64__
        for (; jj + 11 < max_jj; jj += 12)
        {
            float32x4_t _sum00;
@@ -3042,6 +3049,7 @@ static void gemm_transB_packed_tile_fp16s(const Mat& AT_tile, const Mat& BT_tile

            outptr += 24;
        }
+#endif // __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            float32x4_t _sum00;
@@ -3517,6 +3525,7 @@ static void gemm_transB_packed_tile_fp16s(const Mat& AT_tile, const Mat& BT_tile
        }

        int jj = 0;
+#if __aarch64__
        for (; jj + 11 < max_jj; jj += 12)
        {
            float32x4_t _sum0;
@@ -3620,6 +3629,7 @@ static void gemm_transB_packed_tile_fp16s(const Mat& AT_tile, const Mat& BT_tile

            outptr += 12;
        }
+#endif // __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            float32x4_t _sum0;

--- a/src/layer/gemm.cpp
+++ b/src/layer/gemm.cpp
@@ -174,9 +174,9 @@ int Gemm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
        B = B0;
    }

-    int M = A.dims == 3 ? A.c : A.h;
-    int K = A.w; // assert A.w == B.w
-    int N = B.dims == 3 ? B.c : B.h;
+    const int M = A.dims == 3 ? A.c : A.h;
+    const int K = A.w; // assert A.w == B.w
+    const int N = B.dims == 3 ? B.c : B.h;

    const float* ptrC = 0;
    int broadcast_type_C = 0;

--- a/src/layer/x86/convolution1d_packed.h
+++ b/src/layer/x86/convolution1d_packed.h
@@ -999,6 +999,12 @@ static void convolution1d_packed(const Mat& bottom_blob, Mat& top_blob, const Ma
    {
        const int p = pp * 16;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inh = bottom_blob.h * elempack;
+        const int outw = top_blob.w;
+        const int out_elempack = top_blob.elempack;
+
        float* outptr = top_blob.row(p / out_elempack);

        for (int j = 0; j < outw; j++)
@@ -1423,6 +1429,12 @@ static void convolution1d_packed(const Mat& bottom_blob, Mat& top_blob, const Ma
    {
        const int p = remain_outh_start + pp * 8;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inh = bottom_blob.h * elempack;
+        const int outw = top_blob.w;
+        const int out_elempack = top_blob.elempack;
+
        float* outptr = top_blob.row(p / out_elempack);

        for (int j = 0; j < outw; j++)
@@ -1837,6 +1849,12 @@ static void convolution1d_packed(const Mat& bottom_blob, Mat& top_blob, const Ma
    {
        const int p = remain_outh_start + pp * 4;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inh = bottom_blob.h * elempack;
+        const int outw = top_blob.w;
+        const int out_elempack = top_blob.elempack;
+
        float* outptr = top_blob.row(p / out_elempack);

        for (int j = 0; j < outw; j++)
@@ -2245,6 +2263,11 @@ static void convolution1d_packed(const Mat& bottom_blob, Mat& top_blob, const Ma
    {
        const int p = remain_outh_start + pp * 2;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inh = bottom_blob.h * elempack;
+        const int outw = top_blob.w;
+
        float* outptr0 = top_blob.row(p);
        float* outptr1 = top_blob.row(p + 1);


--- a/src/layer/x86/convolution_packed.h
+++ b/src/layer/x86/convolution_packed.h
@@ -1024,6 +1024,13 @@ static void convolution_packed(const Mat& bottom_blob, Mat& top_blob, const Mat&
    {
        const int p = pp * 16;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inch = bottom_blob.c * elempack;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+        const int out_elempack = top_blob.elempack;
+
        float* outptr = top_blob.channel(p / out_elempack);

        for (int i = 0; i < outh; i++)
@@ -1460,6 +1467,13 @@ static void convolution_packed(const Mat& bottom_blob, Mat& top_blob, const Mat&
    {
        const int p = remain_outch_start + pp * 8;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inch = bottom_blob.c * elempack;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+        const int out_elempack = top_blob.elempack;
+
        float* outptr = top_blob.channel(p / out_elempack);

        for (int i = 0; i < outh; i++)
@@ -1886,6 +1900,13 @@ static void convolution_packed(const Mat& bottom_blob, Mat& top_blob, const Mat&
    {
        const int p = remain_outch_start + pp * 4;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inch = bottom_blob.c * elempack;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+        const int out_elempack = top_blob.elempack;
+
        float* outptr = top_blob.channel(p / out_elempack);

        for (int i = 0; i < outh; i++)
@@ -2306,6 +2327,12 @@ static void convolution_packed(const Mat& bottom_blob, Mat& top_blob, const Mat&
    {
        const int p = remain_outch_start + pp * 2;

+        // shadowed variable for less openmp task args
+        const int elempack = bottom_blob.elempack;
+        const int inch = bottom_blob.c * elempack;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+
        float* outptr0 = top_blob.channel(p);
        float* outptr1 = top_blob.channel(p + 1);


--- a/src/layer/x86/gemm_x86.cpp
+++ b/src/layer/x86/gemm_x86.cpp
@@ -6897,6 +6897,10 @@ static int gemm_x86(const Mat& A, const Mat& B, const Mat& C, Mat& top_blob, int
    {
        const int i = ppi * TILE_M;

+        // shadowed variable for less openmp task args
+        const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
+        const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;
+
        const int max_ii = std::min((M - i), TILE_M);

        Mat topT_tile;
@@ -7071,6 +7075,10 @@ static int gemm_BT_x86(const Mat& A, const Mat& BT, const Mat& C, Mat& top_blob,
    {
        const int i = ppi * TILE_M;

+        // shadowed variable for less openmp task args
+        const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
+        const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;
+
        const int max_ii = std::min((M - i), TILE_M);

        Mat topT_tile;