未验证 提交 7883f4d0 编写于 作者: N nihui 提交者: GitHub

shadowed variable for less openmp task args (#4744)

上级 1d6bfdca
......@@ -525,6 +525,12 @@ static void convolution1d_packed(const Mat& bottom_blob, Mat& top_blob, const Ma
{
const int p = remain_outh_start + pp * 8;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inh = bottom_blob.h * elempack;
const int outw = top_blob.w;
const int out_elempack = top_blob.elempack;
float* outptr = top_blob.row(p / out_elempack);
for (int j = 0; j < outw; j++)
......@@ -743,6 +749,12 @@ static void convolution1d_packed(const Mat& bottom_blob, Mat& top_blob, const Ma
{
const int p = remain_outh_start + pp * 4;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inh = bottom_blob.h * elempack;
const int outw = top_blob.w;
const int out_elempack = top_blob.elempack;
float* outptr = top_blob.row(p / out_elempack);
for (int j = 0; j < outw; j++)
......@@ -939,6 +951,11 @@ static void convolution1d_packed(const Mat& bottom_blob, Mat& top_blob, const Ma
{
const int p = remain_outh_start + pp * 2;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inh = bottom_blob.h * elempack;
const int outw = top_blob.w;
float* outptr0 = top_blob.row(p);
float* outptr1 = top_blob.row(p + 1);
......
......@@ -525,6 +525,12 @@ static void convolution1d_packed_bf16s(const Mat& bottom_blob, Mat& top_blob, co
{
const int p = remain_outh_start + pp * 8;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inh = bottom_blob.h * elempack;
const int outw = top_blob.w;
const int out_elempack = top_blob.elempack;
unsigned short* outptr = top_blob.row<unsigned short>(p / out_elempack);
for (int j = 0; j < outw; j++)
......@@ -762,6 +768,12 @@ static void convolution1d_packed_bf16s(const Mat& bottom_blob, Mat& top_blob, co
{
const int p = remain_outh_start + pp * 4;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inh = bottom_blob.h * elempack;
const int outw = top_blob.w;
const int out_elempack = top_blob.elempack;
unsigned short* outptr = top_blob.row<unsigned short>(p / out_elempack);
for (int j = 0; j < outw; j++)
......@@ -968,6 +980,11 @@ static void convolution1d_packed_bf16s(const Mat& bottom_blob, Mat& top_blob, co
{
const int p = remain_outh_start + pp * 2;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inh = bottom_blob.h * elempack;
const int outw = top_blob.w;
unsigned short* outptr0 = top_blob.row<unsigned short>(p);
unsigned short* outptr1 = top_blob.row<unsigned short>(p + 1);
......
......@@ -474,6 +474,12 @@ static void convolution1d_packed_fp16s(const Mat& bottom_blob, Mat& top_blob, co
{
const int p = remain_outh_start + pp * 8;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inh = bottom_blob.h * elempack;
const int outw = top_blob.w;
const int out_elempack = top_blob.elempack;
__fp16* outptr = top_blob.row<__fp16>(p / out_elempack);
for (int j = 0; j < outw; j++)
......@@ -707,6 +713,12 @@ static void convolution1d_packed_fp16s(const Mat& bottom_blob, Mat& top_blob, co
{
const int p = remain_outh_start + pp * 4;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inh = bottom_blob.h * elempack;
const int outw = top_blob.w;
const int out_elempack = top_blob.elempack;
__fp16* outptr = top_blob.row<__fp16>(p / out_elempack);
for (int j = 0; j < outw; j++)
......@@ -887,6 +899,11 @@ static void convolution1d_packed_fp16s(const Mat& bottom_blob, Mat& top_blob, co
{
const int p = remain_outh_start + pp * 2;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inh = bottom_blob.h * elempack;
const int outw = top_blob.w;
__fp16* outptr0 = top_blob.row<__fp16>(p);
__fp16* outptr1 = top_blob.row<__fp16>(p + 1);
......@@ -1206,6 +1223,12 @@ static void convolution1d_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
{
const int p = remain_outh_start + pp * 8;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inh = bottom_blob.h * elempack;
const int outw = top_blob.w;
const int out_elempack = top_blob.elempack;
__fp16* outptr = top_blob.row<__fp16>(p / out_elempack);
for (int j = 0; j < outw; j++)
......@@ -1388,6 +1411,12 @@ static void convolution1d_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
{
const int p = remain_outh_start + pp * 4;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inh = bottom_blob.h * elempack;
const int outw = top_blob.w;
const int out_elempack = top_blob.elempack;
__fp16* outptr = top_blob.row<__fp16>(p / out_elempack);
for (int j = 0; j < outw; j++)
......@@ -1565,6 +1594,11 @@ static void convolution1d_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
{
const int p = remain_outh_start + pp * 2;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inh = bottom_blob.h * elempack;
const int outw = top_blob.w;
__fp16* outptr0 = top_blob.row<__fp16>(p);
__fp16* outptr1 = top_blob.row<__fp16>(p + 1);
......
......@@ -550,6 +550,13 @@ static void convolution_packed(const Mat& bottom_blob, Mat& top_blob, const Mat&
{
const int p = remain_outch_start + pp * 8;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inch = bottom_blob.c * elempack;
const int outw = top_blob.w;
const int outh = top_blob.h;
const int out_elempack = top_blob.elempack;
float* outptr = top_blob.channel(p / out_elempack);
for (int i = 0; i < outh; i++)
......@@ -768,6 +775,13 @@ static void convolution_packed(const Mat& bottom_blob, Mat& top_blob, const Mat&
{
const int p = remain_outch_start + pp * 4;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inch = bottom_blob.c * elempack;
const int outw = top_blob.w;
const int outh = top_blob.h;
const int out_elempack = top_blob.elempack;
float* outptr = top_blob.channel(p / out_elempack);
for (int i = 0; i < outh; i++)
......@@ -964,6 +978,12 @@ static void convolution_packed(const Mat& bottom_blob, Mat& top_blob, const Mat&
{
const int p = remain_outch_start + pp * 2;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inch = bottom_blob.c * elempack;
const int outw = top_blob.w;
const int outh = top_blob.h;
float* outptr0 = top_blob.channel(p);
float* outptr1 = top_blob.channel(p + 1);
......
......@@ -550,6 +550,13 @@ static void convolution_packed_bf16s(const Mat& bottom_blob, Mat& top_blob, cons
{
const int p = remain_outch_start + pp * 8;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inch = bottom_blob.c * elempack;
const int outw = top_blob.w;
const int outh = top_blob.h;
const int out_elempack = top_blob.elempack;
unsigned short* outptr = top_blob.channel(p / out_elempack);
for (int i = 0; i < outh; i++)
......@@ -787,6 +794,13 @@ static void convolution_packed_bf16s(const Mat& bottom_blob, Mat& top_blob, cons
{
const int p = remain_outch_start + pp * 4;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inch = bottom_blob.c * elempack;
const int outw = top_blob.w;
const int outh = top_blob.h;
const int out_elempack = top_blob.elempack;
unsigned short* outptr = top_blob.channel(p / out_elempack);
for (int i = 0; i < outh; i++)
......@@ -993,6 +1007,12 @@ static void convolution_packed_bf16s(const Mat& bottom_blob, Mat& top_blob, cons
{
const int p = remain_outch_start + pp * 2;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inch = bottom_blob.c * elempack;
const int outw = top_blob.w;
const int outh = top_blob.h;
unsigned short* outptr0 = top_blob.channel(p);
unsigned short* outptr1 = top_blob.channel(p + 1);
......
......@@ -499,6 +499,13 @@ static void convolution_packed_fp16s(const Mat& bottom_blob, Mat& top_blob, cons
{
const int p = remain_outch_start + pp * 8;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inch = bottom_blob.c * elempack;
const int outw = top_blob.w;
const int outh = top_blob.h;
const int out_elempack = top_blob.elempack;
__fp16* outptr = top_blob.channel(p / out_elempack);
for (int i = 0; i < outh; i++)
......@@ -732,6 +739,13 @@ static void convolution_packed_fp16s(const Mat& bottom_blob, Mat& top_blob, cons
{
const int p = remain_outch_start + pp * 4;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inch = bottom_blob.c * elempack;
const int outw = top_blob.w;
const int outh = top_blob.h;
const int out_elempack = top_blob.elempack;
__fp16* outptr = top_blob.channel(p / out_elempack);
for (int i = 0; i < outh; i++)
......@@ -912,6 +926,12 @@ static void convolution_packed_fp16s(const Mat& bottom_blob, Mat& top_blob, cons
{
const int p = remain_outch_start + pp * 2;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inch = bottom_blob.c * elempack;
const int outw = top_blob.w;
const int outh = top_blob.h;
__fp16* outptr0 = top_blob.channel(p);
__fp16* outptr1 = top_blob.channel(p + 1);
......@@ -1254,6 +1274,13 @@ static void convolution_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
{
const int p = remain_outch_start + pp * 8;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inch = bottom_blob.c * elempack;
const int outw = top_blob.w;
const int outh = top_blob.h;
const int out_elempack = top_blob.elempack;
__fp16* outptr = top_blob.channel(p / out_elempack);
for (int i = 0; i < outh; i++)
......@@ -1435,6 +1462,13 @@ static void convolution_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
{
const int p = remain_outch_start + pp * 4;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inch = bottom_blob.c * elempack;
const int outw = top_blob.w;
const int outh = top_blob.h;
const int out_elempack = top_blob.elempack;
__fp16* outptr = top_blob.channel(p / out_elempack);
for (int i = 0; i < outh; i++)
......@@ -1611,6 +1645,12 @@ static void convolution_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
{
const int p = remain_outch_start + pp * 2;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inch = bottom_blob.c * elempack;
const int outw = top_blob.w;
const int outh = top_blob.h;
__fp16* outptr0 = top_blob.channel(p);
__fp16* outptr1 = top_blob.channel(p + 1);
......
......@@ -3839,6 +3839,10 @@ static int gemm_arm(const Mat& A, const Mat& B, const Mat& C, Mat& top_blob, int
{
const int i = ppi * TILE_M;
// shadowed variable for less openmp task args
const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;
const int max_ii = std::min((M - i), TILE_M);
Mat topT_tile;
......@@ -4013,6 +4017,10 @@ static int gemm_BT_arm(const Mat& A, const Mat& BT, const Mat& C, Mat& top_blob,
{
const int i = ppi * TILE_M;
// shadowed variable for less openmp task args
const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;
const int max_ii = std::min((M - i), TILE_M);
Mat topT_tile;
......@@ -4548,6 +4556,10 @@ static int gemm_arm_bf16s(const Mat& A, const Mat& B, const Mat& C, Mat& top_blo
{
const int i = ppi * TILE_M;
// shadowed variable for less openmp task args
const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;
const int max_ii = std::min((M - i), TILE_M);
Mat topT_tile;
......@@ -4724,6 +4736,10 @@ static int gemm_BT_arm_bf16s(const Mat& A, const Mat& BT, const Mat& C, Mat& top
{
const int i = ppi * TILE_M;
// shadowed variable for less openmp task args
const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;
const int max_ii = std::min((M - i), TILE_M);
Mat topT_tile;
......
......@@ -2398,6 +2398,10 @@ static int gemm_arm_fp16sa(const Mat& A, const Mat& B, const Mat& C, Mat& top_bl
{
const int i = ppi * TILE_M;
// shadowed variable for less openmp task args
const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;
const int max_ii = std::min((M - i), TILE_M);
Mat topT_tile;
......@@ -2572,6 +2576,10 @@ static int gemm_BT_arm_fp16sa(const Mat& A, const Mat& BT, const Mat& C, Mat& to
{
const int i = ppi * TILE_M;
// shadowed variable for less openmp task args
const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;
const int max_ii = std::min((M - i), TILE_M);
Mat topT_tile;
......
......@@ -85,6 +85,10 @@ static int gemm_arm_fp16s(const Mat& A, const Mat& B, const Mat& C, Mat& top_blo
{
const int i = ppi * TILE_M;
// shadowed variable for less openmp task args
const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;
const int max_ii = std::min((M - i), TILE_M);
Mat topT_tile;
......@@ -261,6 +265,10 @@ static int gemm_BT_arm_fp16s(const Mat& A, const Mat& BT, const Mat& C, Mat& top
{
const int i = ppi * TILE_M;
// shadowed variable for less openmp task args
const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;
const int max_ii = std::min((M - i), TILE_M);
Mat topT_tile;
......
......@@ -268,6 +268,7 @@ static void pack_B_tile_fp32_to_fp16(const Mat& B, Mat& BT, int j, int max_jj, i
unsigned short* pp = BT;
int jj = 0;
#if __aarch64__
for (; jj + 11 < max_jj; jj += 12)
{
const float* p0 = (const float*)B + (j + jj) * B_hstep + k;
......@@ -358,6 +359,7 @@ static void pack_B_tile_fp32_to_fp16(const Mat& B, Mat& BT, int j, int max_jj, i
pb++;
}
}
#endif // __aarch64__
for (; jj + 7 < max_jj; jj += 8)
{
const float* p0 = (const float*)B + (j + jj) * B_hstep + k;
......@@ -571,6 +573,7 @@ static void transpose_pack_B_tile_fp32_to_fp16(const Mat& B, Mat& BT, int j, int
unsigned short* pp = BT;
int jj = 0;
#if __aarch64__
for (; jj + 11 < max_jj; jj += 12)
{
const float* p0 = (const float*)B + k * B_hstep + (j + jj);
......@@ -585,6 +588,7 @@ static void transpose_pack_B_tile_fp32_to_fp16(const Mat& B, Mat& BT, int j, int
p0 += B_hstep;
}
}
#endif // __aarch64__
for (; jj + 7 < max_jj; jj += 8)
{
const float* p0 = (const float*)B + k * B_hstep + (j + jj);
......@@ -1986,6 +1990,7 @@ static void gemm_transB_packed_tile_fp16s(const Mat& AT_tile, const Mat& BT_tile
}
int jj = 0;
#if __aarch64__
for (; jj + 11 < max_jj; jj += 12)
{
float32x4_t _sum0;
......@@ -2301,6 +2306,7 @@ static void gemm_transB_packed_tile_fp16s(const Mat& AT_tile, const Mat& BT_tile
outptr += 48;
}
#endif // __aarch64__
for (; jj + 7 < max_jj; jj += 8)
{
float32x4_t _sum0;
......@@ -2871,6 +2877,7 @@ static void gemm_transB_packed_tile_fp16s(const Mat& AT_tile, const Mat& BT_tile
}
int jj = 0;
#if __aarch64__
for (; jj + 11 < max_jj; jj += 12)
{
float32x4_t _sum00;
......@@ -3042,6 +3049,7 @@ static void gemm_transB_packed_tile_fp16s(const Mat& AT_tile, const Mat& BT_tile
outptr += 24;
}
#endif // __aarch64__
for (; jj + 7 < max_jj; jj += 8)
{
float32x4_t _sum00;
......@@ -3517,6 +3525,7 @@ static void gemm_transB_packed_tile_fp16s(const Mat& AT_tile, const Mat& BT_tile
}
int jj = 0;
#if __aarch64__
for (; jj + 11 < max_jj; jj += 12)
{
float32x4_t _sum0;
......@@ -3620,6 +3629,7 @@ static void gemm_transB_packed_tile_fp16s(const Mat& AT_tile, const Mat& BT_tile
outptr += 12;
}
#endif // __aarch64__
for (; jj + 7 < max_jj; jj += 8)
{
float32x4_t _sum0;
......
......@@ -174,9 +174,9 @@ int Gemm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
B = B0;
}
int M = A.dims == 3 ? A.c : A.h;
int K = A.w; // assert A.w == B.w
int N = B.dims == 3 ? B.c : B.h;
const int M = A.dims == 3 ? A.c : A.h;
const int K = A.w; // assert A.w == B.w
const int N = B.dims == 3 ? B.c : B.h;
const float* ptrC = 0;
int broadcast_type_C = 0;
......
......@@ -999,6 +999,12 @@ static void convolution1d_packed(const Mat& bottom_blob, Mat& top_blob, const Ma
{
const int p = pp * 16;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inh = bottom_blob.h * elempack;
const int outw = top_blob.w;
const int out_elempack = top_blob.elempack;
float* outptr = top_blob.row(p / out_elempack);
for (int j = 0; j < outw; j++)
......@@ -1423,6 +1429,12 @@ static void convolution1d_packed(const Mat& bottom_blob, Mat& top_blob, const Ma
{
const int p = remain_outh_start + pp * 8;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inh = bottom_blob.h * elempack;
const int outw = top_blob.w;
const int out_elempack = top_blob.elempack;
float* outptr = top_blob.row(p / out_elempack);
for (int j = 0; j < outw; j++)
......@@ -1837,6 +1849,12 @@ static void convolution1d_packed(const Mat& bottom_blob, Mat& top_blob, const Ma
{
const int p = remain_outh_start + pp * 4;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inh = bottom_blob.h * elempack;
const int outw = top_blob.w;
const int out_elempack = top_blob.elempack;
float* outptr = top_blob.row(p / out_elempack);
for (int j = 0; j < outw; j++)
......@@ -2245,6 +2263,11 @@ static void convolution1d_packed(const Mat& bottom_blob, Mat& top_blob, const Ma
{
const int p = remain_outh_start + pp * 2;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inh = bottom_blob.h * elempack;
const int outw = top_blob.w;
float* outptr0 = top_blob.row(p);
float* outptr1 = top_blob.row(p + 1);
......
......@@ -1024,6 +1024,13 @@ static void convolution_packed(const Mat& bottom_blob, Mat& top_blob, const Mat&
{
const int p = pp * 16;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inch = bottom_blob.c * elempack;
const int outw = top_blob.w;
const int outh = top_blob.h;
const int out_elempack = top_blob.elempack;
float* outptr = top_blob.channel(p / out_elempack);
for (int i = 0; i < outh; i++)
......@@ -1460,6 +1467,13 @@ static void convolution_packed(const Mat& bottom_blob, Mat& top_blob, const Mat&
{
const int p = remain_outch_start + pp * 8;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inch = bottom_blob.c * elempack;
const int outw = top_blob.w;
const int outh = top_blob.h;
const int out_elempack = top_blob.elempack;
float* outptr = top_blob.channel(p / out_elempack);
for (int i = 0; i < outh; i++)
......@@ -1886,6 +1900,13 @@ static void convolution_packed(const Mat& bottom_blob, Mat& top_blob, const Mat&
{
const int p = remain_outch_start + pp * 4;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inch = bottom_blob.c * elempack;
const int outw = top_blob.w;
const int outh = top_blob.h;
const int out_elempack = top_blob.elempack;
float* outptr = top_blob.channel(p / out_elempack);
for (int i = 0; i < outh; i++)
......@@ -2306,6 +2327,12 @@ static void convolution_packed(const Mat& bottom_blob, Mat& top_blob, const Mat&
{
const int p = remain_outch_start + pp * 2;
// shadowed variable for less openmp task args
const int elempack = bottom_blob.elempack;
const int inch = bottom_blob.c * elempack;
const int outw = top_blob.w;
const int outh = top_blob.h;
float* outptr0 = top_blob.channel(p);
float* outptr1 = top_blob.channel(p + 1);
......
......@@ -6897,6 +6897,10 @@ static int gemm_x86(const Mat& A, const Mat& B, const Mat& C, Mat& top_blob, int
{
const int i = ppi * TILE_M;
// shadowed variable for less openmp task args
const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;
const int max_ii = std::min((M - i), TILE_M);
Mat topT_tile;
......@@ -7071,6 +7075,10 @@ static int gemm_BT_x86(const Mat& A, const Mat& BT, const Mat& C, Mat& top_blob,
{
const int i = ppi * TILE_M;
// shadowed variable for less openmp task args
const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;
const int max_ii = std::min((M - i), TILE_M);
Mat topT_tile;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册