Unverified commit dabc4c06 authored by nihui, committed by GitHub

arm convolution winograd unified elempack (#4556)

* update f43 coeffs

* arm convolution winograd unified elempack

* disable bf16s test atm

* test gnu inline asm off
Parent 6f08ec73
......@@ -244,6 +244,13 @@ jobs:
linux-gcc-arm:
name: linux-gcc-arm
strategy:
matrix:
# openmp: ['OFF', 'ON']
include:
- { GNU_INLINE_ASM: 'ON'}
- { GNU_INLINE_ASM: 'OFF'}
runs-on:
pool-name: docker
container:
......@@ -300,7 +307,7 @@ jobs:
- name: build
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabi.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_VFPV4=ON -DNCNN_ARM82=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabi.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_GNU_INLINE_ASM=${{matrix.GNU_INLINE_ASM}} -DNCNN_VFPV4=ON -DNCNN_ARM82=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
cmake --build . -j $(nproc)
- name: test
run: |
......@@ -318,7 +325,7 @@ jobs:
- name: build-armhf-vfpv3-d16
run: |
mkdir build-armhf-vfpv3-d16 && cd build-armhf-vfpv3-d16
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf-vfpv3-d16.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_VFPV4=OFF -DNCNN_ARM82=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf-vfpv3-d16.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_GNU_INLINE_ASM=${{matrix.GNU_INLINE_ASM}} -DNCNN_VFPV4=OFF -DNCNN_ARM82=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
cmake --build . -j $(nproc)
- name: test-armhf-vfpv3-d16
run: |
......@@ -344,10 +351,11 @@ jobs:
matrix:
# openmp: ['OFF', 'ON']
include:
- { ARM82: 'OFF', ARM82DOT: 'OFF', ARM82FP16FML: 'OFF', ARM84BF16: 'OFF', ARM84I8MM: 'OFF', ARM86SVE: 'OFF'}
- { ARM82: 'ON', ARM82DOT: 'OFF', ARM82FP16FML: 'OFF', ARM84BF16: 'OFF', ARM84I8MM: 'OFF', ARM86SVE: 'OFF'}
- { ARM82: 'ON', ARM82DOT: 'ON', ARM82FP16FML: 'ON', ARM84BF16: 'OFF', ARM84I8MM: 'OFF', ARM86SVE: 'OFF'}
- { ARM82: 'ON', ARM82DOT: 'ON', ARM82FP16FML: 'ON', ARM84BF16: 'ON', ARM84I8MM: 'ON', ARM86SVE: 'OFF'}
- { GNU_INLINE_ASM: 'ON', ARM82: 'OFF', ARM82DOT: 'OFF', ARM82FP16FML: 'OFF', ARM84BF16: 'OFF', ARM84I8MM: 'OFF', ARM86SVE: 'OFF'}
- { GNU_INLINE_ASM: 'ON', ARM82: 'ON', ARM82DOT: 'OFF', ARM82FP16FML: 'OFF', ARM84BF16: 'OFF', ARM84I8MM: 'OFF', ARM86SVE: 'OFF'}
- { GNU_INLINE_ASM: 'ON', ARM82: 'ON', ARM82DOT: 'ON', ARM82FP16FML: 'ON', ARM84BF16: 'OFF', ARM84I8MM: 'OFF', ARM86SVE: 'OFF'}
- { GNU_INLINE_ASM: 'ON', ARM82: 'ON', ARM82DOT: 'ON', ARM82FP16FML: 'ON', ARM84BF16: 'ON', ARM84I8MM: 'ON', ARM86SVE: 'OFF'}
- { GNU_INLINE_ASM: 'OFF', ARM82: 'ON', ARM82DOT: 'ON', ARM82FP16FML: 'ON', ARM84BF16: 'ON', ARM84I8MM: 'ON', ARM86SVE: 'OFF'}
runs-on:
pool-name: docker
......@@ -407,6 +415,7 @@ jobs:
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake \
-DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON \
-DNCNN_GNU_INLINE_ASM=${{matrix.GNU_INLINE_ASM}} \
-DNCNN_ARM82=${{matrix.ARM82}} \
-DNCNN_ARM82DOT=${{matrix.ARM82DOT}} \
-DNCNN_ARM82FP16FML=${{matrix.ARM82FP16FML}} \
......
This diff has been collapsed.
......@@ -12,607 +12,6 @@
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
static void conv3x3s1_winograd63_transform_kernel_pack4_neon(const Mat& kernel, Mat& kernel_tm_pack4, int inch, int outch, const Option& opt)
{
// winograd63 transform kernel
Mat kernel_tm;
kernel_tm.create(8 * 8, inch, outch);
const float ktm[8][3] = {
{1.0f, 0.0f, 0.0f},
{-2.0f / 9, -2.0f / 9, -2.0f / 9},
{-2.0f / 9, 2.0f / 9, -2.0f / 9},
{1.0f / 90, 1.0f / 45, 2.0f / 45},
{1.0f / 90, -1.0f / 45, 2.0f / 45},
{1.0f / 45, 1.0f / 90, 1.0f / 180},
{1.0f / 45, -1.0f / 90, 1.0f / 180},
{0.0f, 0.0f, 1.0f}
};
#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
for (int q = 0; q < inch; q++)
{
const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9;
float* kernel_tm0 = kernel_tm.channel(p).row(q);
// transform kernel, transposed
const float* k0 = kernel0;
const float* k1 = kernel0 + 3;
const float* k2 = kernel0 + 6;
// h
float tmp[8][3];
for (int i = 0; i < 8; i++)
{
tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
}
// v
for (int j = 0; j < 8; j++)
{
float* tmpp = &tmp[j][0];
for (int i = 0; i < 8; i++)
{
kernel_tm0[j * 8 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
}
}
}
}
// interleave
// src = 64-inch-outch
// dst = 4b-4a-inch/4a-64-outch/4b;
#if __aarch64__
kernel_tm_pack4.create(2 * inch / 4, 64, (outch / 4) / 2 + (outch / 4) % 2, (size_t)4u * 16, 16);
#else
kernel_tm_pack4.create(inch / 4, 64, outch / 4, (size_t)4u * 16, 16);
#endif
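    // (On __aarch64__ eight output channels share one packed channel, i.e. two pack4
    //  groups side by side, hence the doubled row width 2 * inch / 4 and the rounded-up
    //  channel count (outch / 4) / 2 + (outch / 4) % 2 above.)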
int q = 0;
#if __aarch64__
for (; q + 7 < outch; q += 8)
{
const Mat k0 = kernel_tm.channel(q);
const Mat k1 = kernel_tm.channel(q + 1);
const Mat k2 = kernel_tm.channel(q + 2);
const Mat k3 = kernel_tm.channel(q + 3);
const Mat k4 = kernel_tm.channel(q + 4);
const Mat k5 = kernel_tm.channel(q + 5);
const Mat k6 = kernel_tm.channel(q + 6);
const Mat k7 = kernel_tm.channel(q + 7);
Mat g0 = kernel_tm_pack4.channel(q / 8);
for (int k = 0; k < 64; k++)
{
float* g00 = g0.row(k);
for (int p = 0; p + 3 < inch; p += 4)
{
for (int i = 0; i < 4; i++)
{
const float* k00 = k0.row(p + i);
const float* k10 = k1.row(p + i);
const float* k20 = k2.row(p + i);
const float* k30 = k3.row(p + i);
const float* k40 = k4.row(p + i);
const float* k50 = k5.row(p + i);
const float* k60 = k6.row(p + i);
const float* k70 = k7.row(p + i);
g00[0] = k00[k];
g00[1] = k10[k];
g00[2] = k20[k];
g00[3] = k30[k];
g00[4] = k40[k];
g00[5] = k50[k];
g00[6] = k60[k];
g00[7] = k70[k];
g00 += 8;
}
}
}
}
#endif // __aarch64__
for (; q + 3 < outch; q += 4)
{
const Mat k0 = kernel_tm.channel(q);
const Mat k1 = kernel_tm.channel(q + 1);
const Mat k2 = kernel_tm.channel(q + 2);
const Mat k3 = kernel_tm.channel(q + 3);
#if __aarch64__
Mat g0 = kernel_tm_pack4.channel(q / 8 + (q % 8) / 4);
#else
Mat g0 = kernel_tm_pack4.channel(q / 4);
#endif
for (int k = 0; k < 64; k++)
{
float* g00 = g0.row(k);
for (int p = 0; p + 3 < inch; p += 4)
{
for (int i = 0; i < 4; i++)
{
const float* k00 = k0.row(p + i);
const float* k10 = k1.row(p + i);
const float* k20 = k2.row(p + i);
const float* k30 = k3.row(p + i);
g00[0] = k00[k];
g00[1] = k10[k];
g00[2] = k20[k];
g00[3] = k30[k];
g00 += 4;
}
}
}
}
}
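// The two scalar loops above are the standard Winograd kernel transform
// U = G * g * G^T for F(6x6, 3x3), where G is the 8x3 ktm table and the result is
// stored transposed (kernel_tm0[j * 8 + i] = U[i][j], matching the "transposed" note).
// A minimal standalone reference of the same arithmetic, for illustration only
// (this helper is hypothetical and not part of the original source):
static void winograd63_kernel_transform_ref(const float g[3][3], const float G[8][3], float U[8][8])
{
    // Gg = G * g, an 8x3 intermediate
    float Gg[8][3];
    for (int i = 0; i < 8; i++)
    {
        for (int j = 0; j < 3; j++)
        {
            Gg[i][j] = G[i][0] * g[0][j] + G[i][1] * g[1][j] + G[i][2] * g[2][j];
        }
    }
    // U = Gg * G^T, the 8x8 transformed kernel
    for (int i = 0; i < 8; i++)
    {
        for (int j = 0; j < 8; j++)
        {
            U[i][j] = Gg[i][0] * G[j][0] + Gg[i][1] * G[j][1] + Gg[i][2] * G[j][2];
        }
    }
}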
static void conv3x3s1_winograd63_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int inch = bottom_blob.c;
int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;
// pad to 6n+2
Mat bottom_blob_bordered = bottom_blob;
outw = (outw + 5) / 6 * 6;
outh = (outh + 5) / 6 * 6;
w = outw + 2;
h = outh + 2;
copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);
// BEGIN transform input
Mat bottom_blob_tm;
{
int w_tiles = outw / 6;
int h_tiles = outh / 6;
int tiles = w_tiles * h_tiles;
bottom_blob_tm.create(tiles, 64, inch, 16u, 4, opt.workspace_allocator);
conv3x3s1_winograd63_transform_input_pack4_neon(bottom_blob_bordered, bottom_blob_tm, opt);
}
bottom_blob_bordered = Mat();
// END transform input
// BEGIN dot
Mat top_blob_tm;
convolution_winograd_dot_pack4_neon(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt);
// END dot
// BEGIN transform output
Mat top_blob_bordered;
if (outw == top_blob.w && outh == top_blob.h)
{
top_blob_bordered = top_blob;
}
else
{
top_blob_bordered.create(outw, outh, outch, 16u, 4, opt.workspace_allocator);
}
{
conv3x3s1_winograd63_transform_output_pack4_neon(top_blob_tm, top_blob_bordered, bias, opt);
}
// END transform output
// cut result pad
copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
}
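// The "pad to 6n+2" bookkeeping above follows from the F(6x6, 3x3) tiling: every
// output tile is 6x6 and is computed from an 8x8 input window, and adjacent windows
// overlap by 2 pixels. A small sketch of the same size arithmetic, for illustration
// only (this helper is hypothetical and not part of the original source):
static void winograd63_tile_sizes(int outw, int outh, int& padded_w, int& padded_h, int& tiles)
{
    const int w_tiles = (outw + 5) / 6; // e.g. outw = 38 -> 7 tiles along the width
    const int h_tiles = (outh + 5) / 6;
    padded_w = w_tiles * 6 + 2;         // e.g. 7 * 6 + 2 = 44 padded input columns
    padded_h = h_tiles * 6 + 2;
    tiles = w_tiles * h_tiles;          // the w dimension of bottom_blob_tm above
}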
static void conv3x3s1_winograd43_transform_kernel_pack4_neon(const Mat& kernel, Mat& kernel_tm_pack4, int inch, int outch, const Option& opt)
{
// winograd43 transform kernel
Mat kernel_tm(6 * 6, inch, outch);
const float ktm[6][3] = {
{1.0f / 4, 0.0f, 0.0f},
{-1.0f / 6, -1.0f / 6, -1.0f / 6},
{-1.0f / 6, 1.0f / 6, -1.0f / 6},
{1.0f / 24, 1.0f / 12, 1.0f / 6},
{1.0f / 24, -1.0f / 12, 1.0f / 6},
{0.0f, 0.0f, 1.0f}
};
#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
for (int q = 0; q < inch; q++)
{
const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9;
float* kernel_tm0 = kernel_tm.channel(p).row(q);
// transform kernel
const float* k0 = kernel0;
const float* k1 = kernel0 + 3;
const float* k2 = kernel0 + 6;
// h
float tmp[6][3];
for (int i = 0; i < 6; i++)
{
tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
}
// U
for (int j = 0; j < 6; j++)
{
float* tmpp = &tmp[j][0];
for (int i = 0; i < 6; i++)
{
kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
}
}
}
}
// interleave
// src = 36-inch-outch
// dst = 4b-4a-inch/4a-36-outch/4b;
#if __aarch64__
kernel_tm_pack4.create(2 * inch / 4, 36, (outch / 4) / 2 + (outch / 4) % 2, (size_t)4u * 16, 16);
#else
kernel_tm_pack4.create(inch / 4, 36, outch / 4, (size_t)4u * 16, 16);
#endif
int q = 0;
#if __aarch64__
for (; q + 7 < outch; q += 8)
{
const Mat k0 = kernel_tm.channel(q);
const Mat k1 = kernel_tm.channel(q + 1);
const Mat k2 = kernel_tm.channel(q + 2);
const Mat k3 = kernel_tm.channel(q + 3);
const Mat k4 = kernel_tm.channel(q + 4);
const Mat k5 = kernel_tm.channel(q + 5);
const Mat k6 = kernel_tm.channel(q + 6);
const Mat k7 = kernel_tm.channel(q + 7);
Mat g0 = kernel_tm_pack4.channel(q / 8);
for (int k = 0; k < 36; k++)
{
float* g00 = g0.row(k);
for (int p = 0; p + 3 < inch; p += 4)
{
for (int i = 0; i < 4; i++)
{
const float* k00 = k0.row(p + i);
const float* k10 = k1.row(p + i);
const float* k20 = k2.row(p + i);
const float* k30 = k3.row(p + i);
const float* k40 = k4.row(p + i);
const float* k50 = k5.row(p + i);
const float* k60 = k6.row(p + i);
const float* k70 = k7.row(p + i);
g00[0] = k00[k];
g00[1] = k10[k];
g00[2] = k20[k];
g00[3] = k30[k];
g00[4] = k40[k];
g00[5] = k50[k];
g00[6] = k60[k];
g00[7] = k70[k];
g00 += 8;
}
}
}
}
#endif // __aarch64__
for (; q + 3 < outch; q += 4)
{
const Mat k0 = kernel_tm.channel(q);
const Mat k1 = kernel_tm.channel(q + 1);
const Mat k2 = kernel_tm.channel(q + 2);
const Mat k3 = kernel_tm.channel(q + 3);
#if __aarch64__
Mat g0 = kernel_tm_pack4.channel(q / 8 + (q % 8) / 4);
#else
Mat g0 = kernel_tm_pack4.channel(q / 4);
#endif
for (int k = 0; k < 36; k++)
{
float* g00 = g0.row(k);
for (int p = 0; p + 3 < inch; p += 4)
{
for (int i = 0; i < 4; i++)
{
const float* k00 = k0.row(p + i);
const float* k10 = k1.row(p + i);
const float* k20 = k2.row(p + i);
const float* k30 = k3.row(p + i);
g00[0] = k00[k];
g00[1] = k10[k];
g00[2] = k20[k];
g00[3] = k30[k];
g00 += 4;
}
}
}
}
}
static void conv3x3s1_winograd43_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int inch = bottom_blob.c;
int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;
// pad to 4n+2
Mat bottom_blob_bordered = bottom_blob;
outw = (outw + 3) / 4 * 4;
outh = (outh + 3) / 4 * 4;
w = outw + 2;
h = outh + 2;
copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);
// BEGIN transform input
Mat bottom_blob_tm;
{
int w_tiles = outw / 4;
int h_tiles = outh / 4;
int tiles = w_tiles * h_tiles;
bottom_blob_tm.create(tiles, 36, inch, 16u, 4, opt.workspace_allocator);
conv3x3s1_winograd43_transform_input_pack4_neon(bottom_blob_bordered, bottom_blob_tm, opt);
}
bottom_blob_bordered = Mat();
// END transform input
// BEGIN dot
Mat top_blob_tm;
convolution_winograd_dot_pack4_neon(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt);
// END dot
// BEGIN transform output
Mat top_blob_bordered;
if (outw == top_blob.w && outh == top_blob.h)
{
top_blob_bordered = top_blob;
}
else
{
top_blob_bordered.create(outw, outh, outch, 16u, 4, opt.workspace_allocator);
}
{
conv3x3s1_winograd43_transform_output_pack4_neon(top_blob_tm, top_blob_bordered, bias, opt);
}
// END transform output
// cut result pad
copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
}
static void conv3x3s1_winograd23_transform_kernel_pack4_neon(const Mat& kernel, Mat& kernel_tm_pack4, int inch, int outch, const Option& opt)
{
// winograd23 transform kernel
Mat kernel_tm(4 * 4, inch, outch);
const float ktm[4][3] = {
{1.0f, 0.0f, 0.0f},
{1.0f / 2, 1.0f / 2, 1.0f / 2},
{1.0f / 2, -1.0f / 2, 1.0f / 2},
{0.0f, 0.0f, 1.0f}
};
#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
for (int q = 0; q < inch; q++)
{
const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9;
float* kernel_tm0 = kernel_tm.channel(p).row(q);
// transform kernel
const float* k0 = kernel0;
const float* k1 = kernel0 + 3;
const float* k2 = kernel0 + 6;
// h
float tmp[4][3];
for (int i = 0; i < 4; i++)
{
tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
}
// U
for (int j = 0; j < 4; j++)
{
float* tmpp = &tmp[j][0];
for (int i = 0; i < 4; i++)
{
kernel_tm0[j * 4 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
}
}
}
}
// interleave
// src = 16-inch-outch
// dst = 4b-4a-inch/4a-16-outch/4b;
#if __aarch64__
kernel_tm_pack4.create(2 * inch / 4, 16, (outch / 4) / 2 + (outch / 4) % 2, (size_t)4u * 16, 16);
#else
kernel_tm_pack4.create(inch / 4, 16, outch / 4, (size_t)4u * 16, 16);
#endif
int q = 0;
#if __aarch64__
for (; q + 7 < outch; q += 8)
{
const Mat k0 = kernel_tm.channel(q);
const Mat k1 = kernel_tm.channel(q + 1);
const Mat k2 = kernel_tm.channel(q + 2);
const Mat k3 = kernel_tm.channel(q + 3);
const Mat k4 = kernel_tm.channel(q + 4);
const Mat k5 = kernel_tm.channel(q + 5);
const Mat k6 = kernel_tm.channel(q + 6);
const Mat k7 = kernel_tm.channel(q + 7);
Mat g0 = kernel_tm_pack4.channel(q / 8);
for (int k = 0; k < 16; k++)
{
float* g00 = g0.row(k);
for (int p = 0; p + 3 < inch; p += 4)
{
for (int i = 0; i < 4; i++)
{
const float* k00 = k0.row(p + i);
const float* k10 = k1.row(p + i);
const float* k20 = k2.row(p + i);
const float* k30 = k3.row(p + i);
const float* k40 = k4.row(p + i);
const float* k50 = k5.row(p + i);
const float* k60 = k6.row(p + i);
const float* k70 = k7.row(p + i);
g00[0] = k00[k];
g00[1] = k10[k];
g00[2] = k20[k];
g00[3] = k30[k];
g00[4] = k40[k];
g00[5] = k50[k];
g00[6] = k60[k];
g00[7] = k70[k];
g00 += 8;
}
}
}
}
#endif // __aarch64__
for (; q + 3 < outch; q += 4)
{
const Mat k0 = kernel_tm.channel(q);
const Mat k1 = kernel_tm.channel(q + 1);
const Mat k2 = kernel_tm.channel(q + 2);
const Mat k3 = kernel_tm.channel(q + 3);
#if __aarch64__
Mat g0 = kernel_tm_pack4.channel(q / 8 + (q % 8) / 4);
#else
Mat g0 = kernel_tm_pack4.channel(q / 4);
#endif
for (int k = 0; k < 16; k++)
{
float* g00 = g0.row(k);
for (int p = 0; p + 3 < inch; p += 4)
{
for (int i = 0; i < 4; i++)
{
const float* k00 = k0.row(p + i);
const float* k10 = k1.row(p + i);
const float* k20 = k2.row(p + i);
const float* k30 = k3.row(p + i);
g00[0] = k00[k];
g00[1] = k10[k];
g00[2] = k20[k];
g00[3] = k30[k];
g00 += 4;
}
}
}
}
}
static void conv3x3s1_winograd23_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int inch = bottom_blob.c;
int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;
// pad to 2n+2
Mat bottom_blob_bordered = bottom_blob;
outw = (outw + 1) / 2 * 2;
outh = (outh + 1) / 2 * 2;
w = outw + 2;
h = outh + 2;
copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);
// BEGIN transform input
Mat bottom_blob_tm;
{
int w_tiles = outw / 2;
int h_tiles = outh / 2;
int tiles = w_tiles * h_tiles;
bottom_blob_tm.create(tiles, 16, inch, 16u, 4, opt.workspace_allocator);
conv3x3s1_winograd23_transform_input_pack4_neon(bottom_blob_bordered, bottom_blob_tm, opt);
}
bottom_blob_bordered = Mat();
// END transform input
// BEGIN dot
Mat top_blob_tm;
convolution_winograd_dot_pack4_neon(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt);
// END dot
// BEGIN transform output
Mat top_blob_bordered;
if (outw == top_blob.w && outh == top_blob.h)
{
top_blob_bordered = top_blob;
}
else
{
top_blob_bordered.create(outw, outh, outch, 16u, 4, opt.workspace_allocator);
}
{
conv3x3s1_winograd23_transform_output_pack4_neon(top_blob_tm, top_blob_bordered, bias, opt);
}
// END transform output
// cut result pad
copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
}
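// The three tile sizes trade transform overhead and numerical accuracy against
// multiply count. Per output tile of a 3x3 convolution, direct evaluation needs
// m * m * 9 multiplies while Winograd F(mxm, 3x3) needs (m + 2) * (m + 2):
//   F(2x2,3x3): 16 vs 36   (~2.25x fewer multiplies)
//   F(4x4,3x3): 36 vs 144  (4x fewer)
//   F(6x6,3x3): 64 vs 324  (~5.06x fewer, but the least accurate numerically, which is
//                one reason the larger tiles are gated on channel count elsewhere in this change)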
static void conv3x3s2_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
int w = bottom_blob.w;
......
......@@ -12,183 +12,6 @@
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
static void conv3x3s1_winograd63_pack4_bf16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int inch = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;
int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;
// pad to 6n+2
Mat bottom_blob_bordered = bottom_blob;
outw = (outw + 5) / 6 * 6;
outh = (outh + 5) / 6 * 6;
w = outw + 2;
h = outh + 2;
copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);
// BEGIN transform input
Mat bottom_blob_tm;
{
int w_tiles = outw / 6;
int h_tiles = outh / 6;
const int tiles = w_tiles * h_tiles;
bottom_blob_tm.create(tiles, 64, inch, 16u, elempack, opt.workspace_allocator);
conv3x3s1_winograd63_transform_input_pack4_bf16s_neon(bottom_blob_bordered, bottom_blob_tm, opt);
}
bottom_blob_bordered = Mat();
// END transform input
// BEGIN dot
Mat top_blob_tm;
convolution_winograd_dot_pack4_neon(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt);
// END dot
// BEGIN transform output
Mat top_blob_bordered;
if (outw == top_blob.w && outh == top_blob.h)
{
top_blob_bordered = top_blob;
}
else
{
top_blob_bordered.create(outw, outh, outch, elemsize, elempack, opt.workspace_allocator);
}
{
conv3x3s1_winograd63_transform_output_pack4_bf16s_neon(top_blob_tm, top_blob_bordered, bias, opt);
}
// END transform output
// cut result pad
copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
}
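// Note on the bf16s variants: the input transform widens bf16 activations to fp32
// (bottom_blob_tm is created with elemsize 16u at elempack 4, i.e. 4 bytes per
// element), the dot product then reuses the fp32 convolution_winograd_dot_pack4_neon,
// and only the output transform narrows results back to bf16 storage.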
static void conv3x3s1_winograd43_pack4_bf16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int inch = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;
int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;
// pad to 4n+2
Mat bottom_blob_bordered = bottom_blob;
outw = (outw + 3) / 4 * 4;
outh = (outh + 3) / 4 * 4;
w = outw + 2;
h = outh + 2;
copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);
// BEGIN transform input
Mat bottom_blob_tm;
{
int w_tiles = outw / 4;
int h_tiles = outh / 4;
const int tiles = w_tiles * h_tiles;
bottom_blob_tm.create(tiles, 36, inch, 16u, elempack, opt.workspace_allocator);
conv3x3s1_winograd43_transform_input_pack4_bf16s_neon(bottom_blob_bordered, bottom_blob_tm, opt);
}
bottom_blob_bordered = Mat();
// END transform input
// BEGIN dot
Mat top_blob_tm;
convolution_winograd_dot_pack4_neon(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt);
// END dot
// BEGIN transform output
Mat top_blob_bordered;
if (outw == top_blob.w && outh == top_blob.h)
{
top_blob_bordered = top_blob;
}
else
{
top_blob_bordered.create(outw, outh, outch, elemsize, elempack, opt.workspace_allocator);
}
{
conv3x3s1_winograd43_transform_output_pack4_bf16s_neon(top_blob_tm, top_blob_bordered, bias, opt);
}
// END transform output
// cut result pad
copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
}
static void conv3x3s1_winograd23_pack4_bf16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int inch = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;
int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;
// pad to 2n+2
Mat bottom_blob_bordered = bottom_blob;
outw = (outw + 1) / 2 * 2;
outh = (outh + 1) / 2 * 2;
w = outw + 2;
h = outh + 2;
copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);
// BEGIN transform input
Mat bottom_blob_tm;
{
int w_tiles = outw / 2;
int h_tiles = outh / 2;
const int tiles = w_tiles * h_tiles;
bottom_blob_tm.create(tiles, 16, inch, 16u, elempack, opt.workspace_allocator);
conv3x3s1_winograd23_transform_input_pack4_bf16s_neon(bottom_blob_bordered, bottom_blob_tm, opt);
}
bottom_blob_bordered = Mat();
// END transform input
// BEGIN dot
Mat top_blob_tm;
convolution_winograd_dot_pack4_neon(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt);
// END dot
// BEGIN transform output
Mat top_blob_bordered;
if (outw == top_blob.w && outh == top_blob.h)
{
top_blob_bordered = top_blob;
}
else
{
top_blob_bordered.create(outw, outh, outch, elemsize, elempack, opt.workspace_allocator);
}
{
conv3x3s1_winograd23_transform_output_pack4_bf16s_neon(top_blob_tm, top_blob_bordered, bias, opt);
}
// END transform output
// cut result pad
copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
}
static void conv3x3s2_pack4_bf16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
int w = bottom_blob.w;
......
......@@ -12,583 +12,6 @@
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
static void conv3x3s1_winograd63_transform_kernel_pack4_fp16sa_neon(const Mat& kernel, Mat& kernel_tm_pack4, int inch, int outch, const Option& opt)
{
// winograd63 transform kernel
Mat kernel_tm;
kernel_tm.create(8 * 8, inch, outch);
const float ktm[8][3] = {
{1.0f, 0.0f, 0.0f},
{-2.0f / 9, -2.0f / 9, -2.0f / 9},
{-2.0f / 9, 2.0f / 9, -2.0f / 9},
{1.0f / 90, 1.0f / 45, 2.0f / 45},
{1.0f / 90, -1.0f / 45, 2.0f / 45},
{1.0f / 45, 1.0f / 90, 1.0f / 180},
{1.0f / 45, -1.0f / 90, 1.0f / 180},
{0.0f, 0.0f, 1.0f}
};
#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
for (int q = 0; q < inch; q++)
{
const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9;
float* kernel_tm0 = kernel_tm.channel(p).row(q);
// transform kernel, transposed
const float* k0 = kernel0;
const float* k1 = kernel0 + 3;
const float* k2 = kernel0 + 6;
// h
float tmp[8][3];
for (int i = 0; i < 8; i++)
{
tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
}
// v
for (int j = 0; j < 8; j++)
{
float* tmpp = &tmp[j][0];
for (int i = 0; i < 8; i++)
{
kernel_tm0[j * 8 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
}
}
}
}
// interleave
// src = 64-inch-outch
// dst = 4b-4a-inch/4a-64-outch/4b;
kernel_tm_pack4.create(2 * inch / 4, 64, (outch / 4) / 2 + (outch / 4) % 2, (size_t)2u * 16, 16);
int q = 0;
for (; q + 7 < outch; q += 8)
{
const Mat k0 = kernel_tm.channel(q);
const Mat k1 = kernel_tm.channel(q + 1);
const Mat k2 = kernel_tm.channel(q + 2);
const Mat k3 = kernel_tm.channel(q + 3);
const Mat k4 = kernel_tm.channel(q + 4);
const Mat k5 = kernel_tm.channel(q + 5);
const Mat k6 = kernel_tm.channel(q + 6);
const Mat k7 = kernel_tm.channel(q + 7);
Mat g0 = kernel_tm_pack4.channel(q / 8);
for (int k = 0; k < 64; k++)
{
__fp16* g00 = g0.row<__fp16>(k);
for (int p = 0; p + 3 < inch; p += 4)
{
for (int i = 0; i < 4; i++)
{
const float* k00 = k0.row(p + i);
const float* k10 = k1.row(p + i);
const float* k20 = k2.row(p + i);
const float* k30 = k3.row(p + i);
const float* k40 = k4.row(p + i);
const float* k50 = k5.row(p + i);
const float* k60 = k6.row(p + i);
const float* k70 = k7.row(p + i);
g00[0] = (__fp16)k00[k];
g00[1] = (__fp16)k10[k];
g00[2] = (__fp16)k20[k];
g00[3] = (__fp16)k30[k];
g00[4] = (__fp16)k40[k];
g00[5] = (__fp16)k50[k];
g00[6] = (__fp16)k60[k];
g00[7] = (__fp16)k70[k];
g00 += 8;
}
}
}
}
for (; q + 3 < outch; q += 4)
{
const Mat k0 = kernel_tm.channel(q);
const Mat k1 = kernel_tm.channel(q + 1);
const Mat k2 = kernel_tm.channel(q + 2);
const Mat k3 = kernel_tm.channel(q + 3);
Mat g0 = kernel_tm_pack4.channel(q / 8 + (q % 8) / 4);
for (int k = 0; k < 64; k++)
{
__fp16* g00 = g0.row<__fp16>(k);
for (int p = 0; p + 3 < inch; p += 4)
{
for (int i = 0; i < 4; i++)
{
const float* k00 = k0.row(p + i);
const float* k10 = k1.row(p + i);
const float* k20 = k2.row(p + i);
const float* k30 = k3.row(p + i);
g00[0] = (__fp16)k00[k];
g00[1] = (__fp16)k10[k];
g00[2] = (__fp16)k20[k];
g00[3] = (__fp16)k30[k];
g00 += 4;
}
}
}
}
}
static void conv3x3s1_winograd63_pack4_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int inch = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;
int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;
// pad to 6n+2
Mat bottom_blob_bordered = bottom_blob;
outw = (outw + 5) / 6 * 6;
outh = (outh + 5) / 6 * 6;
w = outw + 2;
h = outh + 2;
copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);
// BEGIN transform input
Mat bottom_blob_tm;
{
int w_tiles = outw / 6;
int h_tiles = outh / 6;
const int tiles = w_tiles * h_tiles;
bottom_blob_tm.create(tiles, 64, inch, elemsize, elempack, opt.workspace_allocator);
conv3x3s1_winograd63_transform_input_pack4_fp16sa_neon(bottom_blob_bordered, bottom_blob_tm, opt);
}
bottom_blob_bordered = Mat();
// END transform input
// BEGIN dot
Mat top_blob_tm;
convolution_winograd_dot_pack4_fp16sa_neon(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt);
// END dot
// BEGIN transform output
Mat top_blob_bordered;
if (outw == top_blob.w && outh == top_blob.h)
{
top_blob_bordered = top_blob;
}
else
{
top_blob_bordered.create(outw, outh, outch, 2u * 4, 4, opt.workspace_allocator);
}
{
conv3x3s1_winograd63_transform_output_pack4_fp16sa_neon(top_blob_tm, top_blob_bordered, bias, opt);
}
// END transform output
// cut result pad
copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
}
static void conv3x3s1_winograd43_transform_kernel_pack4_fp16sa_neon(const Mat& kernel, Mat& kernel_tm_pack4, int inch, int outch, const Option& opt)
{
// winograd43 transform kernel
Mat kernel_tm(6 * 6, inch, outch);
const float ktm[6][3] = {
{1.0f / 4, 0.0f, 0.0f},
{-1.0f / 6, -1.0f / 6, -1.0f / 6},
{-1.0f / 6, 1.0f / 6, -1.0f / 6},
{1.0f / 24, 1.0f / 12, 1.0f / 6},
{1.0f / 24, -1.0f / 12, 1.0f / 6},
{0.0f, 0.0f, 1.0f}
};
#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
for (int q = 0; q < inch; q++)
{
const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9;
float* kernel_tm0 = kernel_tm.channel(p).row(q);
// transform kernel
const float* k0 = kernel0;
const float* k1 = kernel0 + 3;
const float* k2 = kernel0 + 6;
// h
float tmp[6][3];
for (int i = 0; i < 6; i++)
{
tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
}
// U
for (int j = 0; j < 6; j++)
{
float* tmpp = &tmp[j][0];
for (int i = 0; i < 6; i++)
{
kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
}
}
}
}
// interleave
// src = 36-inch-outch
// dst = 4b-4a-inch/4a-36-outch/4b;
kernel_tm_pack4.create(2 * inch / 4, 36, (outch / 4) / 2 + (outch / 4) % 2, (size_t)2u * 16, 16);
int q = 0;
for (; q + 7 < outch; q += 8)
{
const Mat k0 = kernel_tm.channel(q);
const Mat k1 = kernel_tm.channel(q + 1);
const Mat k2 = kernel_tm.channel(q + 2);
const Mat k3 = kernel_tm.channel(q + 3);
const Mat k4 = kernel_tm.channel(q + 4);
const Mat k5 = kernel_tm.channel(q + 5);
const Mat k6 = kernel_tm.channel(q + 6);
const Mat k7 = kernel_tm.channel(q + 7);
Mat g0 = kernel_tm_pack4.channel(q / 8);
for (int k = 0; k < 36; k++)
{
__fp16* g00 = g0.row<__fp16>(k);
for (int p = 0; p + 3 < inch; p += 4)
{
for (int i = 0; i < 4; i++)
{
const float* k00 = k0.row(p + i);
const float* k10 = k1.row(p + i);
const float* k20 = k2.row(p + i);
const float* k30 = k3.row(p + i);
const float* k40 = k4.row(p + i);
const float* k50 = k5.row(p + i);
const float* k60 = k6.row(p + i);
const float* k70 = k7.row(p + i);
g00[0] = (__fp16)k00[k];
g00[1] = (__fp16)k10[k];
g00[2] = (__fp16)k20[k];
g00[3] = (__fp16)k30[k];
g00[4] = (__fp16)k40[k];
g00[5] = (__fp16)k50[k];
g00[6] = (__fp16)k60[k];
g00[7] = (__fp16)k70[k];
g00 += 8;
}
}
}
}
for (; q + 3 < outch; q += 4)
{
const Mat k0 = kernel_tm.channel(q);
const Mat k1 = kernel_tm.channel(q + 1);
const Mat k2 = kernel_tm.channel(q + 2);
const Mat k3 = kernel_tm.channel(q + 3);
Mat g0 = kernel_tm_pack4.channel(q / 8 + (q % 8) / 4);
for (int k = 0; k < 36; k++)
{
__fp16* g00 = g0.row<__fp16>(k);
for (int p = 0; p + 3 < inch; p += 4)
{
for (int i = 0; i < 4; i++)
{
const float* k00 = k0.row(p + i);
const float* k10 = k1.row(p + i);
const float* k20 = k2.row(p + i);
const float* k30 = k3.row(p + i);
g00[0] = (__fp16)k00[k];
g00[1] = (__fp16)k10[k];
g00[2] = (__fp16)k20[k];
g00[3] = (__fp16)k30[k];
g00 += 4;
}
}
}
}
}
static void conv3x3s1_winograd43_pack4_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int inch = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;
int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;
// pad to 4n+2
Mat bottom_blob_bordered = bottom_blob;
outw = (outw + 3) / 4 * 4;
outh = (outh + 3) / 4 * 4;
w = outw + 2;
h = outh + 2;
copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);
// BEGIN transform input
Mat bottom_blob_tm;
{
int w_tiles = outw / 4;
int h_tiles = outh / 4;
const int tiles = w_tiles * h_tiles;
bottom_blob_tm.create(tiles, 36, inch, elemsize, elempack, opt.workspace_allocator);
conv3x3s1_winograd43_transform_input_pack4_fp16sa_neon(bottom_blob_bordered, bottom_blob_tm, opt);
}
bottom_blob_bordered = Mat();
// END transform input
// BEGIN dot
Mat top_blob_tm;
convolution_winograd_dot_pack4_fp16sa_neon(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt);
// END dot
// BEGIN transform output
Mat top_blob_bordered;
if (outw == top_blob.w && outh == top_blob.h)
{
top_blob_bordered = top_blob;
}
else
{
top_blob_bordered.create(outw, outh, outch, 2u * 4, 4, opt.workspace_allocator);
}
{
conv3x3s1_winograd43_transform_output_pack4_fp16sa_neon(top_blob_tm, top_blob_bordered, bias, opt);
}
// END transform output
// cut result pad
copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
}
static void conv3x3s1_winograd23_transform_kernel_pack4_fp16sa_neon(const Mat& kernel, Mat& kernel_tm_pack4, int inch, int outch, const Option& opt)
{
// winograd23 transform kernel
Mat kernel_tm(4 * 4, inch, outch);
const float ktm[4][3] = {
{1.0f, 0.0f, 0.0f},
{1.0f / 2, 1.0f / 2, 1.0f / 2},
{1.0f / 2, -1.0f / 2, 1.0f / 2},
{0.0f, 0.0f, 1.0f}
};
#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
for (int q = 0; q < inch; q++)
{
const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9;
float* kernel_tm0 = kernel_tm.channel(p).row(q);
// transform kernel
const float* k0 = kernel0;
const float* k1 = kernel0 + 3;
const float* k2 = kernel0 + 6;
// h
float tmp[4][3];
for (int i = 0; i < 4; i++)
{
tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
}
// U
for (int j = 0; j < 4; j++)
{
float* tmpp = &tmp[j][0];
for (int i = 0; i < 4; i++)
{
kernel_tm0[j * 4 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
}
}
}
}
// interleave
// src = 16-inch-outch
// dst = 4b-4a-inch/4a-16-outch/4b;
kernel_tm_pack4.create(2 * inch / 4, 16, (outch / 4) / 2 + (outch / 4) % 2, (size_t)2u * 16, 16);
int q = 0;
for (; q + 7 < outch; q += 8)
{
const Mat k0 = kernel_tm.channel(q);
const Mat k1 = kernel_tm.channel(q + 1);
const Mat k2 = kernel_tm.channel(q + 2);
const Mat k3 = kernel_tm.channel(q + 3);
const Mat k4 = kernel_tm.channel(q + 4);
const Mat k5 = kernel_tm.channel(q + 5);
const Mat k6 = kernel_tm.channel(q + 6);
const Mat k7 = kernel_tm.channel(q + 7);
Mat g0 = kernel_tm_pack4.channel(q / 8);
for (int k = 0; k < 16; k++)
{
__fp16* g00 = g0.row<__fp16>(k);
for (int p = 0; p + 3 < inch; p += 4)
{
for (int i = 0; i < 4; i++)
{
const float* k00 = k0.row(p + i);
const float* k10 = k1.row(p + i);
const float* k20 = k2.row(p + i);
const float* k30 = k3.row(p + i);
const float* k40 = k4.row(p + i);
const float* k50 = k5.row(p + i);
const float* k60 = k6.row(p + i);
const float* k70 = k7.row(p + i);
g00[0] = (__fp16)k00[k];
g00[1] = (__fp16)k10[k];
g00[2] = (__fp16)k20[k];
g00[3] = (__fp16)k30[k];
g00[4] = (__fp16)k40[k];
g00[5] = (__fp16)k50[k];
g00[6] = (__fp16)k60[k];
g00[7] = (__fp16)k70[k];
g00 += 8;
}
}
}
}
for (; q + 3 < outch; q += 4)
{
const Mat k0 = kernel_tm.channel(q);
const Mat k1 = kernel_tm.channel(q + 1);
const Mat k2 = kernel_tm.channel(q + 2);
const Mat k3 = kernel_tm.channel(q + 3);
Mat g0 = kernel_tm_pack4.channel(q / 8 + (q % 8) / 4);
for (int k = 0; k < 16; k++)
{
__fp16* g00 = g0.row<__fp16>(k);
for (int p = 0; p + 3 < inch; p += 4)
{
for (int i = 0; i < 4; i++)
{
const float* k00 = k0.row(p + i);
const float* k10 = k1.row(p + i);
const float* k20 = k2.row(p + i);
const float* k30 = k3.row(p + i);
g00[0] = (__fp16)k00[k];
g00[1] = (__fp16)k10[k];
g00[2] = (__fp16)k20[k];
g00[3] = (__fp16)k30[k];
g00 += 4;
}
}
}
}
}
static void conv3x3s1_winograd23_pack4_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int inch = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;
int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;
// pad to 2n+2
Mat bottom_blob_bordered = bottom_blob;
outw = (outw + 1) / 2 * 2;
outh = (outh + 1) / 2 * 2;
w = outw + 2;
h = outh + 2;
copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);
// BEGIN transform input
Mat bottom_blob_tm;
{
int w_tiles = outw / 2;
int h_tiles = outh / 2;
const int tiles = w_tiles * h_tiles;
bottom_blob_tm.create(tiles, 16, inch, elemsize, elempack, opt.workspace_allocator);
conv3x3s1_winograd23_transform_input_pack4_fp16sa_neon(bottom_blob_bordered, bottom_blob_tm, opt);
}
bottom_blob_bordered = Mat();
// END transform input
// BEGIN dot
Mat top_blob_tm;
convolution_winograd_dot_pack4_fp16sa_neon(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt);
// END dot
// BEGIN transform output
Mat top_blob_bordered;
if (outw == top_blob.w && outh == top_blob.h)
{
top_blob_bordered = top_blob;
}
else
{
top_blob_bordered.create(outw, outh, outch, 2u * 4, 4, opt.workspace_allocator);
}
{
conv3x3s1_winograd23_transform_output_pack4_fp16sa_neon(top_blob_tm, top_blob_bordered, bias, opt);
}
// END transform output
// cut result pad
copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
}
static void conv3x3s1_pack4_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
int inch = bottom_blob.c;
......
......@@ -12,487 +12,6 @@
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
static void conv3x3s1_winograd63_transform_kernel_pack8_fp16sa_neon(const Mat& kernel, Mat& kernel_tm_pack8, int inch, int outch, const Option& opt)
{
// winograd63 transform kernel
Mat kernel_tm;
kernel_tm.create(8 * 8, inch, outch);
const float ktm[8][3] = {
{1.0f, 0.0f, 0.0f},
{-2.0f / 9, -2.0f / 9, -2.0f / 9},
{-2.0f / 9, 2.0f / 9, -2.0f / 9},
{1.0f / 90, 1.0f / 45, 2.0f / 45},
{1.0f / 90, -1.0f / 45, 2.0f / 45},
{1.0f / 45, 1.0f / 90, 1.0f / 180},
{1.0f / 45, -1.0f / 90, 1.0f / 180},
{0.0f, 0.0f, 1.0f}
};
#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
for (int q = 0; q < inch; q++)
{
const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9;
float* kernel_tm0 = kernel_tm.channel(p).row(q);
// transform kernel, transposed
const float* k0 = kernel0;
const float* k1 = kernel0 + 3;
const float* k2 = kernel0 + 6;
// h
float tmp[8][3];
for (int i = 0; i < 8; i++)
{
tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
}
// v
for (int j = 0; j < 8; j++)
{
float* tmpp = &tmp[j][0];
for (int i = 0; i < 8; i++)
{
kernel_tm0[j * 8 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
}
}
}
}
// interleave
// src = 64-inch-outch
// dst = 8b-8a-inch/8a-64-outch/8b
kernel_tm_pack8.create(inch / 8, 64, outch / 8, (size_t)2u * 64, 64);
int q = 0;
for (; q + 7 < outch; q += 8)
{
const Mat k0 = kernel_tm.channel(q);
const Mat k1 = kernel_tm.channel(q + 1);
const Mat k2 = kernel_tm.channel(q + 2);
const Mat k3 = kernel_tm.channel(q + 3);
const Mat k4 = kernel_tm.channel(q + 4);
const Mat k5 = kernel_tm.channel(q + 5);
const Mat k6 = kernel_tm.channel(q + 6);
const Mat k7 = kernel_tm.channel(q + 7);
Mat g0 = kernel_tm_pack8.channel(q / 8);
for (int k = 0; k < 64; k++)
{
__fp16* g00 = g0.row<__fp16>(k);
for (int p = 0; p + 7 < inch; p += 8)
{
for (int i = 0; i < 8; i++)
{
const float* k00 = k0.row(p + i);
const float* k10 = k1.row(p + i);
const float* k20 = k2.row(p + i);
const float* k30 = k3.row(p + i);
const float* k40 = k4.row(p + i);
const float* k50 = k5.row(p + i);
const float* k60 = k6.row(p + i);
const float* k70 = k7.row(p + i);
g00[0] = (__fp16)k00[k];
g00[1] = (__fp16)k10[k];
g00[2] = (__fp16)k20[k];
g00[3] = (__fp16)k30[k];
g00[4] = (__fp16)k40[k];
g00[5] = (__fp16)k50[k];
g00[6] = (__fp16)k60[k];
g00[7] = (__fp16)k70[k];
g00 += 8;
}
}
}
}
}
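// The "8b-8a-inch/8a-64-outch/8b" layout written above can be read back with a simple
// index rule: for output channel oc, input channel ic and transform coefficient k
// (0..63), the fp16 value lands at
//     kernel_tm_pack8.channel(oc / 8).row<__fp16>(k)[ic * 8 + oc % 8]
// assuming inch and outch are both multiples of 8, as this pack8 path requires.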
static void conv3x3s1_winograd63_pack8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int inch = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;
int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;
// pad to 6n+2
Mat bottom_blob_bordered = bottom_blob;
outw = (outw + 5) / 6 * 6;
outh = (outh + 5) / 6 * 6;
w = outw + 2;
h = outh + 2;
copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);
// BEGIN transform input
Mat bottom_blob_tm;
{
int w_tiles = outw / 6;
int h_tiles = outh / 6;
const int tiles = w_tiles * h_tiles;
bottom_blob_tm.create(tiles, 64, inch, elemsize, elempack, opt.workspace_allocator);
conv3x3s1_winograd63_transform_input_pack8_fp16sa_neon(bottom_blob_bordered, bottom_blob_tm, opt);
}
bottom_blob_bordered = Mat();
// END transform input
// BEGIN dot
Mat top_blob_tm;
convolution_winograd_dot_pack8_fp16sa_neon(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt);
// END dot
// BEGIN transform output
Mat top_blob_bordered;
if (outw == top_blob.w && outh == top_blob.h)
{
top_blob_bordered = top_blob;
}
else
{
top_blob_bordered.create(outw, outh, outch, elemsize, elempack, opt.workspace_allocator);
}
{
conv3x3s1_winograd63_transform_output_pack8_fp16sa_neon(top_blob_tm, top_blob_bordered, bias, opt);
}
// END transform output
// cut result pad
copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
}
static void conv3x3s1_winograd43_transform_kernel_pack8_fp16sa_neon(const Mat& kernel, Mat& kernel_tm_pack8, int inch, int outch, const Option& opt)
{
// winograd43 transform kernel
Mat kernel_tm(6 * 6, inch, outch);
const float ktm[6][3] = {
{1.0f / 4, 0.0f, 0.0f},
{-1.0f / 6, -1.0f / 6, -1.0f / 6},
{-1.0f / 6, 1.0f / 6, -1.0f / 6},
{1.0f / 24, 1.0f / 12, 1.0f / 6},
{1.0f / 24, -1.0f / 12, 1.0f / 6},
{0.0f, 0.0f, 1.0f}
};
#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
for (int q = 0; q < inch; q++)
{
const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9;
float* kernel_tm0 = kernel_tm.channel(p).row(q);
// transform kernel
const float* k0 = kernel0;
const float* k1 = kernel0 + 3;
const float* k2 = kernel0 + 6;
// h
float tmp[6][3];
for (int i = 0; i < 6; i++)
{
tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
}
// U
for (int j = 0; j < 6; j++)
{
float* tmpp = &tmp[j][0];
for (int i = 0; i < 6; i++)
{
kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
}
}
}
}
// interleave
// src = 36-inch-outch
// dst = 8b-8a-inch/8a-36-outch/8b
kernel_tm_pack8.create(inch / 8, 36, outch / 8, (size_t)2u * 64, 64);
int q = 0;
for (; q + 7 < outch; q += 8)
{
const Mat k0 = kernel_tm.channel(q);
const Mat k1 = kernel_tm.channel(q + 1);
const Mat k2 = kernel_tm.channel(q + 2);
const Mat k3 = kernel_tm.channel(q + 3);
const Mat k4 = kernel_tm.channel(q + 4);
const Mat k5 = kernel_tm.channel(q + 5);
const Mat k6 = kernel_tm.channel(q + 6);
const Mat k7 = kernel_tm.channel(q + 7);
Mat g0 = kernel_tm_pack8.channel(q / 8);
for (int k = 0; k < 36; k++)
{
__fp16* g00 = g0.row<__fp16>(k);
for (int p = 0; p + 7 < inch; p += 8)
{
for (int i = 0; i < 8; i++)
{
const float* k00 = k0.row(p + i);
const float* k10 = k1.row(p + i);
const float* k20 = k2.row(p + i);
const float* k30 = k3.row(p + i);
const float* k40 = k4.row(p + i);
const float* k50 = k5.row(p + i);
const float* k60 = k6.row(p + i);
const float* k70 = k7.row(p + i);
g00[0] = (__fp16)k00[k];
g00[1] = (__fp16)k10[k];
g00[2] = (__fp16)k20[k];
g00[3] = (__fp16)k30[k];
g00[4] = (__fp16)k40[k];
g00[5] = (__fp16)k50[k];
g00[6] = (__fp16)k60[k];
g00[7] = (__fp16)k70[k];
g00 += 8;
}
}
}
}
}
static void conv3x3s1_winograd43_pack8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int inch = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;
int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;
// pad to 4n+2
Mat bottom_blob_bordered = bottom_blob;
outw = (outw + 3) / 4 * 4;
outh = (outh + 3) / 4 * 4;
w = outw + 2;
h = outh + 2;
copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);
// BEGIN transform input
Mat bottom_blob_tm;
{
int w_tiles = outw / 4;
int h_tiles = outh / 4;
const int tiles = w_tiles * h_tiles;
bottom_blob_tm.create(tiles, 36, inch, elemsize, elempack, opt.workspace_allocator);
conv3x3s1_winograd43_transform_input_pack8_fp16sa_neon(bottom_blob_bordered, bottom_blob_tm, opt);
}
bottom_blob_bordered = Mat();
// END transform input
// BEGIN dot
Mat top_blob_tm;
convolution_winograd_dot_pack8_fp16sa_neon(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt);
// END dot
// BEGIN transform output
Mat top_blob_bordered;
if (outw == top_blob.w && outh == top_blob.h)
{
top_blob_bordered = top_blob;
}
else
{
top_blob_bordered.create(outw, outh, outch, elemsize, elempack, opt.workspace_allocator);
}
{
conv3x3s1_winograd43_transform_output_pack8_fp16sa_neon(top_blob_tm, top_blob_bordered, bias, opt);
}
// END transform output
// cut result pad
copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
}
static void conv3x3s1_winograd23_transform_kernel_pack8_fp16sa_neon(const Mat& kernel, Mat& kernel_tm_pack8, int inch, int outch, const Option& opt)
{
// winograd23 transform kernel
Mat kernel_tm(4 * 4, inch, outch);
const float ktm[4][3] = {
{1.0f, 0.0f, 0.0f},
{1.0f / 2, 1.0f / 2, 1.0f / 2},
{1.0f / 2, -1.0f / 2, 1.0f / 2},
{0.0f, 0.0f, 1.0f}
};
#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
for (int q = 0; q < inch; q++)
{
const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9;
float* kernel_tm0 = kernel_tm.channel(p).row(q);
// transform kernel
const float* k0 = kernel0;
const float* k1 = kernel0 + 3;
const float* k2 = kernel0 + 6;
// h
float tmp[4][3];
for (int i = 0; i < 4; i++)
{
tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
}
// U
for (int j = 0; j < 4; j++)
{
float* tmpp = &tmp[j][0];
for (int i = 0; i < 4; i++)
{
kernel_tm0[j * 4 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
}
}
}
}
// interleave
// src = 16-inch-outch
// dst = 8b-8a-inch/8a-16-outch/8b
kernel_tm_pack8.create(inch / 8, 16, outch / 8, (size_t)2u * 64, 64);
int q = 0;
for (; q + 7 < outch; q += 8)
{
const Mat k0 = kernel_tm.channel(q);
const Mat k1 = kernel_tm.channel(q + 1);
const Mat k2 = kernel_tm.channel(q + 2);
const Mat k3 = kernel_tm.channel(q + 3);
const Mat k4 = kernel_tm.channel(q + 4);
const Mat k5 = kernel_tm.channel(q + 5);
const Mat k6 = kernel_tm.channel(q + 6);
const Mat k7 = kernel_tm.channel(q + 7);
Mat g0 = kernel_tm_pack8.channel(q / 8);
for (int k = 0; k < 16; k++)
{
__fp16* g00 = g0.row<__fp16>(k);
for (int p = 0; p + 7 < inch; p += 8)
{
for (int i = 0; i < 8; i++)
{
const float* k00 = k0.row(p + i);
const float* k10 = k1.row(p + i);
const float* k20 = k2.row(p + i);
const float* k30 = k3.row(p + i);
const float* k40 = k4.row(p + i);
const float* k50 = k5.row(p + i);
const float* k60 = k6.row(p + i);
const float* k70 = k7.row(p + i);
g00[0] = (__fp16)k00[k];
g00[1] = (__fp16)k10[k];
g00[2] = (__fp16)k20[k];
g00[3] = (__fp16)k30[k];
g00[4] = (__fp16)k40[k];
g00[5] = (__fp16)k50[k];
g00[6] = (__fp16)k60[k];
g00[7] = (__fp16)k70[k];
g00 += 8;
}
}
}
}
}
static void conv3x3s1_winograd23_pack8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int inch = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;
int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;
// pad to 2n+2
Mat bottom_blob_bordered = bottom_blob;
outw = (outw + 1) / 2 * 2;
outh = (outh + 1) / 2 * 2;
w = outw + 2;
h = outh + 2;
copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);
// BEGIN transform input
Mat bottom_blob_tm;
{
int w_tiles = outw / 2;
int h_tiles = outh / 2;
const int tiles = w_tiles * h_tiles;
bottom_blob_tm.create(tiles, 16, inch, elemsize, elempack, opt.workspace_allocator);
conv3x3s1_winograd23_transform_input_pack8_fp16sa_neon(bottom_blob_bordered, bottom_blob_tm, opt);
}
bottom_blob_bordered = Mat();
// END transform input
// BEGIN dot
Mat top_blob_tm;
convolution_winograd_dot_pack8_fp16sa_neon(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt);
// END dot
// BEGIN transform output
Mat top_blob_bordered;
if (outw == top_blob.w && outh == top_blob.h)
{
top_blob_bordered = top_blob;
}
else
{
top_blob_bordered.create(outw, outh, outch, elemsize, elempack, opt.workspace_allocator);
}
{
conv3x3s1_winograd23_transform_output_pack8_fp16sa_neon(top_blob_tm, top_blob_bordered, bias, opt);
}
// END transform output
// cut result pad
copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
}
static void conv3x3s1_pack8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
int inch = bottom_blob.c;
......
This diff has been collapsed.
This diff has been collapsed.
This diff has been collapsed.
This diff has been collapsed.
......@@ -50,6 +50,8 @@ protected:
public:
Layer* activation;
int nT;
Mat weight_data_tm;
Mat weight_3x3s2_data;
......
......@@ -43,11 +43,9 @@ namespace ncnn {
#include "convolution_sgemm_pack8_fp16s.h"
#include "convolution_sgemm_pack8to4_fp16s.h"
#include "convolution_sgemm_pack8to1_fp16s.h"
#include "convolution_winograd_transform_fp16s.h"
#include "convolution_winograd_transform_pack4_fp16s.h"
#include "convolution_winograd_transform_pack8_fp16s.h"
#include "convolution_winograd_dot_pack4_fp16s.h"
#include "convolution_winograd_dot_pack8_fp16s.h"
#include "convolution_3x3_winograd_fp16s.h"
#include "convolution_1x1_fp16s.h"
#include "convolution_1x1_pack4_fp16s.h"
#include "convolution_1x1_pack1to4_fp16s.h"
......@@ -59,8 +57,6 @@ namespace ncnn {
#include "convolution_3x3_pack1to8_fp16s.h"
#include "convolution_3x3_pack1to4_fp16s.h"
#include "convolution_3x3_pack8_fp16s.h"
#include "convolution_3x3_pack8to1_fp16s.h"
#include "convolution_3x3_pack8to4_fp16s.h"
#include "convolution_5x5_pack8_fp16s.h"
#include "convolution_7x7_pack1to8_fp16s.h"
#endif
......@@ -117,6 +113,31 @@ int Convolution_arm::create_pipeline_fp16s(const Option& opt)
out_elempack = opt.use_fp16_arithmetic && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
}
bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && (num_input >= 16 || num_output >= 16);
if (opt.use_fp16_arithmetic && opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
// dynamic shape
if (opt.use_winograd63_convolution && (num_input <= 128 && num_output <= 128))
conv3x3s1_winograd63_transform_kernel_fp16sa(weight_data, weight_winograd63_data, num_input, num_output, opt);
else if (opt.use_winograd43_convolution && (num_input >= 16 && num_output >= 16))
conv3x3s1_winograd43_transform_kernel_fp16sa(weight_data, weight_winograd43_data, num_input, num_output, opt);
else
conv3x3s1_winograd23_transform_kernel_fp16sa(weight_data, weight_winograd23_data, num_input, num_output, opt);
if (opt.lightmode)
{
weight_data.release();
}
if (opt.use_fp16_arithmetic)
{
ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
}
return 0;
}
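    // With all three use_winogradNN_convolution options enabled, the selection above
    // reduces to a simple rule on the channel counts:
    //   num_input <= 128 && num_output <= 128           -> F(6x6,3x3), e.g. 64 -> 64
    //   otherwise num_input >= 16 && num_output >= 16   -> F(4x4,3x3), e.g. 256 -> 256
    //   otherwise                                       -> F(2x2,3x3), e.g. 8 -> 256
    // (this branch is only entered when num_input >= 16 or num_output >= 16)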
// pack8
if (elempack == 8 && out_elempack == 8)
{
......@@ -128,15 +149,6 @@ int Convolution_arm::create_pipeline_fp16s(const Option& opt)
{
convolution_im2col_sgemm_transform_kernel_pack8_fp16sa_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && (num_input > 8 || num_output > 8))
{
if ((opt.use_winograd63_convolution && num_input >= 16 && num_output >= 16 && num_input <= 128 && num_output <= 128) || (!opt.use_winograd43_convolution && !opt.use_winograd23_convolution))
conv3x3s1_winograd63_transform_kernel_pack8_fp16sa_neon(weight_data, weight_winograd63_data, num_input, num_output, opt);
else if ((opt.use_winograd43_convolution && num_input >= 16 && num_output >= 16) || (!opt.use_winograd63_convolution && !opt.use_winograd23_convolution))
conv3x3s1_winograd43_transform_kernel_pack8_fp16sa_neon(weight_data, weight_winograd43_data, num_input, num_output, opt);
else // if (opt.use_winograd23_convolution)
conv3x3s1_winograd23_transform_kernel_pack8_fp16sa_neon(weight_data, weight_winograd23_data, num_input, num_output, opt);
}
else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
convolution_transform_kernel_packed_fp16s_neon(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
......@@ -215,10 +227,6 @@ int Convolution_arm::create_pipeline_fp16s(const Option& opt)
{
convolution_im2col_sgemm_transform_kernel_pack8to1_fp16sa_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (opt.use_winograd_convolution && opt.use_winograd63_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv3x3s1_winograd63_transform_kernel_pack8to1_fp16sa_neon(weight_data, weight_winograd63_data, num_input, num_output, opt);
}
else if (opt.use_sgemm_convolution)
{
convolution_im2col_sgemm_transform_kernel_pack8to1_fp16sa_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
......@@ -240,10 +248,6 @@ int Convolution_arm::create_pipeline_fp16s(const Option& opt)
{
convolution_im2col_sgemm_transform_kernel_pack8to4_fp16sa_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (opt.use_winograd_convolution && opt.use_winograd63_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv3x3s1_winograd63_transform_kernel_pack8to4_fp16sa_neon(weight_data, weight_winograd63_data, num_input, num_output, opt);
}
else if (opt.use_sgemm_convolution)
{
convolution_im2col_sgemm_transform_kernel_pack8to4_fp16sa_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
......@@ -265,15 +269,6 @@ int Convolution_arm::create_pipeline_fp16s(const Option& opt)
{
convolution_im2col_sgemm_transform_kernel_pack4_fp16sa_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (opt.use_fp16_arithmetic && opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && (num_input > 4 || num_output > 4))
{
if ((opt.use_winograd63_convolution && num_input >= 8 && num_output >= 8 && num_input <= 64 && num_output <= 64) || (!opt.use_winograd43_convolution && !opt.use_winograd23_convolution))
conv3x3s1_winograd63_transform_kernel_pack4_fp16sa_neon(weight_data, weight_winograd63_data, num_input, num_output, opt);
else if ((opt.use_winograd43_convolution && num_input >= 8 && num_output >= 8) || (!opt.use_winograd63_convolution && !opt.use_winograd23_convolution))
conv3x3s1_winograd43_transform_kernel_pack4_fp16sa_neon(weight_data, weight_winograd43_data, num_input, num_output, opt);
else // if (opt.use_winograd23_convolution)
conv3x3s1_winograd23_transform_kernel_pack4_fp16sa_neon(weight_data, weight_winograd23_data, num_input, num_output, opt);
}
else if (opt.use_fp16_arithmetic && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
convolution_transform_kernel_packed_fp16s_neon(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
......@@ -457,34 +452,89 @@ int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const
const int num_input = channels * elempack;
if (elempack == 8 && out_elempack == 8)
bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && (num_input >= 16 || num_output >= 16);
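// note (added for exposition, not part of the original patch): prefer_winograd is true
// only when at least one winograd variant is enabled and the input or output channel
// count reaches 16; smaller shapes skip this unified winograd path and fall through
// to the per-elempack conv1x1 / conv3x3 / sgemm branches further below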
if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
bool prefer_winograd63 = false;
bool prefer_winograd23 = false;
bool prefer_winograd43 = !prefer_winograd63 && !prefer_winograd23;
if (prefer_winograd23 && (!opt.use_winograd23_convolution || weight_winograd23_data.empty()))
{
conv1x1s1_sgemm_pack8_fp16sa_neon(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data_fp16, opt);
// f23 fallback to f43
prefer_winograd23 = false;
prefer_winograd43 = true;
}
if (activation)
if (prefer_winograd63 && (!opt.use_winograd63_convolution || weight_winograd63_data.empty()))
{
// f63 fallback to f43
prefer_winograd63 = false;
prefer_winograd43 = true;
}
if (prefer_winograd43 && (!opt.use_winograd43_convolution || weight_winograd43_data.empty()))
{
// f43 fallback to f63 or f23
prefer_winograd43 = false;
if (opt.use_winograd63_convolution && !weight_winograd63_data.empty())
{
activation->forward_inplace(top_blob, opt);
prefer_winograd63 = true;
}
else
{
prefer_winograd23 = true;
}
}
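// --- illustrative aside, not part of the original patch ---
// The fallback chain above can be exercised in isolation with a small helper.
// The function and parameter names below are hypothetical; hasNN stands for
// "use_winogradNN_convolution enabled and weight_winogradNN_data non-empty",
// and f43 is the initial preference exactly as hard-coded in this revision,
// so the first two fallbacks are inert here but kept for symmetry.
//
// static void pick_winograd_variant(bool has23, bool has43, bool has63,
//                                   bool& use23, bool& use43, bool& use63)
// {
//     use63 = false;
//     use23 = false;
//     use43 = true;
//     if (use23 && !has23) { use23 = false; use43 = true; } // f23 falls back to f43
//     if (use63 && !has63) { use63 = false; use43 = true; } // f63 falls back to f43
//     if (use43 && !has43)                                  // f43 falls back to f63, then f23
//     {
//         use43 = false;
//         use63 = has63;
//         use23 = !has63;
//     }
// }
// --- end of aside ---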
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
// NCNN_LOGE("prefer_winograd %d %d %d", prefer_winograd23, prefer_winograd43, prefer_winograd63);
int _nT = nT ? nT : opt.num_threads;
if (nT != 0 && opt.num_threads != nT)
{
conv1x1s2_sgemm_pack8_fp16sa_neon(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data_fp16, opt);
// force num_threads the same as in create_pipeline
// so we could use pre-packed A/B from the same tile config
NCNN_LOGE("opt.num_threads %d changed, convolution winograd will use load-time value %d", opt.num_threads, nT);
}
if (prefer_winograd23)
{
conv3x3s1_winograd23_fp16sa(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data_fp16, _nT, opt);
}
else if (prefer_winograd43)
{
conv3x3s1_winograd43_fp16sa(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data_fp16, _nT, opt);
}
else if (prefer_winograd63)
{
conv3x3s1_winograd63_fp16sa(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data_fp16, _nT, opt);
}
else
{
// should never reach here
}
if (activation)
{
activation->forward_inplace(top_blob, opt);
}
return 0;
}
if (elempack == 8 && out_elempack == 8)
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv1x1s1_sgemm_pack8_fp16sa_neon(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data_fp16, opt);
if (activation)
{
activation->forward_inplace(top_blob, opt);
}
}
else if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && (num_input > 8 || num_output > 8))
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
if ((opt.use_winograd63_convolution && num_input >= 16 && num_output >= 16 && num_input <= 128 && num_output <= 128) || (!opt.use_winograd43_convolution && !opt.use_winograd23_convolution))
conv3x3s1_winograd63_pack8_fp16sa_neon(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data_fp16, opt);
else if ((opt.use_winograd43_convolution && num_input >= 16 && num_output >= 16) || (!opt.use_winograd63_convolution && !opt.use_winograd23_convolution))
conv3x3s1_winograd43_pack8_fp16sa_neon(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data_fp16, opt);
else // if (opt.use_winograd23_convolution)
conv3x3s1_winograd23_pack8_fp16sa_neon(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data_fp16, opt);
conv1x1s2_sgemm_pack8_fp16sa_neon(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data_fp16, opt);
if (activation)
{
......@@ -646,18 +696,6 @@ int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const
activation->forward_inplace(top_blob, opt);
}
}
else if (opt.use_winograd_convolution && opt.use_winograd63_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
// TODO more proper condition
conv3x3s1_winograd63_pack8to1_fp16sa_neon(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data_fp16, opt);
// conv3x3s1_pack8to1_fp16sa_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);
if (activation)
{
activation->forward_inplace(top_blob, opt);
}
}
else if (opt.use_sgemm_convolution)
{
convolution_im2col_sgemm_pack8to1_fp16sa_neon(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
......@@ -693,18 +731,6 @@ int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const
activation->forward_inplace(top_blob, opt);
}
}
else if (opt.use_winograd_convolution && opt.use_winograd63_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
// TODO more proper condition
conv3x3s1_winograd63_pack8to4_fp16sa_neon(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data_fp16, opt);
// conv3x3s1_pack8to4_fp16sa_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);
if (activation)
{
activation->forward_inplace(top_blob, opt);
}
}
else if (opt.use_sgemm_convolution)
{
convolution_im2col_sgemm_pack8to4_fp16sa_neon(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
......@@ -740,20 +766,6 @@ int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const
activation->forward_inplace(top_blob, opt);
}
}
else if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && (num_input > 4 || num_output > 4))
{
if ((opt.use_winograd63_convolution && num_input >= 8 && num_output >= 8 && num_input <= 64 && num_output <= 64) || (!opt.use_winograd43_convolution && !opt.use_winograd23_convolution))
conv3x3s1_winograd63_pack4_fp16sa_neon(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data_fp16, opt);
else if ((opt.use_winograd43_convolution && num_input >= 8 && num_output >= 8) || (!opt.use_winograd63_convolution && !opt.use_winograd23_convolution))
conv3x3s1_winograd43_pack4_fp16sa_neon(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data_fp16, opt);
else // if (opt.use_winograd23_convolution)
conv3x3s1_winograd23_pack4_fp16sa_neon(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data_fp16, opt);
if (activation)
{
activation->forward_inplace(top_blob, opt);
}
}
else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv3x3s1_pack4_fp16sa_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);
......
(remaining file diffs collapsed, not shown)