...
 
Commits (6)
https://gitcode.net/wjd2002/ncnn/-/commit/5e50270e0537e577cbcbf43826039edfac13ec4f Update KunPeng 920 Platform (#4847) 2023-07-10T15:01:06+08:00 Zhang Geng mobtgzhang@outlook.com https://gitcode.net/wjd2002/ncnn/-/commit/f1943fd84737564408c1e1458eb5018a4437154a Bump pypa/cibuildwheel from 2.13.0 to 2.13.1 (#4796) 2023-07-10T15:24:35+08:00 dependabot[bot] 49699333+dependabot[bot]@users.noreply.github.com Bumps [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel) from 2.13.0 to 2.13.1. - [Release notes](https://github.com/pypa/cibuildwheel/releases) - [Changelog](https://github.com/pypa/cibuildwheel/blob/main/docs/changelog.md) - [Commits](https://github.com/pypa/cibuildwheel/compare/v2.13.0...v2.13.1) --- updated-dependencies: - dependency-name: pypa/cibuildwheel dependency-type: direct:production update-type: version-update:semver-patch ...
Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> https://gitcode.net/wjd2002/ncnn/-/commit/a87be247953d45842458d182e1b8d57ca8c0c6c5 pnnx convert conv with non-zero padding mode (#4849) 2023-07-10T23:21:07+08:00 nihui nihuini@tencent.com https://gitcode.net/wjd2002/ncnn/-/commit/472244420e6dfd12d0bf7309ca66a78e5ac28974 VSX toolchains: check for SSE2 support (#4845) 2023-07-11T15:56:51+08:00 JeremyRand 244188+JeremyRand@users.noreply.github.com Improves compatibility with Clang 11. Also rename NCNN_SSE* options to NCNN_VSX_SSE* to avoid conflict between x86 and POWER (went unnoticed before because x86 doesn't have an option for toggling SSE 4.1).
Co-authored-by: Jeremy Rand <jeremyrand@danwin1210.de> https://gitcode.net/wjd2002/ncnn/-/commit/810bfbac6ed4dc8834b3c30d68ee408aae806e3f pnnx eliminate noop expand and expand_as (#4850) 2023-07-11T17:26:07+08:00 nihui nihuini@tencent.com https://gitcode.net/wjd2002/ncnn/-/commit/e8645e9117cd926530c405b103b7afb984c7173b Don't silently ignore errors in VkCompute::submit_and_wait (#4828) 2023-07-11T17:47:02+08:00 Upliner Mikhalych upliner@gmail.com
......@@ -68,7 +68,7 @@ jobs:
brew uninstall --ignore-dependencies libomp
- name: Build wheels
uses: pypa/cibuildwheel@v2.13.0
uses: pypa/cibuildwheel@v2.13.1
env:
CIBW_ARCHS_MACOS: ${{ matrix.arch }}
CIBW_ARCHS_LINUX: ${{ matrix.arch }}
......@@ -122,7 +122,7 @@ jobs:
platforms: all
- name: Build wheels
uses: pypa/cibuildwheel@v2.13.0
uses: pypa/cibuildwheel@v2.13.1
env:
CIBW_ARCHS_LINUX: ${{ matrix.arch }}
CIBW_BUILD: ${{ matrix.build }}
......
......@@ -368,14 +368,24 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)")
if(NCNN_PPC64LE_VSX)
set(NCNN_TARGET_ARCH x86)
set(CMAKE_REQUIRED_FLAGS "-DNO_WARN_X86_INTRINSICS -D__SSE2__")
check_cxx_source_compiles("#include <emmintrin.h>\nint main() { return 0; }" NCNN_COMPILER_SUPPORT_PPC64LE_SSE2)
unset(CMAKE_REQUIRED_FLAGS)
set(CMAKE_REQUIRED_FLAGS "-DNO_WARN_X86_INTRINSICS -D__SSE4_1__")
check_cxx_source_compiles("#include <smmintrin.h>\nint main() { __m128i _v, _a, _b; _v = _mm_packus_epi32(_a, _b); return 0; }" NCNN_COMPILER_SUPPORT_PPC64LE_SSE41)
unset(CMAKE_REQUIRED_FLAGS)
if(NCNN_COMPILER_SUPPORT_PPC64LE_SSE2)
option(NCNN_VSX_SSE2 "optimize ppc64le platform with sse2 extension" ON)
else()
message(WARNING "The compiler does not support sse2 extension. NCNN_VSX_SSE2 will be OFF.")
endif()
if(NCNN_COMPILER_SUPPORT_PPC64LE_SSE41)
option(NCNN_SSE41 "optimize ppc64le platform with sse4.1 extension" ON)
option(NCNN_VSX_SSE41 "optimize ppc64le platform with sse4.1 extension" ON)
else()
message(WARNING "The compiler does not support sse4.1 extension. NCNN_SSE41 will be OFF.")
message(WARNING "The compiler does not support sse4.1 extension. NCNN_VSX_SSE41 will be OFF.")
endif()
endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(xtensa)")
......
......@@ -4231,6 +4231,257 @@ cooling_down = 0
FastestDet min = 3.59 max = 3.61 avg = 3.60
```
### HUAWEI KunPeng 920 3211K (x24 cores)
test on ubuntu 22.04
```
(base) mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 1 0 -1 0
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 0
squeezenet min = 12.11 max = 12.20 avg = 12.14
squeezenet_int8 min = 14.34 max = 14.46 avg = 14.41
mobilenet min = 20.27 max = 20.36 avg = 20.31
mobilenet_int8 min = 17.45 max = 17.74 avg = 17.58
mobilenet_v2 min = 13.72 max = 13.87 avg = 13.78
mobilenet_v3 min = 11.51 max = 11.69 avg = 11.61
shufflenet min = 8.07 max = 8.36 avg = 8.20
shufflenet_v2 min = 8.13 max = 8.17 avg = 8.14
mnasnet min = 13.34 max = 13.45 avg = 13.41
proxylessnasnet min = 16.22 max = 16.35 avg = 16.29
efficientnet_b0 min = 34.69 max = 35.14 avg = 34.82
efficientnetv2_b0 min = 44.54 max = 44.68 avg = 44.61
regnety_400m min = 18.06 max = 18.15 avg = 18.10
blazeface min = 3.06 max = 3.22 avg = 3.12
googlenet min = 56.80 max = 57.60 avg = 57.08
googlenet_int8 min = 58.64 max = 59.98 avg = 59.42
resnet18 min = 35.02 max = 35.35 avg = 35.10
resnet18_int8 min = 61.13 max = 61.68 avg = 61.33
alexnet min = 42.56 max = 43.05 avg = 42.69
vgg16 min = 186.32 max = 188.73 avg = 187.20
vgg16_int8 min = 459.01 max = 461.48 avg = 460.29
resnet50 min = 97.59 max = 98.32 avg = 97.83
resnet50_int8 min = 118.67 max = 120.45 avg = 119.78
squeezenet_ssd min = 39.62 max = 39.95 avg = 39.81
squeezenet_ssd_int8 min = 56.72 max = 57.63 avg = 57.00
mobilenet_ssd min = 45.44 max = 45.82 avg = 45.63
mobilenet_ssd_int8 min = 38.99 max = 40.08 avg = 39.39
mobilenet_yolo min = 98.71 max = 99.27 avg = 98.94
mobilenetv2_yolov3 min = 51.50 max = 52.41 avg = 51.87
yolov4-tiny min = 68.02 max = 68.43 avg = 68.24
nanodet_m min = 20.49 max = 20.64 avg = 20.59
yolo-fastest-1.1 min = 8.17 max = 8.45 avg = 8.23
yolo-fastestv2 min = 7.73 max = 8.06 avg = 7.87
vision_transformer min = 1620.65 max = 1630.45 avg = 1625.64
FastestDet min = 7.65 max = 7.77 avg = 7.69
(base) mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 2 0 -1 0
loop_count = 10
num_threads = 2
powersave = 0
gpu_device = -1
cooling_down = 0
squeezenet min = 6.77 max = 6.85 avg = 6.81
squeezenet_int8 min = 7.98 max = 8.07 avg = 8.03
mobilenet min = 10.70 max = 10.78 avg = 10.73
mobilenet_int8 min = 9.21 max = 9.36 avg = 9.28
mobilenet_v2 min = 7.91 max = 7.99 avg = 7.94
mobilenet_v3 min = 6.72 max = 6.92 avg = 6.78
shufflenet min = 5.34 max = 5.55 avg = 5.38
shufflenet_v2 min = 5.12 max = 5.15 avg = 5.14
mnasnet min = 7.74 max = 7.86 avg = 7.80
proxylessnasnet min = 9.00 max = 9.03 avg = 9.02
efficientnet_b0 min = 18.51 max = 18.58 avg = 18.54
efficientnetv2_b0 min = 23.68 max = 23.83 avg = 23.74
regnety_400m min = 12.65 max = 12.68 avg = 12.66
blazeface min = 1.99 max = 2.14 avg = 2.03
googlenet min = 30.83 max = 31.29 avg = 30.91
googlenet_int8 min = 31.97 max = 33.12 avg = 32.45
resnet18 min = 18.81 max = 18.87 avg = 18.84
resnet18_int8 min = 32.80 max = 32.99 avg = 32.90
alexnet min = 22.88 max = 23.16 avg = 22.94
vgg16 min = 100.58 max = 101.12 avg = 100.90
vgg16_int8 min = 235.81 max = 237.97 avg = 236.20
resnet50 min = 51.12 max = 51.43 avg = 51.28
resnet50_int8 min = 62.46 max = 63.02 avg = 62.72
squeezenet_ssd min = 23.26 max = 23.73 avg = 23.38
squeezenet_ssd_int8 min = 31.91 max = 32.30 avg = 32.13
mobilenet_ssd min = 24.73 max = 24.95 avg = 24.84
mobilenet_ssd_int8 min = 20.99 max = 21.52 avg = 21.21
mobilenet_yolo min = 54.91 max = 55.70 avg = 55.15
mobilenetv2_yolov3 min = 30.18 max = 30.52 avg = 30.31
yolov4-tiny min = 40.46 max = 40.61 avg = 40.55
nanodet_m min = 12.56 max = 12.72 avg = 12.62
yolo-fastest-1.1 min = 6.00 max = 6.15 avg = 6.04
yolo-fastestv2 min = 5.32 max = 5.59 avg = 5.43
vision_transformer min = 894.51 max = 896.28 avg = 895.57
FastestDet min = 5.33 max = 5.42 avg = 5.36
(base) mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 4 0 -1 0
loop_count = 10
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 0
squeezenet min = 4.18 max = 4.35 avg = 4.22
squeezenet_int8 min = 4.85 max = 4.98 avg = 4.89
mobilenet min = 5.80 max = 5.95 avg = 5.89
mobilenet_int8 min = 4.86 max = 4.94 avg = 4.89
mobilenet_v2 min = 4.66 max = 4.73 avg = 4.69
mobilenet_v3 min = 4.46 max = 4.50 avg = 4.48
shufflenet min = 4.01 max = 4.17 avg = 4.04
shufflenet_v2 min = 3.39 max = 3.41 avg = 3.39
mnasnet min = 4.81 max = 4.93 avg = 4.85
proxylessnasnet min = 5.47 max = 5.54 avg = 5.49
efficientnet_b0 min = 10.49 max = 10.55 avg = 10.52
efficientnetv2_b0 min = 13.67 max = 13.77 avg = 13.72
regnety_400m min = 10.20 max = 10.24 avg = 10.21
blazeface min = 1.52 max = 1.58 avg = 1.54
googlenet min = 17.65 max = 17.69 avg = 17.68
googlenet_int8 min = 18.14 max = 18.27 avg = 18.19
resnet18 min = 10.52 max = 10.63 avg = 10.57
resnet18_int8 min = 17.42 max = 17.53 avg = 17.49
alexnet min = 13.12 max = 13.20 avg = 13.16
vgg16 min = 55.24 max = 55.45 avg = 55.35
vgg16_int8 min = 123.46 max = 124.23 avg = 123.75
resnet50 min = 28.31 max = 28.57 avg = 28.39
resnet50_int8 min = 34.10 max = 34.39 avg = 34.23
squeezenet_ssd min = 14.85 max = 14.96 avg = 14.91
squeezenet_ssd_int8 min = 19.71 max = 19.88 avg = 19.82
mobilenet_ssd min = 13.49 max = 13.58 avg = 13.52
mobilenet_ssd_int8 min = 11.60 max = 11.70 avg = 11.66
mobilenet_yolo min = 31.74 max = 31.96 avg = 31.81
mobilenetv2_yolov3 min = 17.87 max = 18.03 avg = 17.93
yolov4-tiny min = 25.63 max = 25.78 avg = 25.72
nanodet_m min = 8.16 max = 8.22 avg = 8.20
yolo-fastest-1.1 min = 4.72 max = 4.86 avg = 4.75
yolo-fastestv2 min = 3.98 max = 4.15 avg = 4.00
vision_transformer min = 501.18 max = 503.51 avg = 502.12
FastestDet min = 3.74 max = 3.76 avg = 3.75
(base) mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 8 0 -1 0
loop_count = 10
num_threads = 8
powersave = 0
gpu_device = -1
cooling_down = 0
squeezenet min = 2.91 max = 3.10 avg = 2.97
squeezenet_int8 min = 3.42 max = 3.74 avg = 3.51
mobilenet min = 3.57 max = 3.70 avg = 3.61
mobilenet_int8 min = 3.06 max = 3.14 avg = 3.10
mobilenet_v2 min = 3.73 max = 3.75 avg = 3.75
mobilenet_v3 min = 3.50 max = 3.66 avg = 3.56
shufflenet min = 3.63 max = 3.65 avg = 3.64
shufflenet_v2 min = 2.85 max = 3.02 avg = 2.95
mnasnet min = 3.60 max = 3.67 avg = 3.62
proxylessnasnet min = 4.00 max = 4.08 avg = 4.03
efficientnet_b0 min = 7.31 max = 7.34 avg = 7.33
efficientnetv2_b0 min = 9.44 max = 9.51 avg = 9.47
regnety_400m min = 9.76 max = 10.07 avg = 9.90
blazeface min = 1.56 max = 1.75 avg = 1.61
googlenet min = 11.22 max = 11.28 avg = 11.25
googlenet_int8 min = 11.40 max = 12.82 avg = 11.76
resnet18 min = 6.83 max = 6.96 avg = 6.90
resnet18_int8 min = 10.28 max = 10.38 avg = 10.33
alexnet min = 8.75 max = 8.88 avg = 8.80
vgg16 min = 36.00 max = 36.72 avg = 36.29
vgg16_int8 min = 67.38 max = 67.72 avg = 67.54
resnet50 min = 17.63 max = 17.82 avg = 17.68
resnet50_int8 min = 20.05 max = 20.21 avg = 20.15
squeezenet_ssd min = 11.18 max = 11.45 avg = 11.26
squeezenet_ssd_int8 min = 14.09 max = 14.23 avg = 14.18
mobilenet_ssd min = 8.60 max = 8.69 avg = 8.64
mobilenet_ssd_int8 min = 7.75 max = 7.87 avg = 7.81
mobilenet_yolo min = 21.97 max = 22.25 avg = 22.09
mobilenetv2_yolov3 min = 14.04 max = 14.18 avg = 14.12
yolov4-tiny min = 19.66 max = 19.93 avg = 19.81
nanodet_m min = 6.52 max = 6.67 avg = 6.57
yolo-fastest-1.1 min = 4.61 max = 4.76 avg = 4.66
yolo-fastestv2 min = 3.78 max = 3.91 avg = 3.82
vision_transformer min = 323.01 max = 327.38 avg = 323.75
FastestDet min = 3.50 max = 3.54 avg = 3.51
(base) mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 16 0 -1 0
loop_count = 10
num_threads = 16
powersave = 0
gpu_device = -1
cooling_down = 0
squeezenet min = 3.00 max = 3.25 avg = 3.08
squeezenet_int8 min = 4.13 max = 4.47 avg = 4.21
mobilenet min = 3.27 max = 3.42 avg = 3.34
mobilenet_int8 min = 3.49 max = 3.58 avg = 3.56
mobilenet_v2 min = 3.86 max = 4.10 avg = 3.97
mobilenet_v3 min = 3.72 max = 3.80 avg = 3.76
shufflenet min = 4.67 max = 4.78 avg = 4.72
shufflenet_v2 min = 3.16 max = 3.24 avg = 3.20
mnasnet min = 3.51 max = 3.65 avg = 3.57
proxylessnasnet min = 4.08 max = 4.35 avg = 4.15
efficientnet_b0 min = 7.51 max = 7.80 avg = 7.63
efficientnetv2_b0 min = 8.92 max = 9.39 avg = 9.05
regnety_400m min = 14.80 max = 15.05 avg = 14.89
blazeface min = 2.14 max = 2.28 avg = 2.20
googlenet min = 9.91 max = 10.00 avg = 9.96
googlenet_int8 min = 11.51 max = 11.65 avg = 11.60
resnet18 min = 6.39 max = 6.56 avg = 6.46
resnet18_int8 min = 9.76 max = 9.91 avg = 9.84
alexnet min = 6.99 max = 7.10 avg = 7.04
vgg16 min = 27.52 max = 28.64 avg = 27.88
vgg16_int8 min = 45.64 max = 45.93 avg = 45.78
resnet50 min = 13.96 max = 14.17 avg = 14.07
resnet50_int8 min = 16.82 max = 16.93 avg = 16.89
squeezenet_ssd min = 11.11 max = 11.54 avg = 11.23
squeezenet_ssd_int8 min = 13.77 max = 14.00 avg = 13.88
mobilenet_ssd min = 8.21 max = 8.46 avg = 8.35
mobilenet_ssd_int8 min = 8.87 max = 9.03 avg = 8.94
mobilenet_yolo min = 30.77 max = 31.35 avg = 31.08
mobilenetv2_yolov3 min = 12.11 max = 13.10 avg = 12.43
yolov4-tiny min = 18.25 max = 18.68 avg = 18.41
nanodet_m min = 6.55 max = 6.68 avg = 6.59
yolo-fastest-1.1 min = 6.00 max = 6.22 avg = 6.09
yolo-fastestv2 min = 4.86 max = 5.01 avg = 4.94
vision_transformer min = 218.18 max = 220.49 avg = 218.79
FastestDet min = 5.01 max = 5.14 avg = 5.07
(base) mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 24 0 -1 0
loop_count = 10
num_threads = 24
powersave = 0
gpu_device = -1
cooling_down = 0
squeezenet min = 3.52 max = 3.96 avg = 3.70
squeezenet_int8 min = 5.49 max = 5.83 avg = 5.65
mobilenet min = 3.42 max = 3.83 avg = 3.55
mobilenet_int8 min = 3.69 max = 45.17 avg = 11.59
mobilenet_v2 min = 4.63 max = 5.44 avg = 4.84
mobilenet_v3 min = 4.51 max = 4.89 avg = 4.68
shufflenet min = 6.21 max = 6.52 avg = 6.36
shufflenet_v2 min = 3.98 max = 17.54 avg = 5.45
mnasnet min = 4.28 max = 4.56 avg = 4.39
proxylessnasnet min = 4.76 max = 5.13 avg = 4.92
efficientnet_b0 min = 7.45 max = 111.76 avg = 22.59
efficientnetv2_b0 min = 10.87 max = 33.13 avg = 13.51
regnety_400m min = 20.97 max = 21.73 avg = 21.46
blazeface min = 2.56 max = 2.82 avg = 2.67
googlenet min = 10.54 max = 105.87 avg = 21.85
googlenet_int8 min = 14.21 max = 77.02 avg = 22.23
resnet18 min = 7.08 max = 7.51 avg = 7.31
resnet18_int8 min = 11.25 max = 50.66 avg = 19.14
alexnet min = 7.13 max = 8.67 avg = 7.44
vgg16 min = 27.59 max = 35.35 avg = 29.12
vgg16_int8 min = 44.43 max = 51.76 avg = 46.90
resnet50 min = 15.16 max = 105.98 avg = 24.91
resnet50_int8 min = 19.82 max = 20.50 avg = 20.16
squeezenet_ssd min = 13.03 max = 13.69 avg = 13.40
squeezenet_ssd_int8 min = 17.62 max = 187.55 avg = 39.92
mobilenet_ssd min = 8.83 max = 71.97 avg = 15.37
mobilenet_ssd_int8 min = 10.22 max = 49.61 avg = 15.26
mobilenet_yolo min = 35.19 max = 46.43 avg = 36.93
mobilenetv2_yolov3 min = 12.96 max = 15.57 avg = 13.41
yolov4-tiny min = 19.22 max = 21.43 avg = 19.89
nanodet_m min = 7.71 max = 8.74 avg = 8.09
yolo-fastest-1.1 min = 6.71 max = 78.72 avg = 14.16
yolo-fastestv2 min = 5.72 max = 6.08 avg = 5.88
vision_transformer min = 192.16 max = 221.86 avg = 202.73
FastestDet min = 5.13 max = 5.47 avg = 5.30
```
### Intel Atom x5-Z8350
```
nihui@nihui-ROCK-Pi-X:~/ncnn/build/benchmark$ ./benchncnn 20 4 0 -1 1
......
......@@ -503,8 +503,13 @@ if(NCNN_TARGET_ARCH STREQUAL "riscv" AND NOT C906)
endif()
if(NCNN_PPC64LE_VSX)
# Auto-translate SSE2 to VSX if compiler is new enough.
if(NCNN_VSX_SSE2)
target_compile_options(ncnn PRIVATE -DNO_WARN_X86_INTRINSICS -D__SSE2__)
endif()
# Auto-translate SSE4.1 to VSX if compiler is new enough.
if(NCNN_SSE41)
if(NCNN_VSX_SSE41)
target_compile_options(ncnn PRIVATE -DNO_WARN_X86_INTRINSICS -D__SSE4_1__)
endif()
endif()
......
......@@ -1878,9 +1878,7 @@ int VulkanDevicePrivate::create_dummy_buffer_image()
cmd.record_dummy_readonly(dummy_image_readonly);
#endif
cmd.submit_and_wait();
return 0;
return cmd.submit_and_wait();
}
void VulkanDevicePrivate::destroy_dummy_buffer_image()
......@@ -2289,7 +2287,11 @@ VulkanDevice::VulkanDevice(int device_index)
}
}
d->create_dummy_buffer_image();
int cret = d->create_dummy_buffer_image();
if (cret != 0)
{
NCNN_LOGE("VulkanDevice create_dummy_buffer_image failed %d", cret);
}
d->pipeline_cache = new PipelineCache(this);
......
......@@ -162,9 +162,7 @@ int NetPrivate::upload_model()
}
}
cmd.submit_and_wait();
return 0;
return cmd.submit_and_wait();
}
#endif // NCNN_VULKAN
......@@ -288,9 +286,10 @@ int NetPrivate::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std:
}
}
int ret;
if (cmd_submit_and_wait)
{
cmd.submit_and_wait();
ret = cmd.submit_and_wait();
#if NCNN_BENCHMARK
std::vector<uint64_t> results(layer_index * 2);
......@@ -308,9 +307,10 @@ int NetPrivate::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std:
#endif // NCNN_BENCHMARK
cmd.reset();
if (ret != 0)
return ret;
}
int ret;
if (layer->support_vulkan)
{
#if NCNN_BENCHMARK
......@@ -505,9 +505,10 @@ IMAGE_ALLOCATION_FAILED:
}
}
int ret;
if (cmd_submit_and_wait)
{
cmd.submit_and_wait();
ret = cmd.submit_and_wait();
#if NCNN_BENCHMARK
std::vector<uint64_t> results(layer_index * 2);
......@@ -525,9 +526,11 @@ IMAGE_ALLOCATION_FAILED:
#endif // NCNN_BENCHMARK
cmd.reset();
if (ret != 0)
return ret;
}
int ret;
if (layer->support_vulkan && !image_allocation_failed)
{
#if NCNN_BENCHMARK
......@@ -1827,9 +1830,9 @@ int Net::load_model(const DataReader& dr)
}
#if NCNN_VULKAN
if (opt.use_vulkan_compute)
if (ret == 0 && opt.use_vulkan_compute)
{
d->upload_model();
ret = d->upload_model();
}
#endif // NCNN_VULKAN
......@@ -2506,11 +2509,11 @@ int Extractor::extract(int blob_index, Mat& feat, int type)
VkImageMat feat_gpu;
ret = extract(blob_index, feat_gpu, cmd);
if (d->blob_mats[blob_index].dims == 0 && feat_gpu.dims != 0)
if (ret == 0 && d->blob_mats[blob_index].dims == 0 && feat_gpu.dims != 0)
{
cmd.record_download(feat_gpu, d->blob_mats[blob_index], d->opt);
cmd.submit_and_wait();
ret = cmd.submit_and_wait();
#if NCNN_BENCHMARK
std::vector<uint64_t> results(d->net->layers().size() * 2);
......@@ -2533,11 +2536,11 @@ int Extractor::extract(int blob_index, Mat& feat, int type)
VkMat feat_gpu;
ret = extract(blob_index, feat_gpu, cmd);
if (d->blob_mats[blob_index].dims == 0 && feat_gpu.dims != 0)
if (ret == 0 && d->blob_mats[blob_index].dims == 0 && feat_gpu.dims != 0)
{
cmd.record_download(feat_gpu, d->blob_mats[blob_index], d->opt);
cmd.submit_and_wait();
ret = cmd.submit_and_wait();
#if NCNN_BENCHMARK
std::vector<uint64_t> results(d->net->layers().size() * 2);
......
......@@ -8,8 +8,8 @@ set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_C_FLAGS "-target powerpc64le-linux-gnu -I/usr/powerpc64le-linux-gnu/include -mcpu=power9 -mtune=power9 -DNO_WARN_X86_INTRINSICS -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSSE3__")
set(CMAKE_CXX_FLAGS "-target powerpc64le-linux-gnu -I/usr/powerpc64le-linux-gnu/include -I/usr/powerpc64le-linux-gnu/include/c++/10/powerpc64le-linux-gnu -mcpu=power9 -mtune=power9 -DNO_WARN_X86_INTRINSICS -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSSE3__")
set(CMAKE_C_FLAGS "-target powerpc64le-linux-gnu -I/usr/powerpc64le-linux-gnu/include -mcpu=power9 -mtune=power9 -DNO_WARN_X86_INTRINSICS -D__MMX__ -D__SSE__ -D__SSSE3__")
set(CMAKE_CXX_FLAGS "-target powerpc64le-linux-gnu -I/usr/powerpc64le-linux-gnu/include -I/usr/powerpc64le-linux-gnu/include/c++/10/powerpc64le-linux-gnu -mcpu=power9 -mtune=power9 -DNO_WARN_X86_INTRINSICS -D__MMX__ -D__SSE__ -D__SSSE3__")
# cache flags
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags")
......
......@@ -8,8 +8,8 @@ set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_C_FLAGS "-mcpu=power9 -mtune=power9 -DNO_WARN_X86_INTRINSICS -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSSE3__")
set(CMAKE_CXX_FLAGS "-mcpu=power9 -mtune=power9 -DNO_WARN_X86_INTRINSICS -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSSE3__")
set(CMAKE_C_FLAGS "-mcpu=power9 -mtune=power9 -DNO_WARN_X86_INTRINSICS -D__MMX__ -D__SSE__ -D__SSSE3__")
set(CMAKE_CXX_FLAGS "-mcpu=power9 -mtune=power9 -DNO_WARN_X86_INTRINSICS -D__MMX__ -D__SSE__ -D__SSSE3__")
# cache flags
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags")
......
......@@ -313,6 +313,7 @@ set(pnnx_pass_level5_SRCS
pass_level5/eliminate_maxpool_indices.cpp
pass_level5/eliminate_noop_cat.cpp
pass_level5/eliminate_noop_einsum.cpp
pass_level5/eliminate_noop_expand.cpp
pass_level5/eliminate_noop_expression.cpp
pass_level5/eliminate_noop_pad.cpp
pass_level5/eliminate_noop_upsample.cpp
......
......@@ -47,8 +47,9 @@ public:
const torch::jit::Node* convolution = find_node_by_kind(graph, "aten::_convolution");
const torch::jit::Node* convolution_mode = find_node_by_kind(graph, "aten::_convolution_mode");
// const torch::jit::Node* reflection_pad3d = find_node_by_kind(graph, "aten::reflection_pad3d");
// const torch::jit::Node* replication_pad3d = find_node_by_kind(graph, "aten::replication_pad3d");
const torch::jit::Node* pad = find_node_by_kind(graph, "aten::pad");
const torch::jit::Node* reflection_pad3d = find_node_by_kind(graph, "aten::reflection_pad3d");
const torch::jit::Node* replication_pad3d = find_node_by_kind(graph, "aten::replication_pad3d");
if (convolution_mode)
{
......@@ -62,45 +63,64 @@ public:
op->params["out_channels"] = weight.size(0);
op->params["kernel_size"] = Parameter{weight.size(2), weight.size(3), weight.size(4)};
op->params["stride"] = convolution->namedInput("stride");
// if (reflection_pad3d)
// {
// op->params["padding_mode"] = "reflect";
// op->params["padding"] = reflection_pad3d->namedInput("padding");
// std::vector<int>& padding = op->params["padding"].ai;
// if (padding.size() == 6)
// {
// // Conv3d only accepts tuple of three integers
// if (padding[0] == padding[1] && padding[1] == padding[2] && padding[2] == padding[3] && padding[3] == padding[4] && padding[4] == padding[5])
// {
// padding.resize(3);
// }
// else if (padding[0] == padding[3] && padding[1] == padding[4] && padding[2] == padding[5] && padding[0] != padding[1] && padding[1] != padding[2])
// {
// padding.resize(0);
// op->params["padding"].s = "same";
// }
// }
// }
// else if (replication_pad3d)
// {
// op->params["padding_mode"] = "replicate";
// op->params["padding"] = replication_pad3d->namedInput("padding");
// std::vector<int>& padding = op->params["padding"].ai;
// if (padding.size() == 6)
// {
// // Conv3d only accepts tuple of three integers
// if (padding[0] == padding[1] && padding[1] == padding[2] && padding[2] == padding[3] && padding[3] == padding[4] && padding[4] == padding[5])
// {
// padding.resize(3);
// }
// else if (padding[0] == padding[3] && padding[1] == padding[4] && padding[2] == padding[5] && padding[0] != padding[1] && padding[1] != padding[2])
// {
// padding.resize(0);
// op->params["padding"].s = "same";
// }
// }
// }
// else
if (pad)
{
op->params["padding_mode"] = pad->namedInput("mode");
op->params["padding"] = pad->namedInput("pad");
std::vector<int>& padding = op->params["padding"].ai;
if (padding.size() == 6)
{
// Conv3d only accepts tuple of three integers
if (padding[0] == padding[1] && padding[1] == padding[2] && padding[2] == padding[3] && padding[3] == padding[4] && padding[4] == padding[5])
{
padding.resize(3);
}
else if (padding[0] == padding[3] && padding[1] == padding[4] && padding[2] == padding[5] && padding[0] != padding[1] && padding[1] != padding[2])
{
padding.resize(0);
op->params["padding"].s = "same";
}
}
}
else if (reflection_pad3d)
{
op->params["padding_mode"] = "reflect";
op->params["padding"] = reflection_pad3d->namedInput("padding");
std::vector<int>& padding = op->params["padding"].ai;
if (padding.size() == 6)
{
// Conv3d only accepts tuple of three integers
if (padding[0] == padding[1] && padding[1] == padding[2] && padding[2] == padding[3] && padding[3] == padding[4] && padding[4] == padding[5])
{
padding.resize(3);
}
else if (padding[0] == padding[3] && padding[1] == padding[4] && padding[2] == padding[5] && padding[0] != padding[1] && padding[1] != padding[2])
{
padding.resize(0);
op->params["padding"].s = "same";
}
}
}
else if (replication_pad3d)
{
op->params["padding_mode"] = "replicate";
op->params["padding"] = replication_pad3d->namedInput("padding");
std::vector<int>& padding = op->params["padding"].ai;
if (padding.size() == 6)
{
// Conv3d only accepts tuple of three integers
if (padding[0] == padding[1] && padding[1] == padding[2] && padding[2] == padding[3] && padding[3] == padding[4] && padding[4] == padding[5])
{
padding.resize(3);
}
else if (padding[0] == padding[3] && padding[1] == padding[4] && padding[2] == padding[5] && padding[0] != padding[1] && padding[1] != padding[2])
{
padding.resize(0);
op->params["padding"].s = "same";
}
}
}
else
{
op->params["padding_mode"] = "zeros";
op->params["padding"] = convolution->namedInput("padding");
......
......@@ -20,6 +20,7 @@
#include "pass_level5/eliminate_identity_operator.h"
#include "pass_level5/eliminate_noop_cat.h"
#include "pass_level5/eliminate_noop_einsum.h"
#include "pass_level5/eliminate_noop_expand.h"
#include "pass_level5/eliminate_noop_expression.h"
#include "pass_level5/eliminate_noop_pad.h"
#include "pass_level5/eliminate_noop_upsample.h"
......@@ -124,6 +125,7 @@ void pass_level5(Graph& g, const std::set<std::string>& foldable_constants, cons
eliminate_noop_view_reshape(g);
eliminate_reshape_shape_expression(g);
eliminate_noop_expand(g);
fuse_channel_shuffle(g);
fuse_layernorm(g);
......
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#include "eliminate_noop_expand.h"
#include <algorithm>
#include "pass_level2.h"
namespace pnnx {
// Remove Tensor.expand / Tensor.expand_as operators whose effect is already
// provided by the implicit broadcasting of the pnnx.Expression consumers that
// follow them, rewiring those consumers to read the pre-expand operand directly.
// The pass mutates `graph` in place and deletes the eliminated operator and its
// output operand.
void eliminate_noop_expand(Graph& graph)
{
    // Restart the scan after every removal: erasing entries from graph.ops
    // invalidates index i, so one match per pass, loop until a fixed point.
    while (1)
    {
        bool matched = false;

        for (size_t i = 0; i < graph.ops.size(); i++)
        {
            Operator* op = graph.ops[i];

            if (op->type != "Tensor.expand_as" && op->type != "Tensor.expand")
                continue;

            Operand* expand_out = op->outputs[0];

            // The expand can only be dropped when every consumer is an
            // expression op, since those broadcast their inputs implicitly.
            bool all_consumers_are_expr = true;
            for (auto& x : expand_out->consumers)
            {
                if (x->type != "pnnx.Expression")
                {
                    all_consumers_are_expr = false;
                    break;
                }
            }

            if (!all_consumers_are_expr)
                continue;

            // Tensor.expand_as expand 2 1 in b in2
            // pnnx.Expression add 2 1 in2 b out
            const std::vector<int>& inshape = op->inputs[0]->shape;

            // Unknown input shape - cannot prove the expand is a no-op.
            if (inshape.empty())
                continue;

            // Check that the pre-expand shape can be binary-broadcast to every
            // consumer's output shape: same rank, and each dimension either
            // equal (and static) or broadcastable via a 1 on either side.
            // NOTE(review): -1 appears to mark a dynamic dimension here, which
            // disqualifies the equal-dims fast path - confirm against ir.h.
            bool noop_expand = true;
            for (auto& x : expand_out->consumers)
            {
                const std::vector<int>& outshape = x->outputs[0]->shape;
                if (outshape.empty())
                {
                    noop_expand = false;
                    break;
                }

                // check if inshape can be binary broadcast to outshape
                if (inshape.size() != outshape.size())
                {
                    noop_expand = false;
                    break;
                }

                for (size_t j = 0; j < inshape.size(); j++)
                {
                    if ((inshape[j] == outshape[j] && outshape[j] != -1) || inshape[j] == 1 || outshape[j] == 1)
                        continue;

                    // This break only leaves the per-dimension loop; the
                    // consumer loop keeps running but noop_expand stays false.
                    noop_expand = false;
                    break;
                }
            }

            // check if our expand is the base shape
            // so we do not drop expand for add(expand(x,shape),1.2)
            // i.e. broadcasting the expression's OTHER inputs against inshape
            // must already yield the consumer's output shape.
            for (auto& x : expand_out->consumers)
            {
                const std::vector<int>& outshape = x->outputs[0]->shape;

                std::vector<int> broadcasted_shape = inshape;
                for (const auto& r : x->inputs)
                {
                    // Skip the expand output itself; skip rank-mismatched
                    // operands (e.g. scalars), which cannot raise any dim.
                    if (r == expand_out)
                        continue;

                    if (r->shape.size() != inshape.size())
                        continue;

                    for (size_t j = 0; j < broadcasted_shape.size(); j++)
                    {
                        broadcasted_shape[j] = std::max(broadcasted_shape[j], r->shape[j]);
                    }
                }

                if (broadcasted_shape != outshape)
                {
                    noop_expand = false;
                    break;
                }
            }

            if (!noop_expand)
                continue;

            // delete noop-like expand
            matched = true;

            // Detach op from its inputs' consumer lists.
            for (auto& x : op->inputs)
            {
                x->remove_consumer(op);
            }

            // Rewire every consumer of the expand output to the expand input.
            for (auto& x : expand_out->consumers)
            {
                for (size_t j = 0; j < x->inputs.size(); j++)
                {
                    if (x->inputs[j] == expand_out)
                        x->inputs[j] = op->inputs[0];
                }

                op->inputs[0]->consumers.push_back(x);
            }

            // Keep the downstream-visible operand name stable by renaming the
            // surviving input to the deleted output's name.
            op->inputs[0]->name = expand_out->name;

            expand_out->producer = 0;
            expand_out->consumers.clear();

            // Unlink and free the dead operand and operator.
            graph.operands.erase(std::find(graph.operands.begin(), graph.operands.end(), expand_out));
            delete expand_out;

            op->inputs.clear();
            op->outputs.clear();

            graph.ops.erase(graph.ops.begin() + i);
            delete op;

            // Restart the outer while-loop scan from the beginning.
            break;
        }

        if (!matched)
            break;
    }
}
} // namespace pnnx
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#include "ir.h"

namespace pnnx {

// pass_level5: remove Tensor.expand / Tensor.expand_as operators made
// redundant by the implicit broadcasting of their pnnx.Expression consumers.
void eliminate_noop_expand(Graph& graph);

} // namespace pnnx
......@@ -26,7 +26,7 @@ public:
return R"PNNXIR(7767517
3 2
pnnx.Input input 0 1 input
nn.Conv1d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=%padding_mode padding=%padding dilation=%dilation groups=1 bias=%bias @weight @bias
nn.Conv1d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=zeros padding=%padding dilation=%dilation groups=1 bias=%bias @weight @bias
pnnx.Output output 1 0 out
)PNNXIR";
}
......@@ -43,12 +43,6 @@ pnnx.Output output 1 0 out
void write(Operator* op, const std::map<std::string, Parameter>& captured_params, const std::map<std::string, Attribute>& captured_attrs) const
{
std::string padding_mode = captured_params.at("padding_mode").s;
if (padding_mode != "zeros")
{
fprintf(stderr, "unsupported padding_mode %s\n", padding_mode.c_str());
}
op->params["0"] = captured_params.at("out_channels");
op->params["1"] = captured_params.at("kernel_size").ai[0];
op->params["2"] = captured_params.at("dilation").ai[0];
......@@ -83,7 +77,7 @@ public:
return R"PNNXIR(7767517
3 2
pnnx.Input input 0 1 input
nn.Conv1d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=%padding_mode padding=%padding dilation=%dilation groups=%groups bias=%bias @weight @bias
nn.Conv1d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=zeros padding=%padding dilation=%dilation groups=%groups bias=%bias @weight @bias
pnnx.Output output 1 0 out
)PNNXIR";
}
......@@ -100,12 +94,6 @@ pnnx.Output output 1 0 out
void write(Operator* op, const std::map<std::string, Parameter>& captured_params, const std::map<std::string, Attribute>& captured_attrs) const
{
std::string padding_mode = captured_params.at("padding_mode").s;
if (padding_mode != "zeros")
{
fprintf(stderr, "unsupported padding_mode %s\n", padding_mode.c_str());
}
op->params["0"] = captured_params.at("out_channels");
op->params["1"] = captured_params.at("kernel_size").ai[0];
op->params["2"] = captured_params.at("dilation").ai[0];
......@@ -133,8 +121,151 @@ pnnx.Output output 1 0 out
}
};
// Converts nn.Conv1d with a non-"zeros" padding_mode into an explicit ncnn
// Padding layer (reflect or replicate) followed by a Convolution1D that
// itself applies no padding.
class nn_Conv1d_2 : public GraphRewriterPass
{
public:
// Pattern: a lone nn.Conv1d with groups=1; padding_mode is captured so the
// match() overload below can reject the plain "zeros" case.
const char* match_pattern_graph() const
{
return R"PNNXIR(7767517
3 2
pnnx.Input input 0 1 input
nn.Conv1d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=%padding_mode padding=%padding dilation=%dilation groups=1 bias=%bias @weight @bias
pnnx.Output output 1 0 out
)PNNXIR";
}
// Replacement chain: input -> Padding -> Convolution1D -> output.
const char* replace_pattern_graph() const
{
return R"PNNXIR(7767517
4 3
pnnx.Input input 0 1 input
Padding pad 1 1 input a
Convolution1D conv 1 1 a out
pnnx.Output output 1 0 out
)PNNXIR";
}
// Only rewrite non-"zeros" padding modes; zero padding is handled by the
// regular nn_Conv1d converters registered at lower priority.
bool match(const std::map<std::string, Parameter>& captured_params) const
{
const std::string& padding_mode = captured_params.at("padding_mode").s;
if (padding_mode == "zeros")
return false;
return true;
}
// padding="same" can only be resolved to concrete pad sizes when the input
// width is known, so insist on a 2-d or 3-d input shape in that case.
bool match(const std::map<std::string, const Operator*>& matched_operators) const
{
const Operator* conv = matched_operators.at("op_0");
if (conv->params.at("padding").type == 4 && conv->params.at("padding").s == "same")
{
const std::vector<int> input_shape = conv->inputs[0]->shape;
if (input_shape.size() != 2 && input_shape.size() != 3)
{
fprintf(stderr, "can not resolve pads without shape\n");
return false;
}
}
return true;
}
// Fill in the Padding and Convolution1D parameters from the captured
// nn.Conv1d arguments.
void write(const std::map<std::string, Operator*>& ops, const std::map<std::string, Parameter>& captured_params, const std::map<std::string, Attribute>& captured_attrs) const
{
// Resolve the {left, right} pad sizes.
std::vector<int> padding;
if (captured_params.at("padding").type == 4)
{
// string-valued padding: "same" or "valid"
if (captured_params.at("padding").s == "same")
{
// resolve pads
const std::vector<int> input_shape = ops.at("pad")->inputs[0]->shape;
const int w = input_shape[input_shape.size() - 1];
const int kernel_w = captured_params.at("kernel_size").ai[0];
const int dilation_w = captured_params.at("dilation").ai[0];
const int stride_w = captured_params.at("stride").ai[0];
// effective kernel span once dilation is applied
const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
// split as evenly as possible; odd remainder goes to the second side
padding = std::vector<int>{wpad / 2, wpad - wpad / 2};
}
else if (captured_params.at("padding").s == "valid")
{
padding = std::vector<int>{0, 0};
}
}
else
{
// numeric padding: same amount on both sides
int wpad = captured_params.at("padding").ai[0];
padding = std::vector<int>{wpad, wpad};
}
// 1-d data: params 0/1 stay zero, the horizontal pads go into params 2/3
ops.at("pad")->params["0"] = 0;
ops.at("pad")->params["1"] = 0;
ops.at("pad")->params["2"] = padding[0];
ops.at("pad")->params["3"] = padding[1];
std::string padding_mode = captured_params.at("padding_mode").s;
if (padding_mode == "reflect")
{
ops.at("pad")->params["4"] = 2; // type=reflect
}
else if (padding_mode == "replicate")
{
ops.at("pad")->params["4"] = 1; // type=replicate
}
else
{
// any other mode (e.g. "circular") is not converted here
fprintf(stderr, "unsupported padding_mode %s\n", padding_mode.c_str());
}
// The convolution runs with zero padding (param 4) since the Padding
// layer above has already been applied.
ops.at("conv")->params["0"] = captured_params.at("out_channels");
ops.at("conv")->params["1"] = captured_params.at("kernel_size").ai[0];
ops.at("conv")->params["2"] = captured_params.at("dilation").ai[0];
ops.at("conv")->params["3"] = captured_params.at("stride").ai[0];
ops.at("conv")->params["4"] = 0;
ops.at("conv")->params["5"] = captured_params.at("bias").b ? 1 : 0;
ops.at("conv")->params["6"] = captured_attrs.at("op_0.weight").elemcount();
// "groups" is only captured by the depthwise subclass pattern; default 1 here
ops.at("conv")->params["7"] = captured_params.find("groups") != captured_params.end() ? captured_params.at("groups") : 1;
ops.at("conv")->attrs["0"] = Attribute();
ops.at("conv")->attrs["0"].data = {0, 0, 0, 0};
ops.at("conv")->attrs["1"] = captured_attrs.at("op_0.weight");
if (captured_params.at("bias").b)
ops.at("conv")->attrs["2"] = captured_attrs.at("op_0.bias");
}
};
// Grouped variant of nn_Conv1d_2: matches nn.Conv1d with an arbitrary
// %groups capture and emits ConvolutionDepthWise1D instead of Convolution1D.
// All matching/writing logic is inherited from nn_Conv1d_2.
class nn_Conv1d_3 : public nn_Conv1d_2
{
public:
// Same pattern as the base class except groups=%groups is captured.
const char* match_pattern_graph() const
{
return R"PNNXIR(7767517
3 2
pnnx.Input input 0 1 input
nn.Conv1d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=%padding_mode padding=%padding dilation=%dilation groups=%groups bias=%bias @weight @bias
pnnx.Output output 1 0 out
)PNNXIR";
}
// Replacement uses the grouped convolution layer.
const char* replace_pattern_graph() const
{
return R"PNNXIR(7767517
4 3
pnnx.Input input 0 1 input
Padding pad 1 1 input a
ConvolutionDepthWise1D conv 1 1 a out
pnnx.Output output 1 0 out
)PNNXIR";
}
};
// Register the Conv1d converters. nn_Conv1d/_1 match padding_mode=zeros;
// nn_Conv1d_2/_3 handle reflect/replicate via an inserted Padding layer.
// NOTE(review): the second argument is presumably the pass priority — confirm
// against the GraphRewriterPass registration framework.
REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_Conv1d, 20)
REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_Conv1d_1, 21)
REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_Conv1d_2, 22)
REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_Conv1d_3, 23)
} // namespace ncnn
......
......@@ -26,7 +26,7 @@ public:
return R"PNNXIR(7767517
3 2
pnnx.Input input 0 1 input
nn.Conv2d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=%padding_mode padding=%padding dilation=%dilation groups=1 bias=%bias @weight @bias
nn.Conv2d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=zeros padding=%padding dilation=%dilation groups=1 bias=%bias @weight @bias
pnnx.Output output 1 0 out
)PNNXIR";
}
......@@ -43,12 +43,6 @@ pnnx.Output output 1 0 out
void write(Operator* op, const std::map<std::string, Parameter>& captured_params, const std::map<std::string, Attribute>& captured_attrs) const
{
std::string padding_mode = captured_params.at("padding_mode").s;
if (padding_mode != "zeros")
{
fprintf(stderr, "unsupported padding_mode %s\n", padding_mode.c_str());
}
op->params["0"] = captured_params.at("out_channels");
op->params["1"] = captured_params.at("kernel_size").ai[1];
op->params["11"] = captured_params.at("kernel_size").ai[0];
......@@ -87,7 +81,7 @@ public:
return R"PNNXIR(7767517
3 2
pnnx.Input input 0 1 input
nn.Conv2d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=%padding_mode padding=%padding dilation=%dilation groups=%groups bias=%bias @weight @bias
nn.Conv2d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=zeros padding=%padding dilation=%dilation groups=%groups bias=%bias @weight @bias
pnnx.Output output 1 0 out
)PNNXIR";
}
......@@ -104,12 +98,6 @@ pnnx.Output output 1 0 out
void write(Operator* op, const std::map<std::string, Parameter>& captured_params, const std::map<std::string, Attribute>& captured_attrs) const
{
std::string padding_mode = captured_params.at("padding_mode").s;
if (padding_mode != "zeros")
{
fprintf(stderr, "unsupported padding_mode %s\n", padding_mode.c_str());
}
op->params["0"] = captured_params.at("out_channels");
op->params["1"] = captured_params.at("kernel_size").ai[1];
op->params["11"] = captured_params.at("kernel_size").ai[0];
......@@ -141,8 +129,162 @@ pnnx.Output output 1 0 out
}
};
// Converts nn.Conv2d with a non-"zeros" padding_mode into an explicit ncnn
// Padding layer (reflect or replicate) followed by a Convolution that itself
// applies no padding.
class nn_Conv2d_2 : public GraphRewriterPass
{
public:
// Pattern: a lone nn.Conv2d with groups=1; padding_mode is captured so the
// match() overload below can reject the plain "zeros" case.
const char* match_pattern_graph() const
{
return R"PNNXIR(7767517
3 2
pnnx.Input input 0 1 input
nn.Conv2d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=%padding_mode padding=%padding dilation=%dilation groups=1 bias=%bias @weight @bias
pnnx.Output output 1 0 out
)PNNXIR";
}
// Replacement chain: input -> Padding -> Convolution -> output.
const char* replace_pattern_graph() const
{
return R"PNNXIR(7767517
4 3
pnnx.Input input 0 1 input
Padding pad 1 1 input a
Convolution conv 1 1 a out
pnnx.Output output 1 0 out
)PNNXIR";
}
// Only rewrite non-"zeros" padding modes; zero padding is handled by the
// regular nn_Conv2d converters registered at lower priority.
bool match(const std::map<std::string, Parameter>& captured_params) const
{
const std::string& padding_mode = captured_params.at("padding_mode").s;
if (padding_mode == "zeros")
return false;
return true;
}
// padding="same" can only be resolved when the input height/width are
// known, so insist on a 3-d or 4-d input shape in that case.
bool match(const std::map<std::string, const Operator*>& matched_operators) const
{
const Operator* conv = matched_operators.at("op_0");
if (conv->params.at("padding").type == 4 && conv->params.at("padding").s == "same")
{
const std::vector<int> input_shape = conv->inputs[0]->shape;
if (input_shape.size() != 3 && input_shape.size() != 4)
{
fprintf(stderr, "can not resolve pads without shape\n");
return false;
}
}
return true;
}
// Fill in the Padding and Convolution parameters from the captured
// nn.Conv2d arguments.
void write(const std::map<std::string, Operator*>& ops, const std::map<std::string, Parameter>& captured_params, const std::map<std::string, Attribute>& captured_attrs) const
{
// Resolve pads as {top, bottom, left, right}.
std::vector<int> padding;
if (captured_params.at("padding").type == 4)
{
// string-valued padding: "same" or "valid"
if (captured_params.at("padding").s == "same")
{
// resolve pads
const std::vector<int> input_shape = ops.at("pad")->inputs[0]->shape;
const int w = input_shape[input_shape.size() - 1];
const int h = input_shape[input_shape.size() - 2];
// kernel_size/dilation/stride arrays are ordered (h, w)
const int kernel_w = captured_params.at("kernel_size").ai[1];
const int kernel_h = captured_params.at("kernel_size").ai[0];
const int dilation_w = captured_params.at("dilation").ai[1];
const int dilation_h = captured_params.at("dilation").ai[0];
const int stride_w = captured_params.at("stride").ai[1];
const int stride_h = captured_params.at("stride").ai[0];
// effective kernel spans once dilation is applied
const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
// split evenly per axis; odd remainder goes to the second side
padding = std::vector<int>{hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2};
}
else if (captured_params.at("padding").s == "valid")
{
padding = std::vector<int>{0, 0, 0, 0};
}
}
else
{
// numeric padding: symmetric per axis, ordered (h, w)
int hpad = captured_params.at("padding").ai[0];
int wpad = captured_params.at("padding").ai[1];
padding = std::vector<int>{hpad, hpad, wpad, wpad};
}
// params 0/1 take the vertical pads, params 2/3 the horizontal pads
ops.at("pad")->params["0"] = padding[0];
ops.at("pad")->params["1"] = padding[1];
ops.at("pad")->params["2"] = padding[2];
ops.at("pad")->params["3"] = padding[3];
std::string padding_mode = captured_params.at("padding_mode").s;
if (padding_mode == "reflect")
{
ops.at("pad")->params["4"] = 2; // type=reflect
}
else if (padding_mode == "replicate")
{
ops.at("pad")->params["4"] = 1; // type=replicate
}
else
{
// any other mode (e.g. "circular") is not converted here
fprintf(stderr, "unsupported padding_mode %s\n", padding_mode.c_str());
}
// The convolution runs with zero padding (params 4/14) since the Padding
// layer above has already been applied.
ops.at("conv")->params["0"] = captured_params.at("out_channels");
ops.at("conv")->params["1"] = captured_params.at("kernel_size").ai[1];
ops.at("conv")->params["11"] = captured_params.at("kernel_size").ai[0];
ops.at("conv")->params["2"] = captured_params.at("dilation").ai[1];
ops.at("conv")->params["12"] = captured_params.at("dilation").ai[0];
ops.at("conv")->params["3"] = captured_params.at("stride").ai[1];
ops.at("conv")->params["13"] = captured_params.at("stride").ai[0];
ops.at("conv")->params["4"] = 0;
ops.at("conv")->params["14"] = 0;
ops.at("conv")->params["5"] = captured_params.at("bias").b ? 1 : 0;
ops.at("conv")->params["6"] = captured_attrs.at("op_0.weight").elemcount();
// "groups" is only captured by the depthwise subclass pattern; default 1 here
ops.at("conv")->params["7"] = captured_params.find("groups") != captured_params.end() ? captured_params.at("groups") : 1;
ops.at("conv")->attrs["0"] = Attribute();
ops.at("conv")->attrs["0"].data = {0, 0, 0, 0};
ops.at("conv")->attrs["1"] = captured_attrs.at("op_0.weight");
if (captured_params.at("bias").b)
ops.at("conv")->attrs["2"] = captured_attrs.at("op_0.bias");
}
};
// Grouped variant of nn_Conv2d_2: matches nn.Conv2d with an arbitrary
// %groups capture and emits ConvolutionDepthWise instead of Convolution.
// All matching/writing logic is inherited from nn_Conv2d_2.
class nn_Conv2d_3 : public nn_Conv2d_2
{
public:
// Same pattern as the base class except groups=%groups is captured.
const char* match_pattern_graph() const
{
return R"PNNXIR(7767517
3 2
pnnx.Input input 0 1 input
nn.Conv2d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=%padding_mode padding=%padding dilation=%dilation groups=%groups bias=%bias @weight @bias
pnnx.Output output 1 0 out
)PNNXIR";
}
// Replacement uses the grouped convolution layer.
const char* replace_pattern_graph() const
{
return R"PNNXIR(7767517
4 3
pnnx.Input input 0 1 input
Padding pad 1 1 input a
ConvolutionDepthWise conv 1 1 a out
pnnx.Output output 1 0 out
)PNNXIR";
}
};
// Register the Conv2d converters. nn_Conv2d/_1 match padding_mode=zeros;
// nn_Conv2d_2/_3 handle reflect/replicate via an inserted Padding layer.
// NOTE(review): the second argument is presumably the pass priority — confirm
// against the GraphRewriterPass registration framework.
REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_Conv2d, 20)
REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_Conv2d_1, 21)
REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_Conv2d_2, 22)
REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_Conv2d_3, 23)
} // namespace ncnn
......
......@@ -26,7 +26,7 @@ public:
return R"PNNXIR(7767517
3 2
pnnx.Input input 0 1 input
nn.Conv3d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=%padding_mode padding=%padding dilation=%dilation groups=1 bias=%bias @weight @bias
nn.Conv3d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=zeros padding=%padding dilation=%dilation groups=1 bias=%bias @weight @bias
pnnx.Output output 1 0 out
)PNNXIR";
}
......@@ -43,12 +43,6 @@ pnnx.Output output 1 0 out
void write(Operator* op, const std::map<std::string, Parameter>& captured_params, const std::map<std::string, Attribute>& captured_attrs) const
{
std::string padding_mode = captured_params.at("padding_mode").s;
if (padding_mode != "zeros")
{
fprintf(stderr, "unsupported padding_mode %s\n", padding_mode.c_str());
}
op->params["0"] = captured_params.at("out_channels");
op->params["1"] = captured_params.at("kernel_size").ai[2];
op->params["11"] = captured_params.at("kernel_size").ai[1];
......@@ -91,7 +85,7 @@ public:
return R"PNNXIR(7767517
3 2
pnnx.Input input 0 1 input
nn.Conv3d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=%padding_mode padding=%padding dilation=%dilation groups=%groups bias=%bias @weight @bias
nn.Conv3d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=zeros padding=%padding dilation=%dilation groups=%groups bias=%bias @weight @bias
pnnx.Output output 1 0 out
)PNNXIR";
}
......@@ -108,12 +102,6 @@ pnnx.Output output 1 0 out
void write(Operator* op, const std::map<std::string, Parameter>& captured_params, const std::map<std::string, Attribute>& captured_attrs) const
{
std::string padding_mode = captured_params.at("padding_mode").s;
if (padding_mode != "zeros")
{
fprintf(stderr, "unsupported padding_mode %s\n", padding_mode.c_str());
}
op->params["0"] = captured_params.at("out_channels");
op->params["1"] = captured_params.at("kernel_size").ai[2];
op->params["11"] = captured_params.at("kernel_size").ai[1];
......@@ -149,8 +137,175 @@ pnnx.Output output 1 0 out
}
};
// Converts nn.Conv3d with a non-"zeros" padding_mode into an explicit ncnn
// Padding layer (reflect or replicate) followed by a Convolution3D that
// itself applies no padding.
class nn_Conv3d_2 : public GraphRewriterPass
{
public:
// Pattern: a lone nn.Conv3d with groups=1; padding_mode is captured so the
// match() overload below can reject the plain "zeros" case.
const char* match_pattern_graph() const
{
return R"PNNXIR(7767517
3 2
pnnx.Input input 0 1 input
nn.Conv3d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=%padding_mode padding=%padding dilation=%dilation groups=1 bias=%bias @weight @bias
pnnx.Output output 1 0 out
)PNNXIR";
}
// Replacement chain: input -> Padding -> Convolution3D -> output.
const char* replace_pattern_graph() const
{
return R"PNNXIR(7767517
4 3
pnnx.Input input 0 1 input
Padding pad 1 1 input a
Convolution3D conv 1 1 a out
pnnx.Output output 1 0 out
)PNNXIR";
}
// Only rewrite non-"zeros" padding modes; zero padding is handled by the
// regular nn_Conv3d converters registered at lower priority.
bool match(const std::map<std::string, Parameter>& captured_params) const
{
const std::string& padding_mode = captured_params.at("padding_mode").s;
if (padding_mode == "zeros")
return false;
return true;
}
// padding="same" can only be resolved when the input depth/height/width are
// known, so insist on a 4-d or 5-d input shape in that case.
bool match(const std::map<std::string, const Operator*>& matched_operators) const
{
const Operator* conv = matched_operators.at("op_0");
if (conv->params.at("padding").type == 4 && conv->params.at("padding").s == "same")
{
const std::vector<int> input_shape = conv->inputs[0]->shape;
if (input_shape.size() != 4 && input_shape.size() != 5)
{
fprintf(stderr, "can not resolve pads without shape\n");
return false;
}
}
return true;
}
// Fill in the Padding and Convolution3D parameters from the captured
// nn.Conv3d arguments.
void write(const std::map<std::string, Operator*>& ops, const std::map<std::string, Parameter>& captured_params, const std::map<std::string, Attribute>& captured_attrs) const
{
// Resolve pads as {top, bottom, left, right, front, behind}.
std::vector<int> padding;
if (captured_params.at("padding").type == 4)
{
// string-valued padding: "same" or "valid"
if (captured_params.at("padding").s == "same")
{
// resolve pads
const std::vector<int> input_shape = ops.at("pad")->inputs[0]->shape;
const int w = input_shape[input_shape.size() - 1];
const int h = input_shape[input_shape.size() - 2];
const int d = input_shape[input_shape.size() - 3];
// kernel_size/dilation/stride arrays are ordered (d, h, w)
const int kernel_w = captured_params.at("kernel_size").ai[2];
const int kernel_h = captured_params.at("kernel_size").ai[1];
const int kernel_d = captured_params.at("kernel_size").ai[0];
const int dilation_w = captured_params.at("dilation").ai[2];
const int dilation_h = captured_params.at("dilation").ai[1];
const int dilation_d = captured_params.at("dilation").ai[0];
const int stride_w = captured_params.at("stride").ai[2];
const int stride_h = captured_params.at("stride").ai[1];
const int stride_d = captured_params.at("stride").ai[0];
// effective kernel spans once dilation is applied
const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
const int kernel_extent_d = dilation_d * (kernel_d - 1) + 1;
int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
int dpad = kernel_extent_d + (d - 1) / stride_d * stride_d - d;
// split evenly per axis; odd remainder goes to the second side
padding = std::vector<int>{hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, dpad / 2, dpad - dpad / 2};
}
else if (captured_params.at("padding").s == "valid")
{
padding = std::vector<int>{0, 0, 0, 0, 0, 0};
}
}
else
{
// numeric padding: symmetric per axis, ordered (d, h, w)
int dpad = captured_params.at("padding").ai[0];
int hpad = captured_params.at("padding").ai[1];
int wpad = captured_params.at("padding").ai[2];
padding = std::vector<int>{hpad, hpad, wpad, wpad, dpad, dpad};
}
// params 0/1 take the vertical pads, 2/3 the horizontal pads,
// and 7/8 the depth pads
ops.at("pad")->params["0"] = padding[0];
ops.at("pad")->params["1"] = padding[1];
ops.at("pad")->params["2"] = padding[2];
ops.at("pad")->params["3"] = padding[3];
ops.at("pad")->params["7"] = padding[4];
ops.at("pad")->params["8"] = padding[5];
std::string padding_mode = captured_params.at("padding_mode").s;
if (padding_mode == "reflect")
{
ops.at("pad")->params["4"] = 2; // type=reflect
}
else if (padding_mode == "replicate")
{
ops.at("pad")->params["4"] = 1; // type=replicate
}
else
{
// any other mode (e.g. "circular") is not converted here
fprintf(stderr, "unsupported padding_mode %s\n", padding_mode.c_str());
}
// The convolution runs with zero padding (params 4/14/24) since the
// Padding layer above has already been applied.
ops.at("conv")->params["0"] = captured_params.at("out_channels");
ops.at("conv")->params["1"] = captured_params.at("kernel_size").ai[2];
ops.at("conv")->params["11"] = captured_params.at("kernel_size").ai[1];
ops.at("conv")->params["21"] = captured_params.at("kernel_size").ai[0];
ops.at("conv")->params["2"] = captured_params.at("dilation").ai[2];
ops.at("conv")->params["12"] = captured_params.at("dilation").ai[1];
ops.at("conv")->params["22"] = captured_params.at("dilation").ai[0];
ops.at("conv")->params["3"] = captured_params.at("stride").ai[2];
ops.at("conv")->params["13"] = captured_params.at("stride").ai[1];
ops.at("conv")->params["23"] = captured_params.at("stride").ai[0];
ops.at("conv")->params["4"] = 0;
ops.at("conv")->params["14"] = 0;
ops.at("conv")->params["24"] = 0;
ops.at("conv")->params["5"] = captured_params.at("bias").b ? 1 : 0;
ops.at("conv")->params["6"] = captured_attrs.at("op_0.weight").elemcount();
// "groups" is only captured by the depthwise subclass pattern; default 1 here
ops.at("conv")->params["7"] = captured_params.find("groups") != captured_params.end() ? captured_params.at("groups") : 1;
ops.at("conv")->attrs["0"] = Attribute();
ops.at("conv")->attrs["0"].data = {0, 0, 0, 0};
ops.at("conv")->attrs["1"] = captured_attrs.at("op_0.weight");
if (captured_params.at("bias").b)
ops.at("conv")->attrs["2"] = captured_attrs.at("op_0.bias");
}
};
// Grouped variant of nn_Conv3d_2: matches nn.Conv3d with an arbitrary
// %groups capture and emits ConvolutionDepthWise3D instead of Convolution3D.
// All matching/writing logic is inherited from nn_Conv3d_2.
class nn_Conv3d_3 : public nn_Conv3d_2
{
public:
// Same pattern as the base class except groups=%groups is captured.
const char* match_pattern_graph() const
{
return R"PNNXIR(7767517
3 2
pnnx.Input input 0 1 input
nn.Conv3d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=%padding_mode padding=%padding dilation=%dilation groups=%groups bias=%bias @weight @bias
pnnx.Output output 1 0 out
)PNNXIR";
}
// Replacement uses the grouped convolution layer.
const char* replace_pattern_graph() const
{
return R"PNNXIR(7767517
4 3
pnnx.Input input 0 1 input
Padding pad 1 1 input a
ConvolutionDepthWise3D conv 1 1 a out
pnnx.Output output 1 0 out
)PNNXIR";
}
};
// Register the Conv3d converters. nn_Conv3d/_1 match padding_mode=zeros;
// nn_Conv3d_2/_3 handle reflect/replicate via an inserted Padding layer.
// NOTE(review): the second argument is presumably the pass priority — confirm
// against the GraphRewriterPass registration framework.
REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_Conv3d, 20)
REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_Conv3d_1, 21)
REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_Conv3d_2, 22)
REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_Conv3d_3, 23)
} // namespace ncnn
......
......@@ -292,6 +292,7 @@ pnnx_add_test(vit_b_32)
#pnnx_add_test(quantization_shufflenet_v2_x1_0)
pnnx_add_test(pnnx_eliminate_noop_cat)
pnnx_add_test(pnnx_eliminate_noop_expand)
pnnx_add_test(pnnx_eliminate_noop_math)
pnnx_add_test(pnnx_eliminate_noop_upsample)
pnnx_add_test(pnnx_fold_constant)
......
......@@ -234,6 +234,9 @@ def test():
b = test_ncnn_numpy_binaryop_broadcast_ncnn.test_inference()
for a0, b0 in zip(a, b):
# allclose may auto broadcast compare
if a0.shape != b0.shape:
return False
if not torch.allclose(a0, b0, 1e-4, 1e-4):
return False
return True
......
......@@ -30,8 +30,8 @@ class Model(nn.Module):
else:
self.conv_3 = nn.Conv1d(in_channels=24, out_channels=28, kernel_size=5, stride=1, padding='valid', dilation=1, groups=4, bias=True)
self.conv_4 = nn.Conv1d(in_channels=28, out_channels=32, kernel_size=3, stride=1, padding='same', dilation=2, groups=2, bias=False, padding_mode='zeros')
#self.conv_5 = nn.Conv1d(in_channels=32, out_channels=32, kernel_size=2, stride=2, padding=3, dilation=1, groups=32, bias=True, padding_mode='reflect')
#self.conv_6 = nn.Conv1d(in_channels=32, out_channels=28, kernel_size=2, stride=1, padding=2, dilation=1, groups=1, bias=False, padding_mode='replicate')
self.conv_5 = nn.Conv1d(in_channels=32, out_channels=32, kernel_size=2, stride=2, padding=3, dilation=1, groups=32, bias=True, padding_mode='reflect')
self.conv_6 = nn.Conv1d(in_channels=32, out_channels=28, kernel_size=2, stride=1, padding=2, dilation=1, groups=1, bias=False, padding_mode='replicate')
def forward(self, x):
x = self.conv_0(x)
......@@ -39,8 +39,8 @@ class Model(nn.Module):
x = self.conv_2(x)
x = self.conv_3(x)
x = self.conv_4(x)
#x = self.conv_5(x)
#x = self.conv_6(x)
x = self.conv_5(x)
x = self.conv_6(x)
return x
......
......@@ -30,8 +30,8 @@ class Model(nn.Module):
else:
self.conv_3 = nn.Conv2d(in_channels=24, out_channels=28, kernel_size=(5,4), stride=1, padding='valid', dilation=1, groups=4, bias=True)
self.conv_4 = nn.Conv2d(in_channels=28, out_channels=32, kernel_size=3, stride=1, padding='same', dilation=(1,2), groups=2, bias=False, padding_mode='zeros')
#self.conv_5 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=2, stride=2, padding=3, dilation=1, groups=32, bias=True, padding_mode='reflect')
#self.conv_6 = nn.Conv2d(in_channels=32, out_channels=28, kernel_size=2, stride=1, padding=2, dilation=1, groups=1, bias=False, padding_mode='replicate')
self.conv_5 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=2, stride=2, padding=3, dilation=1, groups=32, bias=True, padding_mode='reflect')
self.conv_6 = nn.Conv2d(in_channels=32, out_channels=28, kernel_size=2, stride=1, padding=2, dilation=1, groups=1, bias=False, padding_mode='replicate')
def forward(self, x):
x = self.conv_0(x)
......@@ -39,8 +39,8 @@ class Model(nn.Module):
x = self.conv_2(x)
x = self.conv_3(x)
x = self.conv_4(x)
#x = self.conv_5(x)
#x = self.conv_6(x)
x = self.conv_5(x)
x = self.conv_6(x)
return x
......
......@@ -30,9 +30,10 @@ class Model(nn.Module):
else:
self.conv_3 = nn.Conv3d(in_channels=24, out_channels=28, kernel_size=(5,4,3), stride=1, padding='valid', dilation=1, groups=4, bias=True)
self.conv_4 = nn.Conv3d(in_channels=28, out_channels=32, kernel_size=3, stride=1, padding='same', dilation=(1,2,2), groups=2, bias=False, padding_mode='zeros')
#self.conv_5 = nn.Conv3d(in_channels=32, out_channels=32, kernel_size=2, stride=2, padding=3, dilation=1, groups=32, bias=True, padding_mode='reflect')
#self.conv_6 = nn.Conv3d(in_channels=32, out_channels=28, kernel_size=2, stride=1, padding=2, dilation=1, groups=1, bias=False, padding_mode='replicate')
#self.conv_7 = nn.Conv3d(in_channels=28, out_channels=24, kernel_size=3, stride=2, padding=(5,6), dilation=2, groups=1, bias=True, padding_mode='circular')
if version.parse(torch.__version__) >= version.parse('1.10'):
self.conv_5 = nn.Conv3d(in_channels=32, out_channels=32, kernel_size=2, stride=2, padding=3, dilation=1, groups=32, bias=True, padding_mode='reflect')
self.conv_6 = nn.Conv3d(in_channels=32, out_channels=28, kernel_size=2, stride=1, padding=2, dilation=1, groups=1, bias=False, padding_mode='replicate')
# self.conv_7 = nn.Conv3d(in_channels=28, out_channels=24, kernel_size=3, stride=2, padding=(5,6), dilation=2, groups=1, bias=True, padding_mode='circular')
def forward(self, x):
x = self.conv_0(x)
......@@ -40,8 +41,11 @@ class Model(nn.Module):
x = self.conv_2(x)
x = self.conv_3(x)
x = self.conv_4(x)
#x = self.conv_5(x)
#x = self.conv_6(x)
if version.parse(torch.__version__) < version.parse('1.10'):
return x
x = self.conv_5(x)
x = self.conv_6(x)
#x = self.conv_7(x)
return x
......
......@@ -30,9 +30,10 @@ class Model(nn.Module):
else:
self.conv_3 = nn.Conv3d(in_channels=24, out_channels=28, kernel_size=(5,4,3), stride=1, padding='valid', dilation=1, groups=4, bias=True)
self.conv_4 = nn.Conv3d(in_channels=28, out_channels=32, kernel_size=3, stride=1, padding='same', dilation=(1,2,2), groups=2, bias=False, padding_mode='zeros')
#self.conv_5 = nn.Conv3d(in_channels=32, out_channels=32, kernel_size=2, stride=2, padding=3, dilation=1, groups=32, bias=True, padding_mode='reflect')
#self.conv_6 = nn.Conv3d(in_channels=32, out_channels=28, kernel_size=2, stride=1, padding=2, dilation=1, groups=1, bias=False, padding_mode='replicate')
#self.conv_7 = nn.Conv3d(in_channels=28, out_channels=24, kernel_size=3, stride=2, padding=(5,6), dilation=2, groups=1, bias=True, padding_mode='circular')
if version.parse(torch.__version__) >= version.parse('1.10'):
self.conv_5 = nn.Conv3d(in_channels=32, out_channels=32, kernel_size=2, stride=2, padding=3, dilation=1, groups=32, bias=True, padding_mode='reflect')
self.conv_6 = nn.Conv3d(in_channels=32, out_channels=28, kernel_size=2, stride=1, padding=2, dilation=1, groups=1, bias=False, padding_mode='replicate')
# self.conv_7 = nn.Conv3d(in_channels=28, out_channels=24, kernel_size=3, stride=2, padding=(5,6), dilation=2, groups=1, bias=True, padding_mode='circular')
def forward(self, x):
x = self.conv_0(x)
......@@ -40,8 +41,11 @@ class Model(nn.Module):
x = self.conv_2(x)
x = self.conv_3(x)
x = self.conv_4(x)
#x = self.conv_5(x)
#x = self.conv_6(x)
if version.parse(torch.__version__) < version.parse('1.10'):
return x
x = self.conv_5(x)
x = self.conv_6(x)
#x = self.conv_7(x)
return x
......
# Tencent is pleased to support the open source community by making ncnn available.
#
# Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
#
# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# https://opensource.org/licenses/BSD-3-Clause
#
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
import torch
import torch.nn as nn
import torch.nn.functional as F
class Model(nn.Module):
    """Fixture model for the pnnx eliminate_noop_expand test.

    forward() pairs nearly every expand()/expand_as() broadcast combination of
    1-d (x*), 2-d (y*), 3-d (z*) and 4-d (w*) inputs with a subtraction, so
    that each expanded tensor is consumed by a binary op. The trailing
    "negative cases" are expands that must NOT be eliminated.
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, x0, x1, y0, y1, y2, y3, z0, z1, z2, z3, z4, z5, z6, z7, w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15):
        # Inputs carry assorted singleton dimensions; the concrete shapes are
        # chosen by test() below (e.g. w15 is all-ones 4-d).
        return (x0 - x1.expand_as(x0), x1.expand(x0.size()) - x0,
                y0 - y1.expand_as(y0), y1.expand(y0.size()) - y0,
                y0 - y2.expand_as(y0), y2.expand(y0.size()) - y0,
                y0 - y3.expand_as(y0), y3.expand(y0.size()) - y0,
                y1 - y2.expand_as(y0), y2.expand(y0.size()) - y1,
                y1 - y3.expand_as(y1), y3.expand(y1.size()) - y1,
                y2 - y3.expand_as(y2), y3.expand(y2.size()) - y2,
                z0 - z1.expand_as(z0), z1.expand(z0.size()) - z0,
                z0 - z2.expand_as(z0), z2.expand(z0.size()) - z0,
                z0 - z3.expand_as(z0), z3.expand(z0.size()) - z0,
                z0 - z4.expand_as(z0), z4.expand(z0.size()) - z0,
                z0 - z5.expand_as(z0), z5.expand(z0.size()) - z0,
                z0 - z6.expand_as(z0), z6.expand(z0.size()) - z0,
                z0 - z7.expand_as(z0), z7.expand(z0.size()) - z0,
                z1 - z2.expand_as(z0), z2.expand(z0.size()) - z1,
                z1 - z3.expand_as(z0), z3.expand(z0.size()) - z1,
                z1 - z4.expand_as(z1), z4.expand(z1.size()) - z1,
                z1 - z5.expand_as(z1), z5.expand(z1.size()) - z1,
                z1 - z6.expand_as(z3), z6.expand(z3.size()) - z1,
                z1 - z7.expand_as(z1), z7.expand(z1.size()) - z1,
                z2 - z3.expand_as(z0), z3.expand(z0.size()) - z2,
                z2 - z4.expand_as(z2), z4.expand(z2.size()) - z2,
                z2 - z5.expand_as(z3), z5.expand(z3.size()) - z2,
                z2 - z6.expand_as(z2), z6.expand(z2.size()) - z2,
                z2 - z7.expand_as(z2), z7.expand(z2.size()) - z2,
                z3 - z4.expand_as(z1), z4.expand(z1.size()) - z3,
                z3 - z5.expand_as(z3), z5.expand(z3.size()) - z3,
                z3 - z6.expand_as(z3), z6.expand(z3.size()) - z3,
                z3 - z7.expand_as(z3), z7.expand(z3.size()) - z3,
                z4 - z5.expand_as(z1), z5.expand(z1.size()) - z4,
                z4 - z6.expand_as(z2), z6.expand(z2.size()) - z4,
                z4 - z7.expand_as(z4), z7.expand(z4.size()) - z4,
                z5 - z6.expand_as(z3), z6.expand(z3.size()) - z5,
                z5 - z7.expand_as(z5), z7.expand(z5.size()) - z5,
                z6 - z7.expand_as(z6), z7.expand(z6.size()) - z6,
                w0 - w1.expand_as(w0), w1.expand(w0.size()) - w0,
                w0 - w2.expand_as(w0), w2.expand(w0.size()) - w0,
                w0 - w3.expand_as(w0), w3.expand(w0.size()) - w0,
                w0 - w4.expand_as(w0), w4.expand(w0.size()) - w0,
                w0 - w5.expand_as(w0), w5.expand(w0.size()) - w0,
                w0 - w6.expand_as(w0), w6.expand(w0.size()) - w0,
                w0 - w7.expand_as(w0), w7.expand(w0.size()) - w0,
                w0 - w8.expand_as(w0), w8.expand(w0.size()) - w0,
                w0 - w9.expand_as(w0), w9.expand(w0.size()) - w0,
                w0 - w10.expand_as(w0), w10.expand(w0.size()) - w0,
                w0 - w11.expand_as(w0), w11.expand(w0.size()) - w0,
                w0 - w12.expand_as(w0), w12.expand(w0.size()) - w0,
                w0 - w13.expand_as(w0), w13.expand(w0.size()) - w0,
                w0 - w14.expand_as(w0), w14.expand(w0.size()) - w0,
                w0 - w15.expand_as(w0), w15.expand(w0.size()) - w0,
                w1 - w5.expand_as(w1), w5.expand(w1.size()) - w1,
                w1 - w6.expand_as(w1), w6.expand(w1.size()) - w1,
                w1 - w7.expand_as(w1), w7.expand(w1.size()) - w1,
                w1 - w11.expand_as(w1), w11.expand(w1.size()) - w1,
                w1 - w12.expand_as(w1), w12.expand(w1.size()) - w1,
                w1 - w13.expand_as(w1), w13.expand(w1.size()) - w1,
                w1 - w15.expand_as(w1), w15.expand(w1.size()) - w1,
                w2 - w5.expand_as(w2), w5.expand(w2.size()) - w2,
                w2 - w8.expand_as(w2), w8.expand(w2.size()) - w2,
                w2 - w9.expand_as(w2), w9.expand(w2.size()) - w2,
                w2 - w11.expand_as(w2), w11.expand(w2.size()) - w2,
                w2 - w12.expand_as(w2), w12.expand(w2.size()) - w2,
                w2 - w14.expand_as(w2), w14.expand(w2.size()) - w2,
                w2 - w15.expand_as(w2), w15.expand(w2.size()) - w2,
                w3 - w6.expand_as(w3), w6.expand(w3.size()) - w3,
                w3 - w8.expand_as(w3), w8.expand(w3.size()) - w3,
                w3 - w10.expand_as(w3), w10.expand(w3.size()) - w3,
                w3 - w11.expand_as(w3), w11.expand(w3.size()) - w3,
                w3 - w13.expand_as(w3), w13.expand(w3.size()) - w3,
                w3 - w14.expand_as(w3), w14.expand(w3.size()) - w3,
                w3 - w15.expand_as(w3), w15.expand(w3.size()) - w3,
                w4 - w7.expand_as(w4), w7.expand(w4.size()) - w4,
                w4 - w9.expand_as(w4), w9.expand(w4.size()) - w4,
                w4 - w10.expand_as(w4), w10.expand(w4.size()) - w4,
                w4 - w12.expand_as(w4), w12.expand(w4.size()) - w4,
                w4 - w13.expand_as(w4), w13.expand(w4.size()) - w4,
                w4 - w14.expand_as(w4), w14.expand(w4.size()) - w4,
                w4 - w15.expand_as(w4), w15.expand(w4.size()) - w4,
                w5 - w11.expand_as(w5), w11.expand(w5.size()) - w5,
                w5 - w12.expand_as(w5), w12.expand(w5.size()) - w5,
                w5 - w15.expand_as(w5), w15.expand(w5.size()) - w5,
                w6 - w11.expand_as(w6), w11.expand(w6.size()) - w6,
                w6 - w13.expand_as(w6), w13.expand(w6.size()) - w6,
                w6 - w15.expand_as(w6), w15.expand(w6.size()) - w6,
                w7 - w12.expand_as(w7), w12.expand(w7.size()) - w7,
                w7 - w13.expand_as(w7), w13.expand(w7.size()) - w7,
                w7 - w15.expand_as(w7), w15.expand(w7.size()) - w7,
                w8 - w11.expand_as(w8), w11.expand(w8.size()) - w8,
                w8 - w14.expand_as(w8), w14.expand(w8.size()) - w8,
                w8 - w15.expand_as(w8), w15.expand(w8.size()) - w8,
                w9 - w12.expand_as(w9), w12.expand(w9.size()) - w9,
                w9 - w14.expand_as(w9), w14.expand(w9.size()) - w9,
                w9 - w15.expand_as(w9), w15.expand(w9.size()) - w9,
                w10 - w13.expand_as(w10), w13.expand(w10.size()) - w10,
                w10 - w14.expand_as(w10), w14.expand(w10.size()) - w10,
                w10 - w15.expand_as(w10), w15.expand(w10.size()) - w10,
                w11 - w15.expand_as(w11), w15.expand(w11.size()) - w11,
                w12 - w15.expand_as(w12), w15.expand(w12.size()) - w12,
                w13 - w15.expand_as(w13), w15.expand(w13.size()) - w13,
                w14 - w15.expand_as(w14), w15.expand(w14.size()) - w14,
                # some negative cases
                w11.expand_as(w5) - w14.expand_as(w10),
                w5.expand(w1.size()) - w11,
                w15.expand(6, 7, 8, 9) - w14
                )
def test():
    """End-to-end check for the pnnx eliminate_noop_expand pass.

    Traces Model to TorchScript, converts the trace with the pnnx binary,
    runs the generated pnnx module, and compares its outputs against the
    direct PyTorch outputs.

    Returns:
        bool: True when every output pair matches in count, shape and value.
    """
    net = Model()
    net.eval()

    torch.manual_seed(0)
    x0 = torch.rand(5)
    x1 = torch.rand(1)
    y0 = torch.rand(7, 5)
    y1 = torch.rand(1, 5)
    y2 = torch.rand(7, 1)
    y3 = torch.rand(1, 1)
    z0 = torch.rand(4, 7, 5)
    z1 = torch.rand(1, 7, 5)
    z2 = torch.rand(4, 1, 5)
    z3 = torch.rand(4, 7, 1)
    z4 = torch.rand(1, 1, 5)
    z5 = torch.rand(1, 7, 1)
    z6 = torch.rand(4, 1, 1)
    z7 = torch.rand(1, 1, 1)
    w0 = torch.rand(6, 4, 7, 5)
    w1 = torch.rand(1, 4, 7, 5)
    w2 = torch.rand(6, 1, 7, 5)
    w3 = torch.rand(6, 4, 1, 5)
    w4 = torch.rand(6, 4, 7, 1)
    w5 = torch.rand(1, 1, 7, 5)
    w6 = torch.rand(1, 4, 1, 5)
    w7 = torch.rand(1, 4, 7, 1)
    w8 = torch.rand(6, 1, 1, 5)
    w9 = torch.rand(6, 1, 7, 1)
    w10 = torch.rand(6, 4, 1, 1)
    w11 = torch.rand(1, 1, 1, 5)
    w12 = torch.rand(1, 1, 7, 1)
    w13 = torch.rand(1, 4, 1, 1)
    w14 = torch.rand(6, 1, 1, 1)
    w15 = torch.rand(1, 1, 1, 1)

    a = net(x0, x1, y0, y1, y2, y3, z0, z1, z2, z3, z4, z5, z6, z7, w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15)

    # export torchscript
    mod = torch.jit.trace(net, (x0, x1, y0, y1, y2, y3, z0, z1, z2, z3, z4, z5, z6, z7, w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15))
    mod.save("test_pnnx_eliminate_noop_expand.pt")

    # torchscript to pnnx
    import os
    os.system("../src/pnnx test_pnnx_eliminate_noop_expand.pt inputshape=[5],[1],[7,5],[1,5],[7,1],[1,1],[4,7,5],[1,7,5],[4,1,5],[4,7,1],[1,1,5],[1,7,1],[4,1,1],[1,1,1],[6,4,7,5],[1,4,7,5],[6,1,7,5],[6,4,1,5],[6,4,7,1],[1,1,7,5],[1,4,1,5],[1,4,7,1],[6,1,1,5],[6,1,7,1],[6,4,1,1],[1,1,1,5],[1,1,7,1],[1,4,1,1],[6,1,1,1],[1,1,1,1]")

    # pnnx inference
    import test_pnnx_eliminate_noop_expand_pnnx
    b = test_pnnx_eliminate_noop_expand_pnnx.test_inference()

    # zip() would silently truncate if the pnnx module returned fewer outputs
    # than the PyTorch model, so require an exact output count first
    if len(a) != len(b):
        return False

    for a0, b0 in zip(a, b):
        # allclose may auto broadcast compare
        if a0.shape != b0.shape:
            return False
        if not torch.allclose(a0, b0, 1e-4, 1e-4):
            return False
    return True
if __name__ == "__main__":
    # Map the boolean test result onto the process exit status:
    # 0 signals success, 1 signals failure.
    exit(0 if test() else 1)