...
 
Commits (6)
https://gitcode.net/wjd2002/ncnn/-/commit/5e50270e0537e577cbcbf43826039edfac13ec4f Update KunPeng 920 Platform (#4847) 2023-07-10T15:01:06+08:00 Zhang Geng mobtgzhang@outlook.com https://gitcode.net/wjd2002/ncnn/-/commit/f1943fd84737564408c1e1458eb5018a4437154a Bump pypa/cibuildwheel from 2.13.0 to 2.13.1 (#4796) 2023-07-10T15:24:35+08:00 dependabot[bot] 49699333+dependabot[bot]@users.noreply.github.com Bumps [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel) from 2.13.0 to 2.13.1. - [Release notes](https://github.com/pypa/cibuildwheel/releases) - [Changelog](https://github.com/pypa/cibuildwheel/blob/main/docs/changelog.md) - [Commits](https://github.com/pypa/cibuildwheel/compare/v2.13.0...v2.13.1) --- updated-dependencies: - dependency-name: pypa/cibuildwheel dependency-type: direct:production update-type: version-update:semver-patch ...
Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> https://gitcode.net/wjd2002/ncnn/-/commit/a87be247953d45842458d182e1b8d57ca8c0c6c5 pnnx convert conv with non-zero padding mode (#4849) 2023-07-10T23:21:07+08:00 nihui nihuini@tencent.com https://gitcode.net/wjd2002/ncnn/-/commit/472244420e6dfd12d0bf7309ca66a78e5ac28974 VSX toolchains: check for SSE2 support (#4845) 2023-07-11T15:56:51+08:00 JeremyRand 244188+JeremyRand@users.noreply.github.com Improves compatibility with Clang 11. Also rename NCNN_SSE* options to NCNN_VSX_SSE* to avoid conflict between x86 and POWER (went unnoticed before because x86 doesn't have an option for toggling SSE 4.1).
Co-authored-by: Jeremy Rand <jeremyrand@danwin1210.de> https://gitcode.net/wjd2002/ncnn/-/commit/810bfbac6ed4dc8834b3c30d68ee408aae806e3f pnnx eliminate noop expand and expand_as (#4850) 2023-07-11T17:26:07+08:00 nihui nihuini@tencent.com https://gitcode.net/wjd2002/ncnn/-/commit/e8645e9117cd926530c405b103b7afb984c7173b Don't silently ignore errors in VkCompute::submit_and_wait (#4828) 2023-07-11T17:47:02+08:00 Upliner Mikhalych upliner@gmail.com
......@@ -68,7 +68,7 @@ jobs:
brew uninstall --ignore-dependencies libomp
- name: Build wheels
uses: pypa/cibuildwheel@v2.13.0
uses: pypa/cibuildwheel@v2.13.1
env:
CIBW_ARCHS_MACOS: ${{ matrix.arch }}
CIBW_ARCHS_LINUX: ${{ matrix.arch }}
......@@ -122,7 +122,7 @@ jobs:
platforms: all
- name: Build wheels
uses: pypa/cibuildwheel@v2.13.0
uses: pypa/cibuildwheel@v2.13.1
env:
CIBW_ARCHS_LINUX: ${{ matrix.arch }}
CIBW_BUILD: ${{ matrix.build }}
......
......@@ -368,14 +368,24 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)")
if(NCNN_PPC64LE_VSX)
set(NCNN_TARGET_ARCH x86)
set(CMAKE_REQUIRED_FLAGS "-DNO_WARN_X86_INTRINSICS -D__SSE2__")
check_cxx_source_compiles("#include <emmintrin.h>\nint main() { return 0; }" NCNN_COMPILER_SUPPORT_PPC64LE_SSE2)
unset(CMAKE_REQUIRED_FLAGS)
set(CMAKE_REQUIRED_FLAGS "-DNO_WARN_X86_INTRINSICS -D__SSE4_1__")
check_cxx_source_compiles("#include <smmintrin.h>\nint main() { __m128i _v, _a, _b; _v = _mm_packus_epi32(_a, _b); return 0; }" NCNN_COMPILER_SUPPORT_PPC64LE_SSE41)
unset(CMAKE_REQUIRED_FLAGS)
if(NCNN_COMPILER_SUPPORT_PPC64LE_SSE2)
option(NCNN_VSX_SSE2 "optimize ppc64le platform with sse2 extension" ON)
else()
message(WARNING "The compiler does not support sse2 extension. NCNN_VSX_SSE2 will be OFF.")
endif()
if(NCNN_COMPILER_SUPPORT_PPC64LE_SSE41)
option(NCNN_SSE41 "optimize ppc64le platform with sse4.1 extension" ON)
option(NCNN_VSX_SSE41 "optimize ppc64le platform with sse4.1 extension" ON)
else()
message(WARNING "The compiler does not support sse4.1 extension. NCNN_SSE41 will be OFF.")
message(WARNING "The compiler does not support sse4.1 extension. NCNN_VSX_SSE41 will be OFF.")
endif()
endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(xtensa)")
......
......@@ -4231,6 +4231,257 @@ cooling_down = 0
FastestDet min = 3.59 max = 3.61 avg = 3.60
```
### HUAWEI KunPeng 920 3211K (x24 cores)
test on ubuntu 22.04
```
(base) mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 1 0 -1 0
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 0
squeezenet min = 12.11 max = 12.20 avg = 12.14
squeezenet_int8 min = 14.34 max = 14.46 avg = 14.41
mobilenet min = 20.27 max = 20.36 avg = 20.31
mobilenet_int8 min = 17.45 max = 17.74 avg = 17.58
mobilenet_v2 min = 13.72 max = 13.87 avg = 13.78
mobilenet_v3 min = 11.51 max = 11.69 avg = 11.61
shufflenet min = 8.07 max = 8.36 avg = 8.20
shufflenet_v2 min = 8.13 max = 8.17 avg = 8.14
mnasnet min = 13.34 max = 13.45 avg = 13.41
proxylessnasnet min = 16.22 max = 16.35 avg = 16.29
efficientnet_b0 min = 34.69 max = 35.14 avg = 34.82
efficientnetv2_b0 min = 44.54 max = 44.68 avg = 44.61
regnety_400m min = 18.06 max = 18.15 avg = 18.10
blazeface min = 3.06 max = 3.22 avg = 3.12
googlenet min = 56.80 max = 57.60 avg = 57.08
googlenet_int8 min = 58.64 max = 59.98 avg = 59.42
resnet18 min = 35.02 max = 35.35 avg = 35.10
resnet18_int8 min = 61.13 max = 61.68 avg = 61.33
alexnet min = 42.56 max = 43.05 avg = 42.69
vgg16 min = 186.32 max = 188.73 avg = 187.20
vgg16_int8 min = 459.01 max = 461.48 avg = 460.29
resnet50 min = 97.59 max = 98.32 avg = 97.83
resnet50_int8 min = 118.67 max = 120.45 avg = 119.78
squeezenet_ssd min = 39.62 max = 39.95 avg = 39.81
squeezenet_ssd_int8 min = 56.72 max = 57.63 avg = 57.00
mobilenet_ssd min = 45.44 max = 45.82 avg = 45.63
mobilenet_ssd_int8 min = 38.99 max = 40.08 avg = 39.39
mobilenet_yolo min = 98.71 max = 99.27 avg = 98.94
mobilenetv2_yolov3 min = 51.50 max = 52.41 avg = 51.87
yolov4-tiny min = 68.02 max = 68.43 avg = 68.24
nanodet_m min = 20.49 max = 20.64 avg = 20.59
yolo-fastest-1.1 min = 8.17 max = 8.45 avg = 8.23
yolo-fastestv2 min = 7.73 max = 8.06 avg = 7.87
vision_transformer min = 1620.65 max = 1630.45 avg = 1625.64
FastestDet min = 7.65 max = 7.77 avg = 7.69
(base) mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 2 0 -1 0
loop_count = 10
num_threads = 2
powersave = 0
gpu_device = -1
cooling_down = 0
squeezenet min = 6.77 max = 6.85 avg = 6.81
squeezenet_int8 min = 7.98 max = 8.07 avg = 8.03
mobilenet min = 10.70 max = 10.78 avg = 10.73
mobilenet_int8 min = 9.21 max = 9.36 avg = 9.28
mobilenet_v2 min = 7.91 max = 7.99 avg = 7.94
mobilenet_v3 min = 6.72 max = 6.92 avg = 6.78
shufflenet min = 5.34 max = 5.55 avg = 5.38
shufflenet_v2 min = 5.12 max = 5.15 avg = 5.14
mnasnet min = 7.74 max = 7.86 avg = 7.80
proxylessnasnet min = 9.00 max = 9.03 avg = 9.02
efficientnet_b0 min = 18.51 max = 18.58 avg = 18.54
efficientnetv2_b0 min = 23.68 max = 23.83 avg = 23.74
regnety_400m min = 12.65 max = 12.68 avg = 12.66
blazeface min = 1.99 max = 2.14 avg = 2.03
googlenet min = 30.83 max = 31.29 avg = 30.91
googlenet_int8 min = 31.97 max = 33.12 avg = 32.45
resnet18 min = 18.81 max = 18.87 avg = 18.84
resnet18_int8 min = 32.80 max = 32.99 avg = 32.90
alexnet min = 22.88 max = 23.16 avg = 22.94
vgg16 min = 100.58 max = 101.12 avg = 100.90
vgg16_int8 min = 235.81 max = 237.97 avg = 236.20
resnet50 min = 51.12 max = 51.43 avg = 51.28
resnet50_int8 min = 62.46 max = 63.02 avg = 62.72
squeezenet_ssd min = 23.26 max = 23.73 avg = 23.38
squeezenet_ssd_int8 min = 31.91 max = 32.30 avg = 32.13
mobilenet_ssd min = 24.73 max = 24.95 avg = 24.84
mobilenet_ssd_int8 min = 20.99 max = 21.52 avg = 21.21
mobilenet_yolo min = 54.91 max = 55.70 avg = 55.15
mobilenetv2_yolov3 min = 30.18 max = 30.52 avg = 30.31
yolov4-tiny min = 40.46 max = 40.61 avg = 40.55
nanodet_m min = 12.56 max = 12.72 avg = 12.62
yolo-fastest-1.1 min = 6.00 max = 6.15 avg = 6.04
yolo-fastestv2 min = 5.32 max = 5.59 avg = 5.43
vision_transformer min = 894.51 max = 896.28 avg = 895.57
FastestDet min = 5.33 max = 5.42 avg = 5.36
(base) mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 4 0 -1 0
loop_count = 10
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 0
squeezenet min = 4.18 max = 4.35 avg = 4.22
squeezenet_int8 min = 4.85 max = 4.98 avg = 4.89
mobilenet min = 5.80 max = 5.95 avg = 5.89
mobilenet_int8 min = 4.86 max = 4.94 avg = 4.89
mobilenet_v2 min = 4.66 max = 4.73 avg = 4.69
mobilenet_v3 min = 4.46 max = 4.50 avg = 4.48
shufflenet min = 4.01 max = 4.17 avg = 4.04
shufflenet_v2 min = 3.39 max = 3.41 avg = 3.39
mnasnet min = 4.81 max = 4.93 avg = 4.85
proxylessnasnet min = 5.47 max = 5.54 avg = 5.49
efficientnet_b0 min = 10.49 max = 10.55 avg = 10.52
efficientnetv2_b0 min = 13.67 max = 13.77 avg = 13.72
regnety_400m min = 10.20 max = 10.24 avg = 10.21
blazeface min = 1.52 max = 1.58 avg = 1.54
googlenet min = 17.65 max = 17.69 avg = 17.68
googlenet_int8 min = 18.14 max = 18.27 avg = 18.19
resnet18 min = 10.52 max = 10.63 avg = 10.57
resnet18_int8 min = 17.42 max = 17.53 avg = 17.49
alexnet min = 13.12 max = 13.20 avg = 13.16
vgg16 min = 55.24 max = 55.45 avg = 55.35
vgg16_int8 min = 123.46 max = 124.23 avg = 123.75
resnet50 min = 28.31 max = 28.57 avg = 28.39
resnet50_int8 min = 34.10 max = 34.39 avg = 34.23
squeezenet_ssd min = 14.85 max = 14.96 avg = 14.91
squeezenet_ssd_int8 min = 19.71 max = 19.88 avg = 19.82
mobilenet_ssd min = 13.49 max = 13.58 avg = 13.52
mobilenet_ssd_int8 min = 11.60 max = 11.70 avg = 11.66
mobilenet_yolo min = 31.74 max = 31.96 avg = 31.81
mobilenetv2_yolov3 min = 17.87 max = 18.03 avg = 17.93
yolov4-tiny min = 25.63 max = 25.78 avg = 25.72
nanodet_m min = 8.16 max = 8.22 avg = 8.20
yolo-fastest-1.1 min = 4.72 max = 4.86 avg = 4.75
yolo-fastestv2 min = 3.98 max = 4.15 avg = 4.00
vision_transformer min = 501.18 max = 503.51 avg = 502.12
FastestDet min = 3.74 max = 3.76 avg = 3.75
(base) mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 8 0 -1 0
loop_count = 10
num_threads = 8
powersave = 0
gpu_device = -1
cooling_down = 0
squeezenet min = 2.91 max = 3.10 avg = 2.97
squeezenet_int8 min = 3.42 max = 3.74 avg = 3.51
mobilenet min = 3.57 max = 3.70 avg = 3.61
mobilenet_int8 min = 3.06 max = 3.14 avg = 3.10
mobilenet_v2 min = 3.73 max = 3.75 avg = 3.75
mobilenet_v3 min = 3.50 max = 3.66 avg = 3.56
shufflenet min = 3.63 max = 3.65 avg = 3.64
shufflenet_v2 min = 2.85 max = 3.02 avg = 2.95
mnasnet min = 3.60 max = 3.67 avg = 3.62
proxylessnasnet min = 4.00 max = 4.08 avg = 4.03
efficientnet_b0 min = 7.31 max = 7.34 avg = 7.33
efficientnetv2_b0 min = 9.44 max = 9.51 avg = 9.47
regnety_400m min = 9.76 max = 10.07 avg = 9.90
blazeface min = 1.56 max = 1.75 avg = 1.61
googlenet min = 11.22 max = 11.28 avg = 11.25
googlenet_int8 min = 11.40 max = 12.82 avg = 11.76
resnet18 min = 6.83 max = 6.96 avg = 6.90
resnet18_int8 min = 10.28 max = 10.38 avg = 10.33
alexnet min = 8.75 max = 8.88 avg = 8.80
vgg16 min = 36.00 max = 36.72 avg = 36.29
vgg16_int8 min = 67.38 max = 67.72 avg = 67.54
resnet50 min = 17.63 max = 17.82 avg = 17.68
resnet50_int8 min = 20.05 max = 20.21 avg = 20.15
squeezenet_ssd min = 11.18 max = 11.45 avg = 11.26
squeezenet_ssd_int8 min = 14.09 max = 14.23 avg = 14.18
mobilenet_ssd min = 8.60 max = 8.69 avg = 8.64
mobilenet_ssd_int8 min = 7.75 max = 7.87 avg = 7.81
mobilenet_yolo min = 21.97 max = 22.25 avg = 22.09
mobilenetv2_yolov3 min = 14.04 max = 14.18 avg = 14.12
yolov4-tiny min = 19.66 max = 19.93 avg = 19.81
nanodet_m min = 6.52 max = 6.67 avg = 6.57
yolo-fastest-1.1 min = 4.61 max = 4.76 avg = 4.66
yolo-fastestv2 min = 3.78 max = 3.91 avg = 3.82
vision_transformer min = 323.01 max = 327.38 avg = 323.75
FastestDet min = 3.50 max = 3.54 avg = 3.51
(base) mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 16 0 -1 0
loop_count = 10
num_threads = 16
powersave = 0
gpu_device = -1
cooling_down = 0
squeezenet min = 3.00 max = 3.25 avg = 3.08
squeezenet_int8 min = 4.13 max = 4.47 avg = 4.21
mobilenet min = 3.27 max = 3.42 avg = 3.34
mobilenet_int8 min = 3.49 max = 3.58 avg = 3.56
mobilenet_v2 min = 3.86 max = 4.10 avg = 3.97
mobilenet_v3 min = 3.72 max = 3.80 avg = 3.76
shufflenet min = 4.67 max = 4.78 avg = 4.72
shufflenet_v2 min = 3.16 max = 3.24 avg = 3.20
mnasnet min = 3.51 max = 3.65 avg = 3.57
proxylessnasnet min = 4.08 max = 4.35 avg = 4.15
efficientnet_b0 min = 7.51 max = 7.80 avg = 7.63
efficientnetv2_b0 min = 8.92 max = 9.39 avg = 9.05
regnety_400m min = 14.80 max = 15.05 avg = 14.89
blazeface min = 2.14 max = 2.28 avg = 2.20
googlenet min = 9.91 max = 10.00 avg = 9.96
googlenet_int8 min = 11.51 max = 11.65 avg = 11.60
resnet18 min = 6.39 max = 6.56 avg = 6.46
resnet18_int8 min = 9.76 max = 9.91 avg = 9.84
alexnet min = 6.99 max = 7.10 avg = 7.04
vgg16 min = 27.52 max = 28.64 avg = 27.88
vgg16_int8 min = 45.64 max = 45.93 avg = 45.78
resnet50 min = 13.96 max = 14.17 avg = 14.07
resnet50_int8 min = 16.82 max = 16.93 avg = 16.89
squeezenet_ssd min = 11.11 max = 11.54 avg = 11.23
squeezenet_ssd_int8 min = 13.77 max = 14.00 avg = 13.88
mobilenet_ssd min = 8.21 max = 8.46 avg = 8.35
mobilenet_ssd_int8 min = 8.87 max = 9.03 avg = 8.94
mobilenet_yolo min = 30.77 max = 31.35 avg = 31.08
mobilenetv2_yolov3 min = 12.11 max = 13.10 avg = 12.43
yolov4-tiny min = 18.25 max = 18.68 avg = 18.41
nanodet_m min = 6.55 max = 6.68 avg = 6.59
yolo-fastest-1.1 min = 6.00 max = 6.22 avg = 6.09
yolo-fastestv2 min = 4.86 max = 5.01 avg = 4.94
vision_transformer min = 218.18 max = 220.49 avg = 218.79
FastestDet min = 5.01 max = 5.14 avg = 5.07
(base) mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 24 0 -1 0
loop_count = 10
num_threads = 24
powersave = 0
gpu_device = -1
cooling_down = 0
squeezenet min = 3.52 max = 3.96 avg = 3.70
squeezenet_int8 min = 5.49 max = 5.83 avg = 5.65
mobilenet min = 3.42 max = 3.83 avg = 3.55
mobilenet_int8 min = 3.69 max = 45.17 avg = 11.59
mobilenet_v2 min = 4.63 max = 5.44 avg = 4.84
mobilenet_v3 min = 4.51 max = 4.89 avg = 4.68
shufflenet min = 6.21 max = 6.52 avg = 6.36
shufflenet_v2 min = 3.98 max = 17.54 avg = 5.45
mnasnet min = 4.28 max = 4.56 avg = 4.39
proxylessnasnet min = 4.76 max = 5.13 avg = 4.92
efficientnet_b0 min = 7.45 max = 111.76 avg = 22.59
efficientnetv2_b0 min = 10.87 max = 33.13 avg = 13.51
regnety_400m min = 20.97 max = 21.73 avg = 21.46
blazeface min = 2.56 max = 2.82 avg = 2.67
googlenet min = 10.54 max = 105.87 avg = 21.85
googlenet_int8 min = 14.21 max = 77.02 avg = 22.23
resnet18 min = 7.08 max = 7.51 avg = 7.31
resnet18_int8 min = 11.25 max = 50.66 avg = 19.14
alexnet min = 7.13 max = 8.67 avg = 7.44
vgg16 min = 27.59 max = 35.35 avg = 29.12
vgg16_int8 min = 44.43 max = 51.76 avg = 46.90
resnet50 min = 15.16 max = 105.98 avg = 24.91
resnet50_int8 min = 19.82 max = 20.50 avg = 20.16
squeezenet_ssd min = 13.03 max = 13.69 avg = 13.40
squeezenet_ssd_int8 min = 17.62 max = 187.55 avg = 39.92
mobilenet_ssd min = 8.83 max = 71.97 avg = 15.37
mobilenet_ssd_int8 min = 10.22 max = 49.61 avg = 15.26
mobilenet_yolo min = 35.19 max = 46.43 avg = 36.93
mobilenetv2_yolov3 min = 12.96 max = 15.57 avg = 13.41
yolov4-tiny min = 19.22 max = 21.43 avg = 19.89
nanodet_m min = 7.71 max = 8.74 avg = 8.09
yolo-fastest-1.1 min = 6.71 max = 78.72 avg = 14.16
yolo-fastestv2 min = 5.72 max = 6.08 avg = 5.88
vision_transformer min = 192.16 max = 221.86 avg = 202.73
FastestDet min = 5.13 max = 5.47 avg = 5.30
```
### Intel Atom x5-Z8350
```
nihui@nihui-ROCK-Pi-X:~/ncnn/build/benchmark$ ./benchncnn 20 4 0 -1 1
......
......@@ -503,8 +503,13 @@ if(NCNN_TARGET_ARCH STREQUAL "riscv" AND NOT C906)
endif()
if(NCNN_PPC64LE_VSX)
# Auto-translate SSE2 to VSX if compiler is new enough.
if(NCNN_VSX_SSE2)
target_compile_options(ncnn PRIVATE -DNO_WARN_X86_INTRINSICS -D__SSE2__)
endif()
# Auto-translate SSE4.1 to VSX if compiler is new enough.
if(NCNN_SSE41)
if(NCNN_VSX_SSE41)
target_compile_options(ncnn PRIVATE -DNO_WARN_X86_INTRINSICS -D__SSE4_1__)
endif()
endif()
......
......@@ -1878,9 +1878,7 @@ int VulkanDevicePrivate::create_dummy_buffer_image()
cmd.record_dummy_readonly(dummy_image_readonly);
#endif
cmd.submit_and_wait();
return 0;
return cmd.submit_and_wait();
}
void VulkanDevicePrivate::destroy_dummy_buffer_image()
......@@ -2289,7 +2287,11 @@ VulkanDevice::VulkanDevice(int device_index)
}
}
d->create_dummy_buffer_image();
int cret = d->create_dummy_buffer_image();
if (cret != 0)
{
NCNN_LOGE("VulkanDevice create_dummy_buffer_image failed %d", cret);
}
d->pipeline_cache = new PipelineCache(this);
......
......@@ -162,9 +162,7 @@ int NetPrivate::upload_model()
}
}
cmd.submit_and_wait();
return 0;
return cmd.submit_and_wait();
}
#endif // NCNN_VULKAN
......@@ -288,9 +286,10 @@ int NetPrivate::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std:
}
}
int ret;
if (cmd_submit_and_wait)
{
cmd.submit_and_wait();
ret = cmd.submit_and_wait();
#if NCNN_BENCHMARK
std::vector<uint64_t> results(layer_index * 2);
......@@ -308,9 +307,10 @@ int NetPrivate::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std:
#endif // NCNN_BENCHMARK
cmd.reset();
if (ret != 0)
return ret;
}
int ret;
if (layer->support_vulkan)
{
#if NCNN_BENCHMARK
......@@ -505,9 +505,10 @@ IMAGE_ALLOCATION_FAILED:
}
}
int ret;
if (cmd_submit_and_wait)
{
cmd.submit_and_wait();
ret = cmd.submit_and_wait();
#if NCNN_BENCHMARK
std::vector<uint64_t> results(layer_index * 2);
......@@ -525,9 +526,11 @@ IMAGE_ALLOCATION_FAILED:
#endif // NCNN_BENCHMARK
cmd.reset();
if (ret != 0)
return ret;
}
int ret;
if (layer->support_vulkan && !image_allocation_failed)
{
#if NCNN_BENCHMARK
......@@ -1827,9 +1830,9 @@ int Net::load_model(const DataReader& dr)
}
#if NCNN_VULKAN
if (opt.use_vulkan_compute)
if (ret == 0 && opt.use_vulkan_compute)
{
d->upload_model();
ret = d->upload_model();
}
#endif // NCNN_VULKAN
......@@ -2506,11 +2509,11 @@ int Extractor::extract(int blob_index, Mat& feat, int type)
VkImageMat feat_gpu;
ret = extract(blob_index, feat_gpu, cmd);
if (d->blob_mats[blob_index].dims == 0 && feat_gpu.dims != 0)
if (ret == 0 && d->blob_mats[blob_index].dims == 0 && feat_gpu.dims != 0)
{
cmd.record_download(feat_gpu, d->blob_mats[blob_index], d->opt);
cmd.submit_and_wait();
ret = cmd.submit_and_wait();
#if NCNN_BENCHMARK
std::vector<uint64_t> results(d->net->layers().size() * 2);
......@@ -2533,11 +2536,11 @@ int Extractor::extract(int blob_index, Mat& feat, int type)
VkMat feat_gpu;
ret = extract(blob_index, feat_gpu, cmd);
if (d->blob_mats[blob_index].dims == 0 && feat_gpu.dims != 0)
if (ret == 0 && d->blob_mats[blob_index].dims == 0 && feat_gpu.dims != 0)
{
cmd.record_download(feat_gpu, d->blob_mats[blob_index], d->opt);
cmd.submit_and_wait();
ret = cmd.submit_and_wait();
#if NCNN_BENCHMARK
std::vector<uint64_t> results(d->net->layers().size() * 2);
......
......@@ -8,8 +8,8 @@ set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_C_FLAGS "-target powerpc64le-linux-gnu -I/usr/powerpc64le-linux-gnu/include -mcpu=power9 -mtune=power9 -DNO_WARN_X86_INTRINSICS -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSSE3__")
set(CMAKE_CXX_FLAGS "-target powerpc64le-linux-gnu -I/usr/powerpc64le-linux-gnu/include -I/usr/powerpc64le-linux-gnu/include/c++/10/powerpc64le-linux-gnu -mcpu=power9 -mtune=power9 -DNO_WARN_X86_INTRINSICS -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSSE3__")
set(CMAKE_C_FLAGS "-target powerpc64le-linux-gnu -I/usr/powerpc64le-linux-gnu/include -mcpu=power9 -mtune=power9 -DNO_WARN_X86_INTRINSICS -D__MMX__ -D__SSE__ -D__SSSE3__")
set(CMAKE_CXX_FLAGS "-target powerpc64le-linux-gnu -I/usr/powerpc64le-linux-gnu/include -I/usr/powerpc64le-linux-gnu/include/c++/10/powerpc64le-linux-gnu -mcpu=power9 -mtune=power9 -DNO_WARN_X86_INTRINSICS -D__MMX__ -D__SSE__ -D__SSSE3__")
# cache flags
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags")
......
......@@ -8,8 +8,8 @@ set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_C_FLAGS "-mcpu=power9 -mtune=power9 -DNO_WARN_X86_INTRINSICS -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSSE3__")
set(CMAKE_CXX_FLAGS "-mcpu=power9 -mtune=power9 -DNO_WARN_X86_INTRINSICS -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSSE3__")
set(CMAKE_C_FLAGS "-mcpu=power9 -mtune=power9 -DNO_WARN_X86_INTRINSICS -D__MMX__ -D__SSE__ -D__SSSE3__")
set(CMAKE_CXX_FLAGS "-mcpu=power9 -mtune=power9 -DNO_WARN_X86_INTRINSICS -D__MMX__ -D__SSE__ -D__SSSE3__")
# cache flags
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags")
......
......@@ -313,6 +313,7 @@ set(pnnx_pass_level5_SRCS
pass_level5/eliminate_maxpool_indices.cpp
pass_level5/eliminate_noop_cat.cpp
pass_level5/eliminate_noop_einsum.cpp
pass_level5/eliminate_noop_expand.cpp
pass_level5/eliminate_noop_expression.cpp
pass_level5/eliminate_noop_pad.cpp
pass_level5/eliminate_noop_upsample.cpp
......
......@@ -47,8 +47,9 @@ public:
const torch::jit::Node* convolution = find_node_by_kind(graph, "aten::_convolution");
const torch::jit::Node* convolution_mode = find_node_by_kind(graph, "aten::_convolution_mode");
// const torch::jit::Node* reflection_pad3d = find_node_by_kind(graph, "aten::reflection_pad3d");
// const torch::jit::Node* replication_pad3d = find_node_by_kind(graph, "aten::replication_pad3d");
const torch::jit::Node* pad = find_node_by_kind(graph, "aten::pad");
const torch::jit::Node* reflection_pad3d = find_node_by_kind(graph, "aten::reflection_pad3d");
const torch::jit::Node* replication_pad3d = find_node_by_kind(graph, "aten::replication_pad3d");
if (convolution_mode)
{
......@@ -62,45 +63,64 @@ public:
op->params["out_channels"] = weight.size(0);
op->params["kernel_size"] = Parameter{weight.size(2), weight.size(3), weight.size(4)};
op->params["stride"] = convolution->namedInput("stride");
// if (reflection_pad3d)
// {
// op->params["padding_mode"] = "reflect";
// op->params["padding"] = reflection_pad3d->namedInput("padding");
// std::vector<int>& padding = op->params["padding"].ai;
// if (padding.size() == 6)
// {
// // Conv3d only accepts tuple of three integers
// if (padding[0] == padding[1] && padding[1] == padding[2] && padding[2] == padding[3] && padding[3] == padding[4] && padding[4] == padding[5])
// {
// padding.resize(3);
// }
// else if (padding[0] == padding[3] && padding[1] == padding[4] && padding[2] == padding[5] && padding[0] != padding[1] && padding[1] != padding[2])
// {
// padding.resize(0);
// op->params["padding"].s = "same";
// }
// }
// }
// else if (replication_pad3d)
// {
// op->params["padding_mode"] = "replicate";
// op->params["padding"] = replication_pad3d->namedInput("padding");
// std::vector<int>& padding = op->params["padding"].ai;
// if (padding.size() == 6)
// {
// // Conv3d only accepts tuple of three integers
// if (padding[0] == padding[1] && padding[1] == padding[2] && padding[2] == padding[3] && padding[3] == padding[4] && padding[4] == padding[5])
// {
// padding.resize(3);
// }
// else if (padding[0] == padding[3] && padding[1] == padding[4] && padding[2] == padding[5] && padding[0] != padding[1] && padding[1] != padding[2])
// {
// padding.resize(0);
// op->params["padding"].s = "same";
// }
// }
// }
// else
if (pad)
{
op->params["padding_mode"] = pad->namedInput("mode");
op->params["padding"] = pad->namedInput("pad");
std::vector<int>& padding = op->params["padding"].ai;
if (padding.size() == 6)
{
// Conv3d only accepts tuple of three integers
if (padding[0] == padding[1] && padding[1] == padding[2] && padding[2] == padding[3] && padding[3] == padding[4] && padding[4] == padding[5])
{
padding.resize(3);
}
else if (padding[0] == padding[3] && padding[1] == padding[4] && padding[2] == padding[5] && padding[0] != padding[1] && padding[1] != padding[2])
{
padding.resize(0);
op->params["padding"].s = "same";
}
}
}
else if (reflection_pad3d)
{
op->params["padding_mode"] = "reflect";
op->params["padding"] = reflection_pad3d->namedInput("padding");
std::vector<int>& padding = op->params["padding"].ai;
if (padding.size() == 6)
{
// Conv3d only accepts tuple of three integers
if (padding[0] == padding[1] && padding[1] == padding[2] && padding[2] == padding[3] && padding[3] == padding[4] && padding[4] == padding[5])
{
padding.resize(3);
}
else if (padding[0] == padding[3] && padding[1] == padding[4] && padding[2] == padding[5] && padding[0] != padding[1] && padding[1] != padding[2])
{
padding.resize(0);
op->params["padding"].s = "same";
}
}
}
else if (replication_pad3d)
{
op->params["padding_mode"] = "replicate";
op->params["padding"] = replication_pad3d->namedInput("padding");
std::vector<int>& padding = op->params["padding"].ai;
if (padding.size() == 6)
{
// Conv3d only accepts tuple of three integers
if (padding[0] == padding[1] && padding[1] == padding[2] && padding[2] == padding[3] && padding[3] == padding[4] && padding[4] == padding[5])
{
padding.resize(3);
}
else if (padding[0] == padding[3] && padding[1] == padding[4] && padding[2] == padding[5] && padding[0] != padding[1] && padding[1] != padding[2])
{
padding.resize(0);
op->params["padding"].s = "same";
}
}
}
else
{
op->params["padding_mode"] = "zeros";
op->params["padding"] = convolution->namedInput("padding");
......
......@@ -20,6 +20,7 @@
#include "pass_level5/eliminate_identity_operator.h"
#include "pass_level5/eliminate_noop_cat.h"
#include "pass_level5/eliminate_noop_einsum.h"
#include "pass_level5/eliminate_noop_expand.h"
#include "pass_level5/eliminate_noop_expression.h"
#include "pass_level5/eliminate_noop_pad.h"
#include "pass_level5/eliminate_noop_upsample.h"
......@@ -124,6 +125,7 @@ void pass_level5(Graph& g, const std::set<std::string>& foldable_constants, cons
eliminate_noop_view_reshape(g);
eliminate_reshape_shape_expression(g);
eliminate_noop_expand(g);
fuse_channel_shuffle(g);
fuse_layernorm(g);
......
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#include "eliminate_noop_expand.h"
#include <algorithm>
#include "pass_level2.h"
namespace pnnx {
// Remove Tensor.expand / Tensor.expand_as operators whose effect is already
// provided by the implicit broadcasting of the pnnx.Expression consumers that
// follow them, rewiring those consumers to read the pre-expand operand directly.
// The pass mutates `graph` in place and deletes the eliminated operator and its
// output operand.
void eliminate_noop_expand(Graph& graph)
{
    // Restart the scan after every removal: erasing entries from graph.ops
    // invalidates index i, so one match per pass, loop until a fixed point.
    while (1)
    {
        bool matched = false;

        for (size_t i = 0; i < graph.ops.size(); i++)
        {
            Operator* op = graph.ops[i];

            if (op->type != "Tensor.expand_as" && op->type != "Tensor.expand")
                continue;

            Operand* expand_out = op->outputs[0];

            // The expand can only be dropped when every consumer is an
            // expression op, since those broadcast their inputs implicitly.
            bool all_consumers_are_expr = true;
            for (auto& x : expand_out->consumers)
            {
                if (x->type != "pnnx.Expression")
                {
                    all_consumers_are_expr = false;
                    break;
                }
            }

            if (!all_consumers_are_expr)
                continue;

            // Tensor.expand_as expand 2 1 in b in2
            // pnnx.Expression add 2 1 in2 b out
            const std::vector<int>& inshape = op->inputs[0]->shape;

            // Unknown input shape - cannot prove the expand is a no-op.
            if (inshape.empty())
                continue;

            // Check that the pre-expand shape can be binary-broadcast to every
            // consumer's output shape: same rank, and each dimension either
            // equal (and static) or broadcastable via a 1 on either side.
            // NOTE(review): -1 appears to mark a dynamic dimension here, which
            // disqualifies the equal-dims fast path - confirm against ir.h.
            bool noop_expand = true;
            for (auto& x : expand_out->consumers)
            {
                const std::vector<int>& outshape = x->outputs[0]->shape;
                if (outshape.empty())
                {
                    noop_expand = false;
                    break;
                }

                // check if inshape can be binary broadcast to outshape
                if (inshape.size() != outshape.size())
                {
                    noop_expand = false;
                    break;
                }

                for (size_t j = 0; j < inshape.size(); j++)
                {
                    if ((inshape[j] == outshape[j] && outshape[j] != -1) || inshape[j] == 1 || outshape[j] == 1)
                        continue;

                    // This break only leaves the per-dimension loop; the
                    // consumer loop keeps running but noop_expand stays false.
                    noop_expand = false;
                    break;
                }
            }

            // check if our expand is the base shape
            // so we do not drop expand for add(expand(x,shape),1.2)
            // i.e. broadcasting the expression's OTHER inputs against inshape
            // must already yield the consumer's output shape.
            for (auto& x : expand_out->consumers)
            {
                const std::vector<int>& outshape = x->outputs[0]->shape;

                std::vector<int> broadcasted_shape = inshape;
                for (const auto& r : x->inputs)
                {
                    // Skip the expand output itself; skip rank-mismatched
                    // operands (e.g. scalars), which cannot raise any dim.
                    if (r == expand_out)
                        continue;

                    if (r->shape.size() != inshape.size())
                        continue;

                    for (size_t j = 0; j < broadcasted_shape.size(); j++)
                    {
                        broadcasted_shape[j] = std::max(broadcasted_shape[j], r->shape[j]);
                    }
                }

                if (broadcasted_shape != outshape)
                {
                    noop_expand = false;
                    break;
                }
            }

            if (!noop_expand)
                continue;

            // delete noop-like expand
            matched = true;

            // Detach op from its inputs' consumer lists.
            for (auto& x : op->inputs)
            {
                x->remove_consumer(op);
            }

            // Rewire every consumer of the expand output to the expand input.
            for (auto& x : expand_out->consumers)
            {
                for (size_t j = 0; j < x->inputs.size(); j++)
                {
                    if (x->inputs[j] == expand_out)
                        x->inputs[j] = op->inputs[0];
                }

                op->inputs[0]->consumers.push_back(x);
            }

            // Keep the downstream-visible operand name stable by renaming the
            // surviving input to the deleted output's name.
            op->inputs[0]->name = expand_out->name;

            expand_out->producer = 0;
            expand_out->consumers.clear();

            // Unlink and free the dead operand and operator.
            graph.operands.erase(std::find(graph.operands.begin(), graph.operands.end(), expand_out));
            delete expand_out;

            op->inputs.clear();
            op->outputs.clear();

            graph.ops.erase(graph.ops.begin() + i);
            delete op;

            // Restart the outer while-loop scan from the beginning.
            break;
        }

        if (!matched)
            break;
    }
}
} // namespace pnnx
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#include "ir.h"

namespace pnnx {

// pass_level5: remove Tensor.expand / Tensor.expand_as operators made
// redundant by the implicit broadcasting of their pnnx.Expression consumers.
void eliminate_noop_expand(Graph& graph);

} // namespace pnnx
......@@ -26,7 +26,7 @@ public:
return R"PNNXIR(7767517
3 2
pnnx.Input input 0 1 input
nn.Conv1d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=%padding_mode padding=%padding dilation=%dilation groups=1 bias=%bias @weight @bias
nn.Conv1d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=zeros padding=%padding dilation=%dilation groups=1 bias=%bias @weight @bias
pnnx.Output output 1 0 out
)PNNXIR";
}
......@@ -43,12 +43,6 @@ pnnx.Output output 1 0 out
void write(Operator* op, const std::map<std::string, Parameter>& captured_params, const std::map<std::string, Attribute>& captured_attrs) const
{
std::string padding_mode = captured_params.at("padding_mode").s;
if (padding_mode != "zeros")
{
fprintf(stderr, "unsupported padding_mode %s\n", padding_mode.c_str());
}
op->params["0"] = captured_params.at("out_channels");
op->params["1"] = captured_params.at("kernel_size").ai[0];
op->params["2"] = captured_params.at("dilation").ai[0];
......@@ -83,7 +77,7 @@ public:
return R"PNNXIR(7767517
3 2
pnnx.Input input 0 1 input
nn.Conv1d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=%padding_mode padding=%padding dilation=%dilation groups=%groups bias=%bias @weight @bias
nn.Conv1d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=zeros padding=%padding dilation=%dilation groups=%groups bias=%bias @weight @bias
pnnx.Output output 1 0 out
)PNNXIR";
}
......@@ -100,12 +94,6 @@ pnnx.Output output 1 0 out
void write(Operator* op, const std::map<std::string, Parameter>& captured_params, const std::map<std::string, Attribute>& captured_attrs) const
{
std::string padding_mode = captured_params.at("padding_mode").s;
if (padding_mode != "zeros")
{
fprintf(stderr, "unsupported padding_mode %s\n", padding_mode.c_str());
}
op->params["0"] = captured_params.at("out_channels");
op->params["1"] = captured_params.at("kernel_size").ai[0];
op->params["2"] = captured_params.at("dilation").ai[0];
......@@ -133,8 +121,151 @@ pnnx.Output output 1 0 out
}
};
// Converts nn.Conv1d with a non-"zeros" padding_mode into an explicit ncnn
// Padding layer (reflect or replicate) followed by a Convolution1D that
// itself applies no padding.
class nn_Conv1d_2 : public GraphRewriterPass
{
public:
// Pattern: a lone nn.Conv1d with groups=1; padding_mode is captured so the
// match() overload below can reject the plain "zeros" case.
const char* match_pattern_graph() const
{
return R"PNNXIR(7767517
3 2
pnnx.Input input 0 1 input
nn.Conv1d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=%padding_mode padding=%padding dilation=%dilation groups=1 bias=%bias @weight @bias
pnnx.Output output 1 0 out
)PNNXIR";
}
// Replacement chain: input -> Padding -> Convolution1D -> output.
const char* replace_pattern_graph() const
{
return R"PNNXIR(7767517
4 3
pnnx.Input input 0 1 input
Padding pad 1 1 input a
Convolution1D conv 1 1 a out
pnnx.Output output 1 0 out
)PNNXIR";
}
// Only rewrite non-"zeros" padding modes; zero padding is handled by the
// regular nn_Conv1d converters registered at lower priority.
bool match(const std::map<std::string, Parameter>& captured_params) const
{
const std::string& padding_mode = captured_params.at("padding_mode").s;
if (padding_mode == "zeros")
return false;
return true;
}
// padding="same" can only be resolved to concrete pad sizes when the input
// width is known, so insist on a 2-d or 3-d input shape in that case.
bool match(const std::map<std::string, const Operator*>& matched_operators) const
{
const Operator* conv = matched_operators.at("op_0");
if (conv->params.at("padding").type == 4 && conv->params.at("padding").s == "same")
{
const std::vector<int> input_shape = conv->inputs[0]->shape;
if (input_shape.size() != 2 && input_shape.size() != 3)
{
fprintf(stderr, "can not resolve pads without shape\n");
return false;
}
}
return true;
}
// Fill in the Padding and Convolution1D parameters from the captured
// nn.Conv1d arguments.
void write(const std::map<std::string, Operator*>& ops, const std::map<std::string, Parameter>& captured_params, const std::map<std::string, Attribute>& captured_attrs) const
{
// Resolve the {left, right} pad sizes.
std::vector<int> padding;
if (captured_params.at("padding").type == 4)
{
// string-valued padding: "same" or "valid"
if (captured_params.at("padding").s == "same")
{
// resolve pads
const std::vector<int> input_shape = ops.at("pad")->inputs[0]->shape;
const int w = input_shape[input_shape.size() - 1];
const int kernel_w = captured_params.at("kernel_size").ai[0];
const int dilation_w = captured_params.at("dilation").ai[0];
const int stride_w = captured_params.at("stride").ai[0];
// effective kernel span once dilation is applied
const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
// split as evenly as possible; odd remainder goes to the second side
padding = std::vector<int>{wpad / 2, wpad - wpad / 2};
}
else if (captured_params.at("padding").s == "valid")
{
padding = std::vector<int>{0, 0};
}
}
else
{
// numeric padding: same amount on both sides
int wpad = captured_params.at("padding").ai[0];
padding = std::vector<int>{wpad, wpad};
}
// 1-d data: params 0/1 stay zero, the horizontal pads go into params 2/3
ops.at("pad")->params["0"] = 0;
ops.at("pad")->params["1"] = 0;
ops.at("pad")->params["2"] = padding[0];
ops.at("pad")->params["3"] = padding[1];
std::string padding_mode = captured_params.at("padding_mode").s;
if (padding_mode == "reflect")
{
ops.at("pad")->params["4"] = 2; // type=reflect
}
else if (padding_mode == "replicate")
{
ops.at("pad")->params["4"] = 1; // type=replicate
}
else
{
// any other mode (e.g. "circular") is not converted here
fprintf(stderr, "unsupported padding_mode %s\n", padding_mode.c_str());
}
// The convolution runs with zero padding (param 4) since the Padding
// layer above has already been applied.
ops.at("conv")->params["0"] = captured_params.at("out_channels");
ops.at("conv")->params["1"] = captured_params.at("kernel_size").ai[0];
ops.at("conv")->params["2"] = captured_params.at("dilation").ai[0];
ops.at("conv")->params["3"] = captured_params.at("stride").ai[0];
ops.at("conv")->params["4"] = 0;
ops.at("conv")->params["5"] = captured_params.at("bias").b ? 1 : 0;
ops.at("conv")->params["6"] = captured_attrs.at("op_0.weight").elemcount();
// "groups" is only captured by the depthwise subclass pattern; default 1 here
ops.at("conv")->params["7"] = captured_params.find("groups") != captured_params.end() ? captured_params.at("groups") : 1;
ops.at("conv")->attrs["0"] = Attribute();
ops.at("conv")->attrs["0"].data = {0, 0, 0, 0};
ops.at("conv")->attrs["1"] = captured_attrs.at("op_0.weight");
if (captured_params.at("bias").b)
ops.at("conv")->attrs["2"] = captured_attrs.at("op_0.bias");
}
};
// Grouped variant of nn_Conv1d_2: matches nn.Conv1d with an arbitrary
// %groups capture and emits ConvolutionDepthWise1D instead of Convolution1D.
// All matching/writing logic is inherited from nn_Conv1d_2.
class nn_Conv1d_3 : public nn_Conv1d_2
{
public:
// Same pattern as the base class except groups=%groups is captured.
const char* match_pattern_graph() const
{
return R"PNNXIR(7767517
3 2
pnnx.Input input 0 1 input
nn.Conv1d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=%padding_mode padding=%padding dilation=%dilation groups=%groups bias=%bias @weight @bias
pnnx.Output output 1 0 out
)PNNXIR";
}
// Replacement uses the grouped convolution layer.
const char* replace_pattern_graph() const
{
return R"PNNXIR(7767517
4 3
pnnx.Input input 0 1 input
Padding pad 1 1 input a
ConvolutionDepthWise1D conv 1 1 a out
pnnx.Output output 1 0 out
)PNNXIR";
}
};
// Register the Conv1d converters. nn_Conv1d/_1 match padding_mode=zeros;
// nn_Conv1d_2/_3 handle reflect/replicate via an inserted Padding layer.
// NOTE(review): the second argument is presumably the pass priority — confirm
// against the GraphRewriterPass registration framework.
REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_Conv1d, 20)
REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_Conv1d_1, 21)
REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_Conv1d_2, 22)
REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_Conv1d_3, 23)
} // namespace ncnn
......
......@@ -26,7 +26,7 @@ public:
return R"PNNXIR(7767517
3 2
pnnx.Input input 0 1 input
nn.Conv2d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=%padding_mode padding=%padding dilation=%dilation groups=1 bias=%bias @weight @bias
nn.Conv2d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=zeros padding=%padding dilation=%dilation groups=1 bias=%bias @weight @bias
pnnx.Output output 1 0 out
)PNNXIR";
}
......@@ -43,12 +43,6 @@ pnnx.Output output 1 0 out
void write(Operator* op, const std::map<std::string, Parameter>& captured_params, const std::map<std::string, Attribute>& captured_attrs) const
{
std::string padding_mode = captured_params.at("padding_mode").s;
if (padding_mode != "zeros")
{
fprintf(stderr, "unsupported padding_mode %s\n", padding_mode.c_str());
}
op->params["0"] = captured_params.at("out_channels");
op->params["1"] = captured_params.at("kernel_size").ai[1];
op->params["11"] = captured_params.at("kernel_size").ai[0];
......@@ -87,7 +81,7 @@ public:
return R"PNNXIR(7767517
3 2
pnnx.Input input 0 1 input
nn.Conv2d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=%padding_mode padding=%padding dilation=%dilation groups=%groups bias=%bias @weight @bias
nn.Conv2d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=zeros padding=%padding dilation=%dilation groups=%groups bias=%bias @weight @bias
pnnx.Output output 1 0 out
)PNNXIR";
}
......@@ -104,12 +98,6 @@ pnnx.Output output 1 0 out
void write(Operator* op, const std::map<std::string, Parameter>& captured_params, const std::map<std::string, Attribute>& captured_attrs) const
{
std::string padding_mode = captured_params.at("padding_mode").s;
if (padding_mode != "zeros")
{
fprintf(stderr, "unsupported padding_mode %s\n", padding_mode.c_str());
}
op->params["0"] = captured_params.at("out_channels");
op->params["1"] = captured_params.at("kernel_size").ai[1];
op->params["11"] = captured_params.at("kernel_size").ai[0];
......@@ -141,8 +129,162 @@ pnnx.Output output 1 0 out
}
};
// Converts nn.Conv2d with a non-"zeros" padding_mode into an explicit ncnn
// Padding layer (reflect or replicate) followed by a Convolution that itself
// applies no padding.
class nn_Conv2d_2 : public GraphRewriterPass
{
public:
// Pattern: a lone nn.Conv2d with groups=1; padding_mode is captured so the
// match() overload below can reject the plain "zeros" case.
const char* match_pattern_graph() const
{
return R"PNNXIR(7767517
3 2
pnnx.Input input 0 1 input
nn.Conv2d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=%padding_mode padding=%padding dilation=%dilation groups=1 bias=%bias @weight @bias
pnnx.Output output 1 0 out
)PNNXIR";
}
// Replacement chain: input -> Padding -> Convolution -> output.
const char* replace_pattern_graph() const
{
return R"PNNXIR(7767517
4 3
pnnx.Input input 0 1 input
Padding pad 1 1 input a
Convolution conv 1 1 a out
pnnx.Output output 1 0 out
)PNNXIR";
}
// Only rewrite non-"zeros" padding modes; zero padding is handled by the
// regular nn_Conv2d converters registered at lower priority.
bool match(const std::map<std::string, Parameter>& captured_params) const
{
const std::string& padding_mode = captured_params.at("padding_mode").s;
if (padding_mode == "zeros")
return false;
return true;
}
// padding="same" can only be resolved when the input height/width are
// known, so insist on a 3-d or 4-d input shape in that case.
bool match(const std::map<std::string, const Operator*>& matched_operators) const
{
const Operator* conv = matched_operators.at("op_0");
if (conv->params.at("padding").type == 4 && conv->params.at("padding").s == "same")
{
const std::vector<int> input_shape = conv->inputs[0]->shape;
if (input_shape.size() != 3 && input_shape.size() != 4)
{
fprintf(stderr, "can not resolve pads without shape\n");
return false;
}
}
return true;
}
// Fill in the Padding and Convolution parameters from the captured
// nn.Conv2d arguments.
void write(const std::map<std::string, Operator*>& ops, const std::map<std::string, Parameter>& captured_params, const std::map<std::string, Attribute>& captured_attrs) const
{
// Resolve pads as {top, bottom, left, right}.
std::vector<int> padding;
if (captured_params.at("padding").type == 4)
{
// string-valued padding: "same" or "valid"
if (captured_params.at("padding").s == "same")
{
// resolve pads
const std::vector<int> input_shape = ops.at("pad")->inputs[0]->shape;
const int w = input_shape[input_shape.size() - 1];
const int h = input_shape[input_shape.size() - 2];
// kernel_size/dilation/stride arrays are ordered (h, w)
const int kernel_w = captured_params.at("kernel_size").ai[1];
const int kernel_h = captured_params.at("kernel_size").ai[0];
const int dilation_w = captured_params.at("dilation").ai[1];
const int dilation_h = captured_params.at("dilation").ai[0];
const int stride_w = captured_params.at("stride").ai[1];
const int stride_h = captured_params.at("stride").ai[0];
// effective kernel spans once dilation is applied
const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
// split evenly per axis; odd remainder goes to the second side
padding = std::vector<int>{hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2};
}
else if (captured_params.at("padding").s == "valid")
{
padding = std::vector<int>{0, 0, 0, 0};
}
}
else
{
// numeric padding: symmetric per axis, ordered (h, w)
int hpad = captured_params.at("padding").ai[0];
int wpad = captured_params.at("padding").ai[1];
padding = std::vector<int>{hpad, hpad, wpad, wpad};
}
// params 0/1 take the vertical pads, params 2/3 the horizontal pads
ops.at("pad")->params["0"] = padding[0];
ops.at("pad")->params["1"] = padding[1];
ops.at("pad")->params["2"] = padding[2];
ops.at("pad")->params["3"] = padding[3];
std::string padding_mode = captured_params.at("padding_mode").s;
if (padding_mode == "reflect")
{
ops.at("pad")->params["4"] = 2; // type=reflect
}
else if (padding_mode == "replicate")
{
ops.at("pad")->params["4"] = 1; // type=replicate
}
else
{
// any other mode (e.g. "circular") is not converted here
fprintf(stderr, "unsupported padding_mode %s\n", padding_mode.c_str());
}
// The convolution runs with zero padding (params 4/14) since the Padding
// layer above has already been applied.
ops.at("conv")->params["0"] = captured_params.at("out_channels");
ops.at("conv")->params["1"] = captured_params.at("kernel_size").ai[1];
ops.at("conv")->params["11"] = captured_params.at("kernel_size").ai[0];
ops.at("conv")->params["2"] = captured_params.at("dilation").ai[1];
ops.at("conv")->params["12"] = captured_params.at("dilation").ai[0];
ops.at("conv")->params["3"] = captured_params.at("stride").ai[1];
ops.at("conv")->params["13"] = captured_params.at("stride").ai[0];
ops.at("conv")->params["4"] = 0;
ops.at("conv")->params["14"] = 0;
ops.at("conv")->params["5"] = captured_params.at("bias").b ? 1 : 0;
ops.at("conv")->params["6"] = captured_attrs.at("op_0.weight").elemcount();
// "groups" is only captured by the depthwise subclass pattern; default 1 here
ops.at("conv")->params["7"] = captured_params.find("groups") != captured_params.end() ? captured_params.at("groups") : 1;
ops.at("conv")->attrs["0"] = Attribute();
ops.at("conv")->attrs["0"].data = {0, 0, 0, 0};
ops.at("conv")->attrs["1"] = captured_attrs.at("op_0.weight");
if (captured_params.at("bias").b)
ops.at("conv")->attrs["2"] = captured_attrs.at("op_0.bias");
}
};
// Grouped variant of nn_Conv2d_2: matches nn.Conv2d with an arbitrary
// %groups capture and emits ConvolutionDepthWise instead of Convolution.
// All matching/writing logic is inherited from nn_Conv2d_2.
class nn_Conv2d_3 : public nn_Conv2d_2
{
public:
// Same pattern as the base class except groups=%groups is captured.
const char* match_pattern_graph() const
{
return R"PNNXIR(7767517
3 2
pnnx.Input input 0 1 input
nn.Conv2d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=%padding_mode padding=%padding dilation=%dilation groups=%groups bias=%bias @weight @bias
pnnx.Output output 1 0 out
)PNNXIR";
}
// Replacement uses the grouped convolution layer.
const char* replace_pattern_graph() const
{
return R"PNNXIR(7767517
4 3
pnnx.Input input 0 1 input
Padding pad 1 1 input a
ConvolutionDepthWise conv 1 1 a out
pnnx.Output output 1 0 out
)PNNXIR";
}
};
// Register the Conv2d converters. nn_Conv2d/_1 match padding_mode=zeros;
// nn_Conv2d_2/_3 handle reflect/replicate via an inserted Padding layer.
// NOTE(review): the second argument is presumably the pass priority — confirm
// against the GraphRewriterPass registration framework.
REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_Conv2d, 20)
REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_Conv2d_1, 21)
REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_Conv2d_2, 22)
REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_Conv2d_3, 23)
} // namespace ncnn
......
......@@ -26,7 +26,7 @@ public:
return R"PNNXIR(7767517
3 2
pnnx.Input input 0 1 input
nn.Conv3d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=%padding_mode padding=%padding dilation=%dilation groups=1 bias=%bias @weight @bias
nn.Conv3d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=zeros padding=%padding dilation=%dilation groups=1 bias=%bias @weight @bias
pnnx.Output output 1 0 out
)PNNXIR";
}
......@@ -43,12 +43,6 @@ pnnx.Output output 1 0 out
void write(Operator* op, const std::map<std::string, Parameter>& captured_params, const std::map<std::string, Attribute>& captured_attrs) const
{
std::string padding_mode = captured_params.at("padding_mode").s;
if (padding_mode != "zeros")
{
fprintf(stderr, "unsupported padding_mode %s\n", padding_mode.c_str());
}
op->params["0"] = captured_params.at("out_channels");
op->params["1"] = captured_params.at("kernel_size").ai[2];
op->params["11"] = captured_params.at("kernel_size").ai[1];
......@@ -91,7 +85,7 @@ public:
return R"PNNXIR(7767517
3 2
pnnx.Input input 0 1 input
nn.Conv3d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=%padding_mode padding=%padding dilation=%dilation groups=%groups bias=%bias @weight @bias
nn.Conv3d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=zeros padding=%padding dilation=%dilation groups=%groups bias=%bias @weight @bias
pnnx.Output output 1 0 out
)PNNXIR";
}
......@@ -108,12 +102,6 @@ pnnx.Output output 1 0 out
void write(Operator* op, const std::map<std::string, Parameter>& captured_params, const std::map<std::string, Attribute>& captured_attrs) const
{
std::string padding_mode = captured_params.at("padding_mode").s;
if (padding_mode != "zeros")
{
fprintf(stderr, "unsupported padding_mode %s\n", padding_mode.c_str());
}
op->params["0"] = captured_params.at("out_channels");
op->params["1"] = captured_params.at("kernel_size").ai[2];
op->params["11"] = captured_params.at("kernel_size").ai[1];
......@@ -149,8 +137,175 @@ pnnx.Output output 1 0 out
}
};
// Converts nn.Conv3d with a non-"zeros" padding_mode into an explicit ncnn
// Padding layer (reflect or replicate) followed by a Convolution3D that
// itself applies no padding.
class nn_Conv3d_2 : public GraphRewriterPass
{
public:
// Pattern: a lone nn.Conv3d with groups=1; padding_mode is captured so the
// match() overload below can reject the plain "zeros" case.
const char* match_pattern_graph() const
{
return R"PNNXIR(7767517
3 2
pnnx.Input input 0 1 input
nn.Conv3d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=%padding_mode padding=%padding dilation=%dilation groups=1 bias=%bias @weight @bias
pnnx.Output output 1 0 out
)PNNXIR";
}
// Replacement chain: input -> Padding -> Convolution3D -> output.
const char* replace_pattern_graph() const
{
return R"PNNXIR(7767517
4 3
pnnx.Input input 0 1 input
Padding pad 1 1 input a
Convolution3D conv 1 1 a out
pnnx.Output output 1 0 out
)PNNXIR";
}
// Only rewrite non-"zeros" padding modes; zero padding is handled by the
// regular nn_Conv3d converters registered at lower priority.
bool match(const std::map<std::string, Parameter>& captured_params) const
{
const std::string& padding_mode = captured_params.at("padding_mode").s;
if (padding_mode == "zeros")
return false;
return true;
}
// padding="same" can only be resolved when the input depth/height/width are
// known, so insist on a 4-d or 5-d input shape in that case.
bool match(const std::map<std::string, const Operator*>& matched_operators) const
{
const Operator* conv = matched_operators.at("op_0");
if (conv->params.at("padding").type == 4 && conv->params.at("padding").s == "same")
{
const std::vector<int> input_shape = conv->inputs[0]->shape;
if (input_shape.size() != 4 && input_shape.size() != 5)
{
fprintf(stderr, "can not resolve pads without shape\n");
return false;
}
}
return true;
}
// Fill in the Padding and Convolution3D parameters from the captured
// nn.Conv3d arguments.
void write(const std::map<std::string, Operator*>& ops, const std::map<std::string, Parameter>& captured_params, const std::map<std::string, Attribute>& captured_attrs) const
{
// Resolve pads as {top, bottom, left, right, front, behind}.
std::vector<int> padding;
if (captured_params.at("padding").type == 4)
{
// string-valued padding: "same" or "valid"
if (captured_params.at("padding").s == "same")
{
// resolve pads
const std::vector<int> input_shape = ops.at("pad")->inputs[0]->shape;
const int w = input_shape[input_shape.size() - 1];
const int h = input_shape[input_shape.size() - 2];
const int d = input_shape[input_shape.size() - 3];
// kernel_size/dilation/stride arrays are ordered (d, h, w)
const int kernel_w = captured_params.at("kernel_size").ai[2];
const int kernel_h = captured_params.at("kernel_size").ai[1];
const int kernel_d = captured_params.at("kernel_size").ai[0];
const int dilation_w = captured_params.at("dilation").ai[2];
const int dilation_h = captured_params.at("dilation").ai[1];
const int dilation_d = captured_params.at("dilation").ai[0];
const int stride_w = captured_params.at("stride").ai[2];
const int stride_h = captured_params.at("stride").ai[1];
const int stride_d = captured_params.at("stride").ai[0];
// effective kernel spans once dilation is applied
const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
const int kernel_extent_d = dilation_d * (kernel_d - 1) + 1;
int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
int dpad = kernel_extent_d + (d - 1) / stride_d * stride_d - d;
// split evenly per axis; odd remainder goes to the second side
padding = std::vector<int>{hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, dpad / 2, dpad - dpad / 2};
}
else if (captured_params.at("padding").s == "valid")
{
padding = std::vector<int>{0, 0, 0, 0, 0, 0};
}
}
else
{
// numeric padding: symmetric per axis, ordered (d, h, w)
int dpad = captured_params.at("padding").ai[0];
int hpad = captured_params.at("padding").ai[1];
int wpad = captured_params.at("padding").ai[2];
padding = std::vector<int>{hpad, hpad, wpad, wpad, dpad, dpad};
}
// params 0/1 take the vertical pads, 2/3 the horizontal pads,
// and 7/8 the depth pads
ops.at("pad")->params["0"] = padding[0];
ops.at("pad")->params["1"] = padding[1];
ops.at("pad")->params["2"] = padding[2];
ops.at("pad")->params["3"] = padding[3];
ops.at("pad")->params["7"] = padding[4];
ops.at("pad")->params["8"] = padding[5];
std::string padding_mode = captured_params.at("padding_mode").s;
if (padding_mode == "reflect")
{
ops.at("pad")->params["4"] = 2; // type=reflect
}
else if (padding_mode == "replicate")
{
ops.at("pad")->params["4"] = 1; // type=replicate
}
else
{
// any other mode (e.g. "circular") is not converted here
fprintf(stderr, "unsupported padding_mode %s\n", padding_mode.c_str());
}
// The convolution runs with zero padding (params 4/14/24) since the
// Padding layer above has already been applied.
ops.at("conv")->params["0"] = captured_params.at("out_channels");
ops.at("conv")->params["1"] = captured_params.at("kernel_size").ai[2];
ops.at("conv")->params["11"] = captured_params.at("kernel_size").ai[1];
ops.at("conv")->params["21"] = captured_params.at("kernel_size").ai[0];
ops.at("conv")->params["2"] = captured_params.at("dilation").ai[2];
ops.at("conv")->params["12"] = captured_params.at("dilation").ai[1];
ops.at("conv")->params["22"] = captured_params.at("dilation").ai[0];
ops.at("conv")->params["3"] = captured_params.at("stride").ai[2];
ops.at("conv")->params["13"] = captured_params.at("stride").ai[1];
ops.at("conv")->params["23"] = captured_params.at("stride").ai[0];
ops.at("conv")->params["4"] = 0;
ops.at("conv")->params["14"] = 0;
ops.at("conv")->params["24"] = 0;
ops.at("conv")->params["5"] = captured_params.at("bias").b ? 1 : 0;
ops.at("conv")->params["6"] = captured_attrs.at("op_0.weight").elemcount();
// "groups" is only captured by the depthwise subclass pattern; default 1 here
ops.at("conv")->params["7"] = captured_params.find("groups") != captured_params.end() ? captured_params.at("groups") : 1;
ops.at("conv")->attrs["0"] = Attribute();
ops.at("conv")->attrs["0"].data = {0, 0, 0, 0};
ops.at("conv")->attrs["1"] = captured_attrs.at("op_0.weight");
if (captured_params.at("bias").b)
ops.at("conv")->attrs["2"] = captured_attrs.at("op_0.bias");
}
};
// Grouped variant of nn_Conv3d_2: matches nn.Conv3d with an arbitrary
// %groups capture and emits ConvolutionDepthWise3D instead of Convolution3D.
// All matching/writing logic is inherited from nn_Conv3d_2.
class nn_Conv3d_3 : public nn_Conv3d_2
{
public:
// Same pattern as the base class except groups=%groups is captured.
const char* match_pattern_graph() const
{
return R"PNNXIR(7767517
3 2
pnnx.Input input 0 1 input
nn.Conv3d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=%padding_mode padding=%padding dilation=%dilation groups=%groups bias=%bias @weight @bias
pnnx.Output output 1 0 out
)PNNXIR";
}
// Replacement uses the grouped convolution layer.
const char* replace_pattern_graph() const
{
return R"PNNXIR(7767517
4 3
pnnx.Input input 0 1 input
Padding pad 1 1 input a
ConvolutionDepthWise3D conv 1 1 a out
pnnx.Output output 1 0 out
)PNNXIR";
}
};
// Register the Conv3d converters. nn_Conv3d/_1 match padding_mode=zeros;
// nn_Conv3d_2/_3 handle reflect/replicate via an inserted Padding layer.
// NOTE(review): the second argument is presumably the pass priority — confirm
// against the GraphRewriterPass registration framework.
REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_Conv3d, 20)
REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_Conv3d_1, 21)
REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_Conv3d_2, 22)
REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_Conv3d_3, 23)
} // namespace ncnn
......
......@@ -292,6 +292,7 @@ pnnx_add_test(vit_b_32)
#pnnx_add_test(quantization_shufflenet_v2_x1_0)
pnnx_add_test(pnnx_eliminate_noop_cat)
pnnx_add_test(pnnx_eliminate_noop_expand)
pnnx_add_test(pnnx_eliminate_noop_math)
pnnx_add_test(pnnx_eliminate_noop_upsample)
pnnx_add_test(pnnx_fold_constant)
......
......@@ -234,6 +234,9 @@ def test():
b = test_ncnn_numpy_binaryop_broadcast_ncnn.test_inference()
for a0, b0 in zip(a, b):
# allclose may auto broadcast compare
if a0.shape != b0.shape:
return False
if not torch.allclose(a0, b0, 1e-4, 1e-4):
return False
return True
......
......@@ -30,8 +30,8 @@ class Model(nn.Module):
else:
self.conv_3 = nn.Conv1d(in_channels=24, out_channels=28, kernel_size=5, stride=1, padding='valid', dilation=1, groups=4, bias=True)
self.conv_4 = nn.Conv1d(in_channels=28, out_channels=32, kernel_size=3, stride=1, padding='same', dilation=2, groups=2, bias=False, padding_mode='zeros')
#self.conv_5 = nn.Conv1d(in_channels=32, out_channels=32, kernel_size=2, stride=2, padding=3, dilation=1, groups=32, bias=True, padding_mode='reflect')
#self.conv_6 = nn.Conv1d(in_channels=32, out_channels=28, kernel_size=2, stride=1, padding=2, dilation=1, groups=1, bias=False, padding_mode='replicate')
self.conv_5 = nn.Conv1d(in_channels=32, out_channels=32, kernel_size=2, stride=2, padding=3, dilation=1, groups=32, bias=True, padding_mode='reflect')
self.conv_6 = nn.Conv1d(in_channels=32, out_channels=28, kernel_size=2, stride=1, padding=2, dilation=1, groups=1, bias=False, padding_mode='replicate')
def forward(self, x):
x = self.conv_0(x)
......@@ -39,8 +39,8 @@ class Model(nn.Module):
x = self.conv_2(x)
x = self.conv_3(x)
x = self.conv_4(x)
#x = self.conv_5(x)
#x = self.conv_6(x)
x = self.conv_5(x)
x = self.conv_6(x)
return x
......
......@@ -30,8 +30,8 @@ class Model(nn.Module):
else:
self.conv_3 = nn.Conv2d(in_channels=24, out_channels=28, kernel_size=(5,4), stride=1, padding='valid', dilation=1, groups=4, bias=True)
self.conv_4 = nn.Conv2d(in_channels=28, out_channels=32, kernel_size=3, stride=1, padding='same', dilation=(1,2), groups=2, bias=False, padding_mode='zeros')
#self.conv_5 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=2, stride=2, padding=3, dilation=1, groups=32, bias=True, padding_mode='reflect')
#self.conv_6 = nn.Conv2d(in_channels=32, out_channels=28, kernel_size=2, stride=1, padding=2, dilation=1, groups=1, bias=False, padding_mode='replicate')
self.conv_5 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=2, stride=2, padding=3, dilation=1, groups=32, bias=True, padding_mode='reflect')
self.conv_6 = nn.Conv2d(in_channels=32, out_channels=28, kernel_size=2, stride=1, padding=2, dilation=1, groups=1, bias=False, padding_mode='replicate')
def forward(self, x):
x = self.conv_0(x)
......@@ -39,8 +39,8 @@ class Model(nn.Module):
x = self.conv_2(x)
x = self.conv_3(x)
x = self.conv_4(x)
#x = self.conv_5(x)
#x = self.conv_6(x)
x = self.conv_5(x)
x = self.conv_6(x)
return x
......
......@@ -30,9 +30,10 @@ class Model(nn.Module):
else:
self.conv_3 = nn.Conv3d(in_channels=24, out_channels=28, kernel_size=(5,4,3), stride=1, padding='valid', dilation=1, groups=4, bias=True)
self.conv_4 = nn.Conv3d(in_channels=28, out_channels=32, kernel_size=3, stride=1, padding='same', dilation=(1,2,2), groups=2, bias=False, padding_mode='zeros')
#self.conv_5 = nn.Conv3d(in_channels=32, out_channels=32, kernel_size=2, stride=2, padding=3, dilation=1, groups=32, bias=True, padding_mode='reflect')
#self.conv_6 = nn.Conv3d(in_channels=32, out_channels=28, kernel_size=2, stride=1, padding=2, dilation=1, groups=1, bias=False, padding_mode='replicate')
#self.conv_7 = nn.Conv3d(in_channels=28, out_channels=24, kernel_size=3, stride=2, padding=(5,6), dilation=2, groups=1, bias=True, padding_mode='circular')
if version.parse(torch.__version__) >= version.parse('1.10'):
self.conv_5 = nn.Conv3d(in_channels=32, out_channels=32, kernel_size=2, stride=2, padding=3, dilation=1, groups=32, bias=True, padding_mode='reflect')
self.conv_6 = nn.Conv3d(in_channels=32, out_channels=28, kernel_size=2, stride=1, padding=2, dilation=1, groups=1, bias=False, padding_mode='replicate')
# self.conv_7 = nn.Conv3d(in_channels=28, out_channels=24, kernel_size=3, stride=2, padding=(5,6), dilation=2, groups=1, bias=True, padding_mode='circular')
def forward(self, x):
x = self.conv_0(x)
......@@ -40,8 +41,11 @@ class Model(nn.Module):
x = self.conv_2(x)
x = self.conv_3(x)
x = self.conv_4(x)
#x = self.conv_5(x)
#x = self.conv_6(x)
if version.parse(torch.__version__) < version.parse('1.10'):
return x
x = self.conv_5(x)
x = self.conv_6(x)
#x = self.conv_7(x)
return x
......
......@@ -30,9 +30,10 @@ class Model(nn.Module):
else:
self.conv_3 = nn.Conv3d(in_channels=24, out_channels=28, kernel_size=(5,4,3), stride=1, padding='valid', dilation=1, groups=4, bias=True)
self.conv_4 = nn.Conv3d(in_channels=28, out_channels=32, kernel_size=3, stride=1, padding='same', dilation=(1,2,2), groups=2, bias=False, padding_mode='zeros')
#self.conv_5 = nn.Conv3d(in_channels=32, out_channels=32, kernel_size=2, stride=2, padding=3, dilation=1, groups=32, bias=True, padding_mode='reflect')
#self.conv_6 = nn.Conv3d(in_channels=32, out_channels=28, kernel_size=2, stride=1, padding=2, dilation=1, groups=1, bias=False, padding_mode='replicate')
#self.conv_7 = nn.Conv3d(in_channels=28, out_channels=24, kernel_size=3, stride=2, padding=(5,6), dilation=2, groups=1, bias=True, padding_mode='circular')
if version.parse(torch.__version__) >= version.parse('1.10'):
self.conv_5 = nn.Conv3d(in_channels=32, out_channels=32, kernel_size=2, stride=2, padding=3, dilation=1, groups=32, bias=True, padding_mode='reflect')
self.conv_6 = nn.Conv3d(in_channels=32, out_channels=28, kernel_size=2, stride=1, padding=2, dilation=1, groups=1, bias=False, padding_mode='replicate')
# self.conv_7 = nn.Conv3d(in_channels=28, out_channels=24, kernel_size=3, stride=2, padding=(5,6), dilation=2, groups=1, bias=True, padding_mode='circular')
def forward(self, x):
x = self.conv_0(x)
......@@ -40,8 +41,11 @@ class Model(nn.Module):
x = self.conv_2(x)
x = self.conv_3(x)
x = self.conv_4(x)
#x = self.conv_5(x)
#x = self.conv_6(x)
if version.parse(torch.__version__) < version.parse('1.10'):
return x
x = self.conv_5(x)
x = self.conv_6(x)
#x = self.conv_7(x)
return x
......
# Tencent is pleased to support the open source community by making ncnn available.
#
# Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
#
# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# https://opensource.org/licenses/BSD-3-Clause
#
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
import torch
import torch.nn as nn
import torch.nn.functional as F
class Model(nn.Module):
    """Fixture model for the pnnx eliminate_noop_expand test.

    forward() pairs nearly every expand()/expand_as() broadcast combination of
    1-d (x*), 2-d (y*), 3-d (z*) and 4-d (w*) inputs with a subtraction, so
    that each expanded tensor is consumed by a binary op. The trailing
    "negative cases" are expands that must NOT be eliminated.
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, x0, x1, y0, y1, y2, y3, z0, z1, z2, z3, z4, z5, z6, z7, w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15):
        # Inputs carry assorted singleton dimensions; the concrete shapes are
        # chosen by test() below (e.g. w15 is all-ones 4-d).
        return (x0 - x1.expand_as(x0), x1.expand(x0.size()) - x0,
                y0 - y1.expand_as(y0), y1.expand(y0.size()) - y0,
                y0 - y2.expand_as(y0), y2.expand(y0.size()) - y0,
                y0 - y3.expand_as(y0), y3.expand(y0.size()) - y0,
                y1 - y2.expand_as(y0), y2.expand(y0.size()) - y1,
                y1 - y3.expand_as(y1), y3.expand(y1.size()) - y1,
                y2 - y3.expand_as(y2), y3.expand(y2.size()) - y2,
                z0 - z1.expand_as(z0), z1.expand(z0.size()) - z0,
                z0 - z2.expand_as(z0), z2.expand(z0.size()) - z0,
                z0 - z3.expand_as(z0), z3.expand(z0.size()) - z0,
                z0 - z4.expand_as(z0), z4.expand(z0.size()) - z0,
                z0 - z5.expand_as(z0), z5.expand(z0.size()) - z0,
                z0 - z6.expand_as(z0), z6.expand(z0.size()) - z0,
                z0 - z7.expand_as(z0), z7.expand(z0.size()) - z0,
                z1 - z2.expand_as(z0), z2.expand(z0.size()) - z1,
                z1 - z3.expand_as(z0), z3.expand(z0.size()) - z1,
                z1 - z4.expand_as(z1), z4.expand(z1.size()) - z1,
                z1 - z5.expand_as(z1), z5.expand(z1.size()) - z1,
                z1 - z6.expand_as(z3), z6.expand(z3.size()) - z1,
                z1 - z7.expand_as(z1), z7.expand(z1.size()) - z1,
                z2 - z3.expand_as(z0), z3.expand(z0.size()) - z2,
                z2 - z4.expand_as(z2), z4.expand(z2.size()) - z2,
                z2 - z5.expand_as(z3), z5.expand(z3.size()) - z2,
                z2 - z6.expand_as(z2), z6.expand(z2.size()) - z2,
                z2 - z7.expand_as(z2), z7.expand(z2.size()) - z2,
                z3 - z4.expand_as(z1), z4.expand(z1.size()) - z3,
                z3 - z5.expand_as(z3), z5.expand(z3.size()) - z3,
                z3 - z6.expand_as(z3), z6.expand(z3.size()) - z3,
                z3 - z7.expand_as(z3), z7.expand(z3.size()) - z3,
                z4 - z5.expand_as(z1), z5.expand(z1.size()) - z4,
                z4 - z6.expand_as(z2), z6.expand(z2.size()) - z4,
                z4 - z7.expand_as(z4), z7.expand(z4.size()) - z4,
                z5 - z6.expand_as(z3), z6.expand(z3.size()) - z5,
                z5 - z7.expand_as(z5), z7.expand(z5.size()) - z5,
                z6 - z7.expand_as(z6), z7.expand(z6.size()) - z6,
                w0 - w1.expand_as(w0), w1.expand(w0.size()) - w0,
                w0 - w2.expand_as(w0), w2.expand(w0.size()) - w0,
                w0 - w3.expand_as(w0), w3.expand(w0.size()) - w0,
                w0 - w4.expand_as(w0), w4.expand(w0.size()) - w0,
                w0 - w5.expand_as(w0), w5.expand(w0.size()) - w0,
                w0 - w6.expand_as(w0), w6.expand(w0.size()) - w0,
                w0 - w7.expand_as(w0), w7.expand(w0.size()) - w0,
                w0 - w8.expand_as(w0), w8.expand(w0.size()) - w0,
                w0 - w9.expand_as(w0), w9.expand(w0.size()) - w0,
                w0 - w10.expand_as(w0), w10.expand(w0.size()) - w0,
                w0 - w11.expand_as(w0), w11.expand(w0.size()) - w0,
                w0 - w12.expand_as(w0), w12.expand(w0.size()) - w0,
                w0 - w13.expand_as(w0), w13.expand(w0.size()) - w0,
                w0 - w14.expand_as(w0), w14.expand(w0.size()) - w0,
                w0 - w15.expand_as(w0), w15.expand(w0.size()) - w0,
                w1 - w5.expand_as(w1), w5.expand(w1.size()) - w1,
                w1 - w6.expand_as(w1), w6.expand(w1.size()) - w1,
                w1 - w7.expand_as(w1), w7.expand(w1.size()) - w1,
                w1 - w11.expand_as(w1), w11.expand(w1.size()) - w1,
                w1 - w12.expand_as(w1), w12.expand(w1.size()) - w1,
                w1 - w13.expand_as(w1), w13.expand(w1.size()) - w1,
                w1 - w15.expand_as(w1), w15.expand(w1.size()) - w1,
                w2 - w5.expand_as(w2), w5.expand(w2.size()) - w2,
                w2 - w8.expand_as(w2), w8.expand(w2.size()) - w2,
                w2 - w9.expand_as(w2), w9.expand(w2.size()) - w2,
                w2 - w11.expand_as(w2), w11.expand(w2.size()) - w2,
                w2 - w12.expand_as(w2), w12.expand(w2.size()) - w2,
                w2 - w14.expand_as(w2), w14.expand(w2.size()) - w2,
                w2 - w15.expand_as(w2), w15.expand(w2.size()) - w2,
                w3 - w6.expand_as(w3), w6.expand(w3.size()) - w3,
                w3 - w8.expand_as(w3), w8.expand(w3.size()) - w3,
                w3 - w10.expand_as(w3), w10.expand(w3.size()) - w3,
                w3 - w11.expand_as(w3), w11.expand(w3.size()) - w3,
                w3 - w13.expand_as(w3), w13.expand(w3.size()) - w3,
                w3 - w14.expand_as(w3), w14.expand(w3.size()) - w3,
                w3 - w15.expand_as(w3), w15.expand(w3.size()) - w3,
                w4 - w7.expand_as(w4), w7.expand(w4.size()) - w4,
                w4 - w9.expand_as(w4), w9.expand(w4.size()) - w4,
                w4 - w10.expand_as(w4), w10.expand(w4.size()) - w4,
                w4 - w12.expand_as(w4), w12.expand(w4.size()) - w4,
                w4 - w13.expand_as(w4), w13.expand(w4.size()) - w4,
                w4 - w14.expand_as(w4), w14.expand(w4.size()) - w4,
                w4 - w15.expand_as(w4), w15.expand(w4.size()) - w4,
                w5 - w11.expand_as(w5), w11.expand(w5.size()) - w5,
                w5 - w12.expand_as(w5), w12.expand(w5.size()) - w5,
                w5 - w15.expand_as(w5), w15.expand(w5.size()) - w5,
                w6 - w11.expand_as(w6), w11.expand(w6.size()) - w6,
                w6 - w13.expand_as(w6), w13.expand(w6.size()) - w6,
                w6 - w15.expand_as(w6), w15.expand(w6.size()) - w6,
                w7 - w12.expand_as(w7), w12.expand(w7.size()) - w7,
                w7 - w13.expand_as(w7), w13.expand(w7.size()) - w7,
                w7 - w15.expand_as(w7), w15.expand(w7.size()) - w7,
                w8 - w11.expand_as(w8), w11.expand(w8.size()) - w8,
                w8 - w14.expand_as(w8), w14.expand(w8.size()) - w8,
                w8 - w15.expand_as(w8), w15.expand(w8.size()) - w8,
                w9 - w12.expand_as(w9), w12.expand(w9.size()) - w9,
                w9 - w14.expand_as(w9), w14.expand(w9.size()) - w9,
                w9 - w15.expand_as(w9), w15.expand(w9.size()) - w9,
                w10 - w13.expand_as(w10), w13.expand(w10.size()) - w10,
                w10 - w14.expand_as(w10), w14.expand(w10.size()) - w10,
                w10 - w15.expand_as(w10), w15.expand(w10.size()) - w10,
                w11 - w15.expand_as(w11), w15.expand(w11.size()) - w11,
                w12 - w15.expand_as(w12), w15.expand(w12.size()) - w12,
                w13 - w15.expand_as(w13), w15.expand(w13.size()) - w13,
                w14 - w15.expand_as(w14), w15.expand(w14.size()) - w14,
                # some negative cases
                w11.expand_as(w5) - w14.expand_as(w10),
                w5.expand(w1.size()) - w11,
                w15.expand(6, 7, 8, 9) - w14
                )
def test():
    """End-to-end check for the pnnx eliminate_noop_expand pass.

    Traces Model to TorchScript, converts the trace with the pnnx binary,
    runs the generated pnnx module, and compares its outputs against the
    direct PyTorch outputs.

    Returns:
        bool: True when every output pair matches in count, shape and value.
    """
    net = Model()
    net.eval()

    torch.manual_seed(0)
    x0 = torch.rand(5)
    x1 = torch.rand(1)
    y0 = torch.rand(7, 5)
    y1 = torch.rand(1, 5)
    y2 = torch.rand(7, 1)
    y3 = torch.rand(1, 1)
    z0 = torch.rand(4, 7, 5)
    z1 = torch.rand(1, 7, 5)
    z2 = torch.rand(4, 1, 5)
    z3 = torch.rand(4, 7, 1)
    z4 = torch.rand(1, 1, 5)
    z5 = torch.rand(1, 7, 1)
    z6 = torch.rand(4, 1, 1)
    z7 = torch.rand(1, 1, 1)
    w0 = torch.rand(6, 4, 7, 5)
    w1 = torch.rand(1, 4, 7, 5)
    w2 = torch.rand(6, 1, 7, 5)
    w3 = torch.rand(6, 4, 1, 5)
    w4 = torch.rand(6, 4, 7, 1)
    w5 = torch.rand(1, 1, 7, 5)
    w6 = torch.rand(1, 4, 1, 5)
    w7 = torch.rand(1, 4, 7, 1)
    w8 = torch.rand(6, 1, 1, 5)
    w9 = torch.rand(6, 1, 7, 1)
    w10 = torch.rand(6, 4, 1, 1)
    w11 = torch.rand(1, 1, 1, 5)
    w12 = torch.rand(1, 1, 7, 1)
    w13 = torch.rand(1, 4, 1, 1)
    w14 = torch.rand(6, 1, 1, 1)
    w15 = torch.rand(1, 1, 1, 1)

    a = net(x0, x1, y0, y1, y2, y3, z0, z1, z2, z3, z4, z5, z6, z7, w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15)

    # export torchscript
    mod = torch.jit.trace(net, (x0, x1, y0, y1, y2, y3, z0, z1, z2, z3, z4, z5, z6, z7, w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15))
    mod.save("test_pnnx_eliminate_noop_expand.pt")

    # torchscript to pnnx
    import os
    os.system("../src/pnnx test_pnnx_eliminate_noop_expand.pt inputshape=[5],[1],[7,5],[1,5],[7,1],[1,1],[4,7,5],[1,7,5],[4,1,5],[4,7,1],[1,1,5],[1,7,1],[4,1,1],[1,1,1],[6,4,7,5],[1,4,7,5],[6,1,7,5],[6,4,1,5],[6,4,7,1],[1,1,7,5],[1,4,1,5],[1,4,7,1],[6,1,1,5],[6,1,7,1],[6,4,1,1],[1,1,1,5],[1,1,7,1],[1,4,1,1],[6,1,1,1],[1,1,1,1]")

    # pnnx inference
    import test_pnnx_eliminate_noop_expand_pnnx
    b = test_pnnx_eliminate_noop_expand_pnnx.test_inference()

    # zip() would silently truncate if the pnnx module returned fewer outputs
    # than the PyTorch model, so require an exact output count first
    if len(a) != len(b):
        return False

    for a0, b0 in zip(a, b):
        # allclose may auto broadcast compare
        if a0.shape != b0.shape:
            return False
        if not torch.allclose(a0, b0, 1e-4, 1e-4):
            return False
    return True
if __name__ == "__main__":
    # Map the boolean test result onto the process exit status:
    # 0 signals success, 1 signals failure.
    exit(0 if test() else 1)