Unverified commit e03e2e7f, authored by Zihao Mu, committed by GitHub

Merge pull request #23192 from zihaomu:clean_up_SIMD_code

### Purpose of this PR:
- Move all dispatch and SIMD code of the `convolution layer` into `simd.hpp` files (a scalar reference sketch of the `convBlock` micro-kernel is shown after this list).
- Support Winograd on AVX-only machines.
- Rename the folder from `fast_conv` to `cpu_kernels`. In the future, CPU optimizations for other layers, such as `GEMM` or `MatMul`, can be placed there as well.
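For context on what these kernels compute: the `simd.hpp` files hold per-ISA micro-kernels that the convolution packing/GEMM-like code calls. Below is a minimal scalar sketch of the tile update that `convBlock` specializes; the function name `convBlockRef` and its exact signature are illustrative, not the PR's code.

```cpp
#include <vector>

// Scalar sketch of the convBlock micro-kernel (illustrative, not the PR's code).
// A convMR x convNR tile of C (row stride ldc) accumulates np rank-1 updates:
// a_p (convMR values) times b_p (convNR values). init_c == true overwrites the
// tile; otherwise the result is added to whatever is already in C.
static void convBlockRef(int np, const float* a, const float* b, float* c,
                         int ldc, bool init_c, int convMR, int convNR)
{
    std::vector<float> cbuf(convMR * convNR, 0.f);
    for (int p = 0; p < np; p++, a += convMR, b += convNR)
        for (int i = 0; i < convMR; i++)
            for (int j = 0; j < convNR; j++)
                cbuf[i * convNR + j] += a[i] * b[j];

    for (int i = 0; i < convMR; i++)
        for (int j = 0; j < convNR; j++)
            c[i * ldc + j] = init_c ? cbuf[i * convNR + j]
                                    : c[i * ldc + j] + cbuf[i * convNR + j];
}
```

The AVX and NEON variants in the diff below unroll exactly this loop for fixed (convMR, convNR) tiles, e.g. 4x24 with `_mm256_fmadd_ps` on x86, and 4x28 (AArch64) or 4x12 (ARMv7) with NEON intrinsics.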

## Performance Test
Since this patch focuses only on code style, performance is expected to be the same as before.
Tested with the following command:
`./bin/opencv_perf_dnn '--gtest_filter=*conv*' --gtest_output="xml:../1-0th.xml" --perf_threads=1`
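In the tables below, the `4.x vs patch (x-factor)` column is the 4.x time divided by the patch time, so values above 1.0 mean the patch is faster. For example, 69.000 ms / 64.676 ms ≈ 1.07 for the GFLOPS=1.267 Conv3D case.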

### Test on X86 platform
Minimum time in milliseconds (ms):
|Name of Test|4.x | patch | 4.x vs patch (x-factor)|
|---|:-:|:-:|:-:|
|conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 2, 19}, OCN=2, G=2, S=2, P=(1, 1), BIAS, OCV/CPU)|0.001|0.001|0.98|
|conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 2, 25}, OCN=2, G=2, P=(2, 2), PM=SAME, OCV/CPU)|0.001|0.001|0.95|
|conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 6, 10}, OCN=6, PM=VALID, BIAS, OCV/CPU)|0.001|0.001|0.97|
|conv3d::Conv3D::(GFLOPS=0.000, K=[1 x 1 x 1], IN={1, 4, 9, 10, 10}, OCN=4, S=[1 x 1 x 2], P=(1, 1) x (1, 1) x (1, 1), PM=VALID, OCV/CPU)|0.002|0.002|1.04|
|conv3d::Conv3D::(GFLOPS=0.000, K=[1 x 1 x 1], IN={1, 8, 1, 10, 10}, OCN=8, G=8, P=(1, 1) x (1, 1) x (1, 1), BIAS, OCV/CPU)|0.002|0.002|0.94|
|conv3d::Conv3D::(GFLOPS=0.000, K=[3 x 3 x 3], IN={1, 2, 19, 19, 19}, OCN=2, G=2, S=[2 x 2 x 2], P=(1, 1) x (1, 1) x (1, 1), BIAS, OCV/CPU)|0.040|0.044|0.93|
|conv3d::Conv3D::(GFLOPS=0.000, K=[3 x 4 x 2], IN={1, 4, 8, 10, 10}, OCN=4, G=4, S=[1 x 2 x 1], BIAS, OCV/CPU)|0.010|0.010|1.00|
|conv3d::Conv3D::(GFLOPS=0.001, K=[3 x 3 x 3], IN={1, 2, 25, 19, 19}, OCN=2, G=2, S=[1 x 2 x 2], P=(2, 2) x (2, 2) x (2, 2), PM=SAME, OCV/CPU)|0.106|0.103|1.03|
|conv3d::Conv3D::(GFLOPS=0.002, K=[3 x 1 x 4], IN={1, 14, 5, 10, 10}, OCN=14, PM=SAME, OCV/CPU)|0.041|0.040|1.03|
|conv3d::Conv3D::(GFLOPS=0.006, K=[5 x 5 x 5], IN={1, 4, 50, 19, 19}, OCN=4, S=[2 x 2 x 2], P=(1, 1) x (1, 1) x (1, 1), PM=VALID, OCV/CPU)|0.340|0.329|1.03|
|conv3d::Conv3D::(GFLOPS=0.027, K=[3 x 3 x 3], IN={1, 6, 10, 38, 50}, OCN=6, PM=VALID, BIAS, OCV/CPU)|0.590|0.567|1.04|
|conv3d::Conv3D::(GFLOPS=0.030, K=[5 x 5 x 5], IN={1, 6, 19, 19, 19}, OCN=6, G=2, OCV/CPU)|1.374|1.314|1.05|
|conv3d::Conv3D::(GFLOPS=0.045, K=[7 x 7 x 7], IN={1, 2, 38, 38, 38}, OCN=2, S=[1 x 2 x 1], OCV/CPU)|3.715|3.528|1.05|
|conv3d::Conv3D::(GFLOPS=0.053, K=[3 x 3 x 3], IN={1, 10, 98, 10, 10}, OCN=10, PM=SAME, OCV/CPU)|1.181|1.166|1.01|
|conv3d::Conv3D::(GFLOPS=0.071, K=[7 x 7 x 7], IN={1, 6, 15, 19, 19}, OCN=6, S=[2 x 1 x 1], P=(3, 3) x (3, 3) x (3, 3), PM=SAME, BIAS, OCV/CPU)|2.689|2.587|1.04|
|conv3d::Conv3D::(GFLOPS=0.093, K=[5 x 5 x 5], IN={1, 4, 40, 75, 75}, OCN=4, S=[2 x 2 x 2], OCV/CPU)|4.754|4.500|1.06|
|conv3d::Conv3D::(GFLOPS=0.116, K=[5 x 5 x 5], IN={1, 2, 21, 75, 100}, OCN=2, BIAS, OCV/CPU)|9.612|9.112|1.05|
|conv3d::Conv3D::(GFLOPS=1.267, K=[5 x 5 x 5], IN={1, 3, 75, 75, 100}, OCN=3, PM=SAME, BIAS, OCV/CPU)|69.000|64.676|1.07|
|conv3d::Conv3D::(GFLOPS=1.343, K=[3 x 3 x 3], IN={1, 11, 9, 150, 200}, OCN=11, PM=VALID, BIAS, OCV/CPU)|20.248|18.451|1.10|
|conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 512, 26, 26}, OCN=256, OCV/CPU)|1.395|1.392|1.00|
|conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 1024, 13, 13}, OCN=512, OCV/CPU)|1.990|1.984|1.00|
|conv::Conv::(GFLOPS=0.178, K=[1 x 1], IN={1, 256, 52, 52}, OCN=128, OCV/CPU)|1.393|1.360|1.02|
|conv::Conv::(GFLOPS=0.210, K=[1 x 1], IN={1, 576, 38, 50}, OCN=96, PM=SAME, BIAS, OCV/CPU)|1.813|1.744|1.04|
|conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 128, 56, 56}, OCN=32, P=[1 x 1], OCV/CPU)|1.190|1.191|1.00|
|conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 256, 14, 14}, OCN=256, P=[1 x 1], OCV/CPU)|1.286|1.284|1.00|
|conv::Conv::(GFLOPS=0.280, K=[1 x 1], IN={1, 576, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|2.295|2.279|1.01|
|conv::Conv::(GFLOPS=0.302, K=[3 x 3], IN={1, 64, 64, 64}, OCN=64, PM=SAME, OCV/CPU)|1.322|1.331|0.99|
|conv::Conv::(GFLOPS=0.357, K=[1 x 1], IN={1, 64, 208, 208}, OCN=64, OCV/CPU)|3.784|3.533|1.07|
|conv::Conv::(GFLOPS=0.420, K=[3 x 3], IN={1, 96, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|1.838|1.844|1.00|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 128, 40, 40}, OCN=128, PM=SAME, OCV/CPU)|1.957|1.959|1.00|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 256, 20, 20}, OCN=256, PM=SAME, OCV/CPU)|2.596|2.573|1.01|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 512, 10, 10}, OCN=512, PM=SAME, OCV/CPU)|4.183|4.083|1.02|
|conv::Conv::(GFLOPS=0.561, K=[3 x 3], IN={1, 128, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|2.413|2.406|1.00|
|conv::Conv::(GFLOPS=0.624, K=[3 x 3], IN={1, 128, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|2.538|2.546|1.00|
|conv::Conv::(GFLOPS=0.701, K=[3 x 3], IN={1, 128, 38, 50}, OCN=160, PM=SAME, BIAS, OCV/CPU)|2.972|2.980|1.00|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 64, 104, 104}, OCN=64, P=[1 x 1], OCV/CPU)|3.452|3.464|1.00|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 128, 52, 52}, OCN=128, P=[1 x 1], OCV/CPU)|3.082|3.105|0.99|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 256, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU)|4.043|3.919|1.03|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 512, 13, 13}, OCN=512, P=[1 x 1], OCV/CPU)|5.538|5.531|1.00|
|conv::Conv::(GFLOPS=0.830, K=[3 x 3], IN={1, 64, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU)|3.393|3.418|0.99|
|conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 192, 38, 38}, OCN=192, PM=SAME, OCV/CPU)|4.325|4.234|1.02|
|conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 384, 19, 19}, OCN=384, PM=SAME, OCV/CPU)|6.009|5.908|1.02|
|conv::Conv::(GFLOPS=1.022, K=[3 x 3], IN={1, 576, 19, 19}, OCN=273, PM=SAME, BIAS, OCV/CPU)|6.557|6.376|1.03|
|conv::Conv::(GFLOPS=1.112, K=[3 x 3], IN={1, 512, 10, 10}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU)|10.114|9.472|1.07|
|conv::Conv::(GFLOPS=1.181, K=[3 x 3], IN={1, 64, 160, 200}, OCN=128, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU)|10.373|9.879|1.05|
|conv::Conv::(GFLOPS=1.182, K=[3 x 3], IN={1, 32, 320, 400}, OCN=64, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU)|12.782|11.624|1.10|
|conv::Conv::(GFLOPS=1.195, K=[9 x 9], IN={1, 32, 240, 320}, OCN=3, P=[4 x 4], BIAS, OCV/CPU)|90.931|90.552|1.00|
|conv::Conv::(GFLOPS=1.196, K=[3 x 3], IN={1, 384, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU)|6.091|5.818|1.05|
|conv::Conv::(GFLOPS=1.210, K=[3 x 3], IN={1, 32, 256, 256}, OCN=32, PM=SAME, OCV/CPU)|7.083|6.643|1.07|
|conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 64, 75, 75}, OCN=192, PM=SAME, BIAS, OCV/CPU)|5.054|5.059|1.00|
|conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 96, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU)|5.005|4.931|1.02|
|conv::Conv::(GFLOPS=1.248, K=[3 x 3], IN={1, 256, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|4.951|5.065|0.98|
|conv::Conv::(GFLOPS=1.258, K=[3 x 3], IN={1, 1280, 10, 10}, OCN=546, PM=SAME, BIAS, OCV/CPU)|11.957|11.293|1.06|
|conv::Conv::(GFLOPS=1.261, K=[3 x 3], IN={1, 192, 38, 50}, OCN=192, PM=SAME, BIAS, OCV/CPU)|5.328|5.250|1.01|
|conv::Conv::(GFLOPS=1.416, K=[3 x 3], IN={1, 128, 62, 82}, OCN=128, BIAS, OCV/CPU)|5.544|5.292|1.05|
|conv::Conv::(GFLOPS=1.500, K=[3 x 3], IN={1, 128, 64, 84}, OCN=128, BIAS, OCV/CPU)|6.186|5.893|1.05|
|conv::Conv::(GFLOPS=1.586, K=[3 x 3], IN={1, 128, 66, 86}, OCN=128, BIAS, OCV/CPU)|6.153|5.834|1.05|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 26, 26}, OCN=512, P=[1 x 1], OCV/CPU)|8.154|8.107|1.01|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 52, 52}, OCN=512, S=[2 x 2], P=[1 x 1], OCV/CPU)|12.699|12.256|1.04|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 13, 13}, OCN=1024, P=[1 x 1], OCV/CPU)|11.355|11.217|1.01|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 26, 26}, OCN=1024, S=[2 x 2], P=[1 x 1], OCV/CPU)|19.062|17.814|1.07|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 104, 104}, OCN=128, P=[1 x 1], OCV/CPU)|6.820|6.531|1.04|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 208, 208}, OCN=128, S=[2 x 2], P=[1 x 1], OCV/CPU)|14.502|13.483|1.08|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 52, 52}, OCN=256, P=[1 x 1], OCV/CPU)|6.270|6.123|1.02|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 104, 104}, OCN=256, S=[2 x 2], P=[1 x 1], OCV/CPU)|13.173|12.451|1.06|
|conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 208, 208}, OCN=64, P=[1 x 1], OCV/CPU)|8.326|7.652|1.09|
|conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 416, 416}, OCN=64, S=[2 x 2], P=[1 x 1], OCV/CPU)|17.605|16.465|1.07|
|conv::Conv::(GFLOPS=1.659, K=[3 x 3], IN={1, 960, 10, 10}, OCN=960, PM=SAME, OCV/CPU)|15.675|14.771|1.06|
|conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, G=128, P=[1 x 1], BIAS, OCV/CPU)|0.420|0.423|0.99|
|conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, PM=SAME, OCV/CPU)|6.788|6.491|1.05|
|conv::Conv::(GFLOPS=1.675, K=[3 x 3], IN={1, 128, 68, 88}, OCN=128, BIAS, OCV/CPU)|6.456|6.168|1.05|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, G=256, P=[1 x 1], BIAS, OCV/CPU)|0.263|0.261|1.01|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, PM=SAME, OCV/CPU)|7.690|7.398|1.04|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, G=512, P=[1 x 1], BIAS, OCV/CPU)|0.200|0.202|0.99|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|10.542|10.464|1.01|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, PM=SAME, OCV/CPU)|10.876|10.728|1.01|
|conv::Conv::(GFLOPS=1.766, K=[3 x 3], IN={1, 128, 70, 90}, OCN=128, BIAS, OCV/CPU)|7.194|6.768|1.06|
|conv::Conv::(GFLOPS=1.859, K=[3 x 3], IN={1, 128, 72, 92}, OCN=128, BIAS, OCV/CPU)|7.099|6.731|1.05|
|conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, G=1024, P=[1 x 1], BIAS, OCV/CPU)|0.147|0.162|0.91|
|conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, PM=SAME, OCV/CPU)|18.558|17.141|1.08|
|conv::Conv::(GFLOPS=1.954, K=[3 x 3], IN={1, 128, 74, 94}, OCN=128, BIAS, OCV/CPU)|7.641|7.219|1.06|
|conv::Conv::(GFLOPS=1.995, K=[9 x 9], IN={1, 3, 320, 400}, OCN=32, P=[4 x 4], BIAS, OCV/CPU)|22.666|20.999|1.08|
|conv::Conv::(GFLOPS=2.052, K=[3 x 3], IN={1, 128, 76, 96}, OCN=128, BIAS, OCV/CPU)|8.523|7.921|1.08|
|conv::Conv::(GFLOPS=2.100, K=[3 x 3], IN={1, 144, 75, 75}, OCN=144, PM=SAME, OCV/CPU)|8.514|8.109|1.05|
|conv::Conv::(GFLOPS=2.153, K=[3 x 3], IN={1, 128, 78, 98}, OCN=128, BIAS, OCV/CPU)|8.300|7.878|1.05|
|conv::Conv::(GFLOPS=2.156, K=[3 x 3], IN={1, 576, 19, 19}, OCN=576, PM=SAME, OCV/CPU)|13.403|13.131|1.02|
|conv::Conv::(GFLOPS=2.255, K=[3 x 3], IN={1, 128, 80, 100}, OCN=128, BIAS, OCV/CPU)|8.920|8.357|1.07|
|conv::Conv::(GFLOPS=2.719, K=[3 x 3], IN={1, 96, 256, 256}, OCN=96, S=[2 x 2], PM=SAME, OCV/CPU)|28.827|27.616|1.04|
|conv::Conv::(GFLOPS=3.319, K=[3 x 3], IN={1, 128, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|12.895|12.670|1.02|
|conv::Conv::(GFLOPS=3.321, K=[3 x 3], IN={1, 64, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|14.120|13.078|1.08|
|conv::Conv::(GFLOPS=3.398, K=[7 x 7], IN={1, 128, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU)|27.541|27.582|1.00|
|conv::Conv::(GFLOPS=3.407, K=[3 x 3], IN={1, 512, 19, 19}, OCN=1024, D=[6 x 6], P=[6 x 6], BIAS, OCV/CPU)|32.367|31.140|1.04|
|conv::Conv::(GFLOPS=3.408, K=[3 x 3], IN={1, 256, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|14.934|14.910|1.00|
|conv::Conv::(GFLOPS=4.247, K=[3 x 3], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU)|18.289|18.491|0.99|
|conv::Conv::(GFLOPS=4.247, K=[5 x 5], IN={1, 144, 128, 128}, OCN=144, S=[2 x 2], PM=SAME, OCV/CPU)|37.857|36.845|1.03|
|conv::Conv::(GFLOPS=4.566, K=[7 x 7], IN={1, 172, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU)|37.402|36.566|1.02|
|conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 256, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|19.031|19.164|0.99|
|conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 512, 46, 46}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|19.019|19.135|0.99|
|conv::Conv::(GFLOPS=4.994, K=[3 x 3], IN={1, 128, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|20.077|19.400|1.03|
|conv::Conv::(GFLOPS=4.997, K=[3 x 3], IN={1, 64, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|21.883|21.302|1.03|
|conv::Conv::(GFLOPS=5.780, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, S=[2 x 2], PM=SAME, OCV/CPU)|51.288|49.851|1.03|
|conv::Conv::(GFLOPS=6.116, K=[3 x 3], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU)|27.349|28.359|0.96|
|conv::Conv::(GFLOPS=6.118, K=[3 x 3], IN={1, 144, 128, 128}, OCN=144, PM=SAME, OCV/CPU)|24.915|25.130|0.99|
|conv::Conv::(GFLOPS=6.637, K=[3 x 3], IN={1, 256, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|25.488|25.899|0.98|
|conv::Conv::(GFLOPS=6.638, K=[3 x 3], IN={1, 128, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|27.346|27.390|1.00|
|conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 150, 200}, OCN=192, PM=SAME, BIAS, OCV/CPU)|28.033|28.301|0.99|
|conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 300, 300}, OCN=64, P=[1 x 1], BIAS, OCV/CPU)|50.216|49.970|1.00|
|conv::Conv::(GFLOPS=6.814, K=[3 x 3], IN={1, 512, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|29.670|29.513|1.01|
|conv::Conv::(GFLOPS=8.025, K=[3 x 3], IN={1, 1024, 19, 19}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU)|50.565|49.634|1.02|
|conv::Conv::(GFLOPS=9.986, K=[3 x 3], IN={1, 512, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|37.900|37.814|1.00|
|conv::Conv::(GFLOPS=9.987, K=[3 x 3], IN={1, 256, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|41.367|39.742|1.04|
|conv::Conv::(GFLOPS=9.989, K=[3 x 3], IN={1, 128, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|49.128|50.350|0.98|
|conv::Conv::(GFLOPS=9.993, K=[3 x 3], IN={1, 64, 368, 368}, OCN=64, P=[1 x 1], BIAS, OCV/CPU)|79.643|80.645|0.99|
|conv::Conv::(GFLOPS=10.087, K=[3 x 3], IN={1, 576, 38, 50}, OCN=512, PM=SAME, BIAS, OCV/CPU)|41.439|40.895|1.01|
|conv::Conv::(GFLOPS=10.701, K=[3 x 3], IN={1, 512, 38, 38}, OCN=804, P=[1 x 1], BIAS, OCV/CPU)|46.504|46.220|1.01|
|conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 240, 64, 64}, OCN=240, PM=SAME, OCV/CPU)|98.086|96.842|1.01|
|conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU)|102.447|97.299|1.05|
|conv::Conv::(GFLOPS=16.987, K=[5 x 5], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU)|145.047|144.996|1.00|
|conv::Conv::(GFLOPS=23.122, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, PM=SAME, OCV/CPU)|206.104|195.543|1.05|


### Test on M1 (ARM) platform
|Name of Test|4.x|patch|4.x vs patch (x-factor)|
|---|:-:|:-:|:-:|
|conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 2, 19}, OCN=2, G=2, S=2, P=(1, 1), BIAS, OCV/CPU)|0.001|0.001|0.97|
|conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 2, 25}, OCN=2, G=2, P=(2, 2), PM=SAME, OCV/CPU)|0.001|0.001|0.94|
|conv1d::Conv1D::(GFLOPS=0.000, K=[3], IN={1, 6, 10}, OCN=6, PM=VALID, BIAS, OCV/CPU)|0.002|0.002|0.92|
|conv3d::Conv3D::(GFLOPS=0.000, K=[1 x 1 x 1], IN={1, 4, 9, 10, 10}, OCN=4, S=[1 x 1 x 2], P=(1, 1) x (1, 1) x (1, 1), PM=VALID, OCV/CPU)|0.003|0.003|1.00|
|conv3d::Conv3D::(GFLOPS=0.000, K=[1 x 1 x 1], IN={1, 8, 1, 10, 10}, OCN=8, G=8, P=(1, 1) x (1, 1) x (1, 1), BIAS, OCV/CPU)|0.003|0.003|1.00|
|conv3d::Conv3D::(GFLOPS=0.000, K=[3 x 3 x 3], IN={1, 2, 19, 19, 19}, OCN=2, G=2, S=[2 x 2 x 2], P=(1, 1) x (1, 1) x (1, 1), BIAS, OCV/CPU)|0.031|0.031|1.00|
|conv3d::Conv3D::(GFLOPS=0.000, K=[3 x 4 x 2], IN={1, 4, 8, 10, 10}, OCN=4, G=4, S=[1 x 2 x 1], BIAS, OCV/CPU)|0.009|0.009|1.00|
|conv3d::Conv3D::(GFLOPS=0.001, K=[3 x 3 x 3], IN={1, 2, 25, 19, 19}, OCN=2, G=2, S=[1 x 2 x 2], P=(2, 2) x (2, 2) x (2, 2), PM=SAME, OCV/CPU)|0.066|0.066|1.01|
|conv3d::Conv3D::(GFLOPS=0.002, K=[3 x 1 x 4], IN={1, 14, 5, 10, 10}, OCN=14, PM=SAME, OCV/CPU)|0.102|0.102|1.00|
|conv3d::Conv3D::(GFLOPS=0.006, K=[5 x 5 x 5], IN={1, 4, 50, 19, 19}, OCN=4, S=[2 x 2 x 2], P=(1, 1) x (1, 1) x (1, 1), PM=VALID, OCV/CPU)|0.328|0.328|1.00|
|conv3d::Conv3D::(GFLOPS=0.027, K=[3 x 3 x 3], IN={1, 6, 10, 38, 50}, OCN=6, PM=VALID, BIAS, OCV/CPU)|0.693|0.747|0.93|
|conv3d::Conv3D::(GFLOPS=0.030, K=[5 x 5 x 5], IN={1, 6, 19, 19, 19}, OCN=6, G=2, OCV/CPU)|1.268|1.266|1.00|
|conv3d::Conv3D::(GFLOPS=0.045, K=[7 x 7 x 7], IN={1, 2, 38, 38, 38}, OCN=2, S=[1 x 2 x 1], OCV/CPU)|3.530|3.581|0.99|
|conv3d::Conv3D::(GFLOPS=0.053, K=[3 x 3 x 3], IN={1, 10, 98, 10, 10}, OCN=10, PM=SAME, OCV/CPU)|1.186|1.188|1.00|
|conv3d::Conv3D::(GFLOPS=0.071, K=[7 x 7 x 7], IN={1, 6, 15, 19, 19}, OCN=6, S=[2 x 1 x 1], P=(3, 3) x (3, 3) x (3, 3), PM=SAME, BIAS, OCV/CPU)|2.682|2.683|1.00|
|conv3d::Conv3D::(GFLOPS=0.093, K=[5 x 5 x 5], IN={1, 4, 40, 75, 75}, OCN=4, S=[2 x 2 x 2], OCV/CPU)|4.490|4.501|1.00|
|conv3d::Conv3D::(GFLOPS=0.116, K=[5 x 5 x 5], IN={1, 2, 21, 75, 100}, OCN=2, BIAS, OCV/CPU)|8.914|8.938|1.00|
|conv3d::Conv3D::(GFLOPS=1.267, K=[5 x 5 x 5], IN={1, 3, 75, 75, 100}, OCN=3, PM=SAME, BIAS, OCV/CPU)|69.819|69.876|1.00|
|conv3d::Conv3D::(GFLOPS=1.343, K=[3 x 3 x 3], IN={1, 11, 9, 150, 200}, OCN=11, PM=VALID, BIAS, OCV/CPU)|24.058|22.420|1.07|
|conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 512, 26, 26}, OCN=256, OCV/CPU)|2.240|2.236|1.00|
|conv::Conv::(GFLOPS=0.177, K=[1 x 1], IN={1, 1024, 13, 13}, OCN=512, OCV/CPU)|3.132|3.136|1.00|
|conv::Conv::(GFLOPS=0.178, K=[1 x 1], IN={1, 256, 52, 52}, OCN=128, OCV/CPU)|1.920|1.919|1.00|
|conv::Conv::(GFLOPS=0.210, K=[1 x 1], IN={1, 576, 38, 50}, OCN=96, PM=SAME, BIAS, OCV/CPU)|2.343|2.346|1.00|
|conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 128, 56, 56}, OCN=32, P=[1 x 1], OCV/CPU)|1.234|1.116|1.11|
|conv::Conv::(GFLOPS=0.231, K=[3 x 3], IN={1, 256, 14, 14}, OCN=256, P=[1 x 1], OCV/CPU)|1.109|1.121|0.99|
|conv::Conv::(GFLOPS=0.280, K=[1 x 1], IN={1, 576, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|3.197|3.084|1.04|
|conv::Conv::(GFLOPS=0.302, K=[3 x 3], IN={1, 64, 64, 64}, OCN=64, PM=SAME, OCV/CPU)|1.123|1.148|0.98|
|conv::Conv::(GFLOPS=0.357, K=[1 x 1], IN={1, 64, 208, 208}, OCN=64, OCV/CPU)|4.836|5.061|0.96|
|conv::Conv::(GFLOPS=0.420, K=[3 x 3], IN={1, 96, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|1.535|1.463|1.05|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 128, 40, 40}, OCN=128, PM=SAME, OCV/CPU)|1.756|1.584|1.11|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 256, 20, 20}, OCN=256, PM=SAME, OCV/CPU)|1.821|1.820|1.00|
|conv::Conv::(GFLOPS=0.472, K=[3 x 3], IN={1, 512, 10, 10}, OCN=512, PM=SAME, OCV/CPU)|7.049|6.672|1.06|
|conv::Conv::(GFLOPS=0.561, K=[3 x 3], IN={1, 128, 38, 50}, OCN=128, PM=SAME, BIAS, OCV/CPU)|1.967|1.922|1.02|
|conv::Conv::(GFLOPS=0.624, K=[3 x 3], IN={1, 128, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|1.943|1.977|0.98|
|conv::Conv::(GFLOPS=0.701, K=[3 x 3], IN={1, 128, 38, 50}, OCN=160, PM=SAME, BIAS, OCV/CPU)|2.464|2.310|1.07|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 64, 104, 104}, OCN=64, P=[1 x 1], OCV/CPU)|2.860|2.904|0.98|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 128, 52, 52}, OCN=128, P=[1 x 1], OCV/CPU)|2.428|2.483|0.98|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 256, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU)|2.955|2.983|0.99|
|conv::Conv::(GFLOPS=0.798, K=[3 x 3], IN={1, 512, 13, 13}, OCN=512, P=[1 x 1], OCV/CPU)|4.328|4.484|0.97|
|conv::Conv::(GFLOPS=0.830, K=[3 x 3], IN={1, 64, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU)|2.712|2.778|0.98|
|conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 192, 38, 38}, OCN=192, PM=SAME, OCV/CPU)|3.205|3.331|0.96|
|conv::Conv::(GFLOPS=0.958, K=[3 x 3], IN={1, 384, 19, 19}, OCN=384, PM=SAME, OCV/CPU)|4.193|4.412|0.95|
|conv::Conv::(GFLOPS=1.022, K=[3 x 3], IN={1, 576, 19, 19}, OCN=273, PM=SAME, BIAS, OCV/CPU)|5.026|4.565|1.10|
|conv::Conv::(GFLOPS=1.112, K=[3 x 3], IN={1, 512, 10, 10}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU)|14.490|14.213|1.02|
|conv::Conv::(GFLOPS=1.181, K=[3 x 3], IN={1, 64, 160, 200}, OCN=128, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU)|14.886|14.003|1.06|
|conv::Conv::(GFLOPS=1.182, K=[3 x 3], IN={1, 32, 320, 400}, OCN=64, S=[2 x 2], P=[1 x 1], BIAS, OCV/CPU)|15.923|15.184|1.05|
|conv::Conv::(GFLOPS=1.195, K=[9 x 9], IN={1, 32, 240, 320}, OCN=3, P=[4 x 4], BIAS, OCV/CPU)|45.136|41.696|1.08|
|conv::Conv::(GFLOPS=1.196, K=[3 x 3], IN={1, 384, 26, 26}, OCN=256, P=[1 x 1], OCV/CPU)|4.995|4.631|1.08|
|conv::Conv::(GFLOPS=1.210, K=[3 x 3], IN={1, 32, 256, 256}, OCN=32, PM=SAME, OCV/CPU)|6.402|6.261|1.02|
|conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 64, 75, 75}, OCN=192, PM=SAME, BIAS, OCV/CPU)|4.478|3.965|1.13|
|conv::Conv::(GFLOPS=1.245, K=[3 x 3], IN={1, 96, 75, 100}, OCN=96, PM=SAME, BIAS, OCV/CPU)|3.908|3.978|0.98|
|conv::Conv::(GFLOPS=1.248, K=[3 x 3], IN={1, 256, 46, 46}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|4.176|4.206|0.99|
|conv::Conv::(GFLOPS=1.258, K=[3 x 3], IN={1, 1280, 10, 10}, OCN=546, PM=SAME, BIAS, OCV/CPU)|21.509|21.136|1.02|
|conv::Conv::(GFLOPS=1.261, K=[3 x 3], IN={1, 192, 38, 50}, OCN=192, PM=SAME, BIAS, OCV/CPU)|4.426|4.082|1.08|
|conv::Conv::(GFLOPS=1.416, K=[3 x 3], IN={1, 128, 62, 82}, OCN=128, BIAS, OCV/CPU)|4.098|4.289|0.96|
|conv::Conv::(GFLOPS=1.500, K=[3 x 3], IN={1, 128, 64, 84}, OCN=128, BIAS, OCV/CPU)|4.646|5.105|0.91|
|conv::Conv::(GFLOPS=1.586, K=[3 x 3], IN={1, 128, 66, 86}, OCN=128, BIAS, OCV/CPU)|4.746|4.724|1.00|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 26, 26}, OCN=512, P=[1 x 1], OCV/CPU)|5.614|5.779|0.97|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 256, 52, 52}, OCN=512, S=[2 x 2], P=[1 x 1], OCV/CPU)|21.909|20.718|1.06|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 13, 13}, OCN=1024, P=[1 x 1], OCV/CPU)|8.256|8.290|1.00|
|conv::Conv::(GFLOPS=1.595, K=[3 x 3], IN={1, 512, 26, 26}, OCN=1024, S=[2 x 2], P=[1 x 1], OCV/CPU)|25.196|23.267|1.08|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 104, 104}, OCN=128, P=[1 x 1], OCV/CPU)|5.721|5.172|1.11|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 64, 208, 208}, OCN=128, S=[2 x 2], P=[1 x 1], OCV/CPU)|20.066|18.322|1.10|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 52, 52}, OCN=256, P=[1 x 1], OCV/CPU)|4.448|4.542|0.98|
|conv::Conv::(GFLOPS=1.596, K=[3 x 3], IN={1, 128, 104, 104}, OCN=256, S=[2 x 2], P=[1 x 1], OCV/CPU)|19.193|19.013|1.01|
|conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 208, 208}, OCN=64, P=[1 x 1], OCV/CPU)|6.009|5.964|1.01|
|conv::Conv::(GFLOPS=1.598, K=[3 x 3], IN={1, 32, 416, 416}, OCN=64, S=[2 x 2], P=[1 x 1], OCV/CPU)|20.169|20.009|1.01|
|conv::Conv::(GFLOPS=1.659, K=[3 x 3], IN={1, 960, 10, 10}, OCN=960, PM=SAME, OCV/CPU)|22.584|23.423|0.96|
|conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, G=128, P=[1 x 1], BIAS, OCV/CPU)|0.372|0.504|0.74|
|conv::Conv::(GFLOPS=1.660, K=[3 x 3], IN={1, 128, 75, 75}, OCN=128, PM=SAME, OCV/CPU)|5.426|5.456|0.99|
|conv::Conv::(GFLOPS=1.675, K=[3 x 3], IN={1, 128, 68, 88}, OCN=128, BIAS, OCV/CPU)|4.945|5.221|0.95|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, G=256, P=[1 x 1], BIAS, OCV/CPU)|0.210|0.261|0.81|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 256, 38, 38}, OCN=256, PM=SAME, OCV/CPU)|5.720|5.997|0.95|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, G=512, P=[1 x 1], BIAS, OCV/CPU)|0.149|0.161|0.93|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|7.154|7.225|0.99|
|conv::Conv::(GFLOPS=1.704, K=[3 x 3], IN={1, 512, 19, 19}, OCN=512, PM=SAME, OCV/CPU)|7.184|7.223|0.99|
|conv::Conv::(GFLOPS=1.766, K=[3 x 3], IN={1, 128, 70, 90}, OCN=128, BIAS, OCV/CPU)|5.324|5.343|1.00|
|conv::Conv::(GFLOPS=1.859, K=[3 x 3], IN={1, 128, 72, 92}, OCN=128, BIAS, OCV/CPU)|5.114|5.238|0.98|
|conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, G=1024, P=[1 x 1], BIAS, OCV/CPU)|0.111|0.121|0.92|
|conv::Conv::(GFLOPS=1.888, K=[3 x 3], IN={1, 1024, 10, 10}, OCN=1024, PM=SAME, OCV/CPU)|25.907|26.804|0.97|
|conv::Conv::(GFLOPS=1.954, K=[3 x 3], IN={1, 128, 74, 94}, OCN=128, BIAS, OCV/CPU)|5.695|5.654|1.01|
|conv::Conv::(GFLOPS=1.995, K=[9 x 9], IN={1, 3, 320, 400}, OCN=32, P=[4 x 4], BIAS, OCV/CPU)|27.435|27.566|1.00|
|conv::Conv::(GFLOPS=2.052, K=[3 x 3], IN={1, 128, 76, 96}, OCN=128, BIAS, OCV/CPU)|6.944|6.164|1.13|
|conv::Conv::(GFLOPS=2.100, K=[3 x 3], IN={1, 144, 75, 75}, OCN=144, PM=SAME, OCV/CPU)|7.180|6.717|1.07|
|conv::Conv::(GFLOPS=2.153, K=[3 x 3], IN={1, 128, 78, 98}, OCN=128, BIAS, OCV/CPU)|6.817|6.050|1.13|
|conv::Conv::(GFLOPS=2.156, K=[3 x 3], IN={1, 576, 19, 19}, OCN=576, PM=SAME, OCV/CPU)|9.225|8.660|1.07|
|conv::Conv::(GFLOPS=2.255, K=[3 x 3], IN={1, 128, 80, 100}, OCN=128, BIAS, OCV/CPU)|7.496|6.625|1.13|
|conv::Conv::(GFLOPS=2.719, K=[3 x 3], IN={1, 96, 256, 256}, OCN=96, S=[2 x 2], PM=SAME, OCV/CPU)|35.520|36.056|0.99|
|conv::Conv::(GFLOPS=3.319, K=[3 x 3], IN={1, 128, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|9.990|9.702|1.03|
|conv::Conv::(GFLOPS=3.321, K=[3 x 3], IN={1, 64, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|10.517|10.746|0.98|
|conv::Conv::(GFLOPS=3.398, K=[7 x 7], IN={1, 128, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU)|36.702|36.731|1.00|
|conv::Conv::(GFLOPS=3.407, K=[3 x 3], IN={1, 512, 19, 19}, OCN=1024, D=[6 x 6], P=[6 x 6], BIAS, OCV/CPU)|41.035|38.280|1.07|
|conv::Conv::(GFLOPS=3.408, K=[3 x 3], IN={1, 256, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|10.981|10.573|1.04|
|conv::Conv::(GFLOPS=4.247, K=[3 x 3], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU)|12.863|12.384|1.04|
|conv::Conv::(GFLOPS=4.247, K=[5 x 5], IN={1, 144, 128, 128}, OCN=144, S=[2 x 2], PM=SAME, OCV/CPU)|50.437|54.088|0.93|
|conv::Conv::(GFLOPS=4.566, K=[7 x 7], IN={1, 172, 46, 46}, OCN=128, P=[3 x 3], BIAS, OCV/CPU)|50.650|50.635|1.00|
|conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 256, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|14.696|14.606|1.01|
|conv::Conv::(GFLOPS=4.993, K=[3 x 3], IN={1, 512, 46, 46}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|16.201|15.426|1.05|
|conv::Conv::(GFLOPS=4.994, K=[3 x 3], IN={1, 128, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|16.061|14.292|1.12|
|conv::Conv::(GFLOPS=4.997, K=[3 x 3], IN={1, 64, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|17.743|18.250|0.97|
|conv::Conv::(GFLOPS=5.780, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, S=[2 x 2], PM=SAME, OCV/CPU)|77.909|78.165|1.00|
|conv::Conv::(GFLOPS=6.116, K=[3 x 3], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU)|21.579|21.879|0.99|
|conv::Conv::(GFLOPS=6.118, K=[3 x 3], IN={1, 144, 128, 128}, OCN=144, PM=SAME, OCV/CPU)|20.424|19.589|1.04|
|conv::Conv::(GFLOPS=6.637, K=[3 x 3], IN={1, 256, 75, 75}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|19.389|19.461|1.00|
|conv::Conv::(GFLOPS=6.638, K=[3 x 3], IN={1, 128, 150, 150}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|21.319|20.358|1.05|
|conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 150, 200}, OCN=192, PM=SAME, BIAS, OCV/CPU)|22.609|21.826|1.04|
|conv::Conv::(GFLOPS=6.641, K=[3 x 3], IN={1, 64, 300, 300}, OCN=64, P=[1 x 1], BIAS, OCV/CPU)|25.497|25.789|0.99|
|conv::Conv::(GFLOPS=6.814, K=[3 x 3], IN={1, 512, 38, 38}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|21.966|22.108|0.99|
|conv::Conv::(GFLOPS=8.025, K=[3 x 3], IN={1, 1024, 19, 19}, OCN=1206, P=[1 x 1], BIAS, OCV/CPU)|35.883|33.470|1.07|
|conv::Conv::(GFLOPS=9.986, K=[3 x 3], IN={1, 512, 46, 46}, OCN=512, P=[1 x 1], BIAS, OCV/CPU)|31.041|29.314|1.06|
|conv::Conv::(GFLOPS=9.987, K=[3 x 3], IN={1, 256, 92, 92}, OCN=256, P=[1 x 1], BIAS, OCV/CPU)|29.922|28.145|1.06|
|conv::Conv::(GFLOPS=9.989, K=[3 x 3], IN={1, 128, 184, 184}, OCN=128, P=[1 x 1], BIAS, OCV/CPU)|31.624|31.148|1.02|
|conv::Conv::(GFLOPS=9.993, K=[3 x 3], IN={1, 64, 368, 368}, OCN=64, P=[1 x 1], BIAS, OCV/CPU)|38.564|39.164|0.98|
|conv::Conv::(GFLOPS=10.087, K=[3 x 3], IN={1, 576, 38, 50}, OCN=512, PM=SAME, BIAS, OCV/CPU)|31.502|30.269|1.04|
|conv::Conv::(GFLOPS=10.701, K=[3 x 3], IN={1, 512, 38, 38}, OCN=804, P=[1 x 1], BIAS, OCV/CPU)|34.248|34.589|0.99|
|conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 240, 64, 64}, OCN=240, PM=SAME, OCV/CPU)|130.211|134.120|0.97|
|conv::Conv::(GFLOPS=11.797, K=[5 x 5], IN={1, 480, 32, 32}, OCN=480, PM=SAME, OCV/CPU)|127.490|132.874|0.96|
|conv::Conv::(GFLOPS=16.987, K=[5 x 5], IN={1, 1152, 16, 16}, OCN=1152, PM=SAME, OCV/CPU)|199.834|200.081|1.00|
|conv::Conv::(GFLOPS=23.122, K=[5 x 5], IN={1, 672, 32, 32}, OCN=672, PM=SAME, OCV/CPU)|247.346|247.523|1.00|


### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake


```
force_builders=Linux AVX2,Custom Win
build_image:Custom Win=msvs2019
CPU_BASELINE:Custom Win=AVX512_SKX
```
Parent commit: c6e5f605
@@ -10,6 +10,9 @@ set(the_description "Deep neural network module. It allows to load models from d
ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV LASX)
ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX LASX)
ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_block" AVX AVX2)
ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_depthwise" AVX AVX2 RVV LASX)
ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_winograd_f63" AVX AVX2)
ocv_add_module(dnn opencv_core opencv_imgproc WRAP python java objc js)
@@ -72,7 +72,7 @@ using namespace cv::dnn::ocl4dnn;
using namespace cv::dnn::cuda4dnn;
#endif
#include "fast_convolution/fast_convolution.hpp"
#include "cpu_kernels/convolution.hpp"
namespace cv
{
@@ -2,409 +2,102 @@
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_FAST_CONVOLUTION_SIMD_HPP
#define OPENCV_FAST_CONVOLUTION_SIMD_HPP
#include "opencv2/core/hal/intrin.hpp"
#include <opencv2/core/utils/logger.hpp>
namespace cv {
namespace dnn {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
static void convBlockMR1NoSIMD(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
const float minval, const float maxval, bool ifMinMaxAct, const int outLen)
{
std::vector<float> cbuffer(outLen, 0);
float* cbuf = cbuffer.data();
for( int p = 0; p < np; p++ )
{
float ai = a[p];
for( int j = 0; j < outLen; j++ )
cbuf[j] += b[CONV_NR*p + j] * ai;
}
if (init_c)
{
for(int j = 0; j < outLen; j++)
{
c[j] += cbuf[j] + bias;
if (ifMinMaxAct)
c[j] = std::min(std::max(c[j], minval), maxval);
}
}
else
{
for(int j = 0; j < outLen; j++)
{
c[j] = cbuf[j] + bias;
if (ifMinMaxAct)
c[j] = std::min(std::max(c[j], minval), maxval);
}
}
}
void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR);
void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
const float minval, const float maxval, bool ifMinMaxAct, const int outLen)
{
#if CV_SIMD128
// The outLen represents the valid output value in CONV_NR length.
// When outLen is very small, we use the no-SIMD branch.
const int CONV_NRby3 = CONV_NR/3;
if (outLen > CONV_NRby3)
{
v_float32x4 c0 = v_setall_f32(bias), c1 = c0, c2 = c0; // CONV_NR == 12
#if CONV_NR == 28 || CONV_NR == 24
v_float32x4 c3 = c0, c4 = c0, c5 = c0;
#endif
#if CONV_NR == 28
v_float32x4 c6 = c0;
#endif
for (int p = 0; p < np; p++, a++, b += CONV_NR)
{
v_float32x4 a0 = v_setall_f32(a[0]);
v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8);
#if CONV_NR == 28 || CONV_NR == 24
v_float32x4 b3 = v_load(b + 12), b4 = v_load(b + 16), b5 = v_load(b + 20);
#endif
#if CONV_NR == 28
v_float32x4 b6 = v_load(b + 24);
#endif
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX
c0 = v_fma(b0, a0, c0);
c1 = v_fma(b1, a0, c1);
c2 = v_fma(b2, a0, c2);
#if CONV_NR == 28 || CONV_NR == 24
c3 = v_fma(b3, a0, c3);
c4 = v_fma(b4, a0, c4);
c5 = v_fma(b5, a0, c5);
#endif
#if CONV_NR == 28
c6 = v_fma(b6, a0, c6);
#if !CV_FMA3 // AVX workaround
#undef _mm256_fmadd_ps
#define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b))
#endif
}
if (init_c)
{
c0 += v_load(c);
c1 += v_load(c + 4);
c2 += v_load(c + 8);
#if CONV_NR == 28 || CONV_NR == 24
c3 += v_load(c + 12);
c4 += v_load(c + 16);
c5 += v_load(c + 20);
#endif
#if CONV_NR == 28
c6 += v_load(c + 24);
#endif
}
if (ifMinMaxAct)
{
v_float32x4 vmax = v_setall_f32(maxval), vmin = v_setall_f32(minval);
c0 = v_min(v_max(c0, vmin), vmax);
c1 = v_min(v_max(c1, vmin), vmax);
c2 = v_min(v_max(c2, vmin), vmax);
#if CONV_NR == 28 || CONV_NR == 24
c3 = v_min(v_max(c3, vmin), vmax);
c4 = v_min(v_max(c4, vmin), vmax);
c5 = v_min(v_max(c5, vmin), vmax);
#endif
#if CONV_NR == 28
c6 = v_min(v_max(c6, vmin), vmax);
#endif
}
v_store(c, c0);
v_store(c + 4, c1);
v_store(c + 8, c2);
#if CONV_NR == 28 || CONV_NR == 24
v_store(c + 12, c3);
v_store(c + 16, c4);
v_store(c + 20, c5);
#endif
#if CONV_NR == 28
v_store(c + 24, c6);
#endif
}
else
convBlockMR1NoSIMD(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen);
#else
convBlockMR1NoSIMD(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen);
#endif
}
#if CV_SIMD128
#if CONV_MR == 4 && CONV_NR == 24
static void convBlock4x24(int np, const float* a, const float* b, float* c, int ldc, bool init_c)
void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR)
{
v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0, c4 = c0, c5 = c0;
v_float32x4 c6 = v_setzero_f32(), c7 = c6, c8 = c6, c9 = c6, c10 = c6, c11 = c6;
v_float32x4 c12 = v_setzero_f32(), c13 = c12, c14 = c12, c15 = c12, c16 = c12, c17 = c12;
v_float32x4 c18 = v_setzero_f32(), c19 = c18, c20 = c18, c21 = c18, c22 = c18, c23 = c18;
CV_Assert(convMR == 4 && convNR == 24);
__m256 c00 = _mm256_set1_ps(0.f), c01 = c00, c02 = c00;
__m256 c10 = c00, c11 = c00, c12 = c00;
__m256 c20 = c00, c21 = c00, c22 = c00;
__m256 c30 = c00, c31 = c00, c32 = c00;
for (int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR)
{
v_float32x4 a0 = v_setall_f32(a[0]);
v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8);
v_float32x4 b3 = v_load(b + 12), b4 = v_load(b + 16), b5 = v_load(b + 20);
c0 = v_fma(b0, a0, c0);
c1 = v_fma(b1, a0, c1);
c2 = v_fma(b2, a0, c2);
c3 = v_fma(b3, a0, c3);
c4 = v_fma(b4, a0, c4);
c5 = v_fma(b5, a0, c5);
a0 = v_setall_f32(a[1]);
c6 = v_fma(b0, a0, c6);
c7 = v_fma(b1, a0, c7);
c8 = v_fma(b2, a0, c8);
c9 = v_fma(b3, a0, c9);
c10 = v_fma(b4, a0, c10);
c11 = v_fma(b5, a0, c11);
a0 = v_setall_f32(a[2]);
c12 = v_fma(b0, a0, c12);
c13 = v_fma(b1, a0, c13);
c14 = v_fma(b2, a0, c14);
c15 = v_fma(b3, a0, c15);
c16 = v_fma(b4, a0, c16);
c17 = v_fma(b5, a0, c17);
a0 = v_setall_f32(a[3]);
c18 = v_fma(b0, a0, c18);
c19 = v_fma(b1, a0, c19);
c20 = v_fma(b2, a0, c20);
c21 = v_fma(b3, a0, c21);
c22 = v_fma(b4, a0, c22);
c23 = v_fma(b5, a0, c23);
}
if (!init_c)
{
c0 += v_load(c);
c1 += v_load(c + 4);
c2 += v_load(c + 8);
c3 += v_load(c + 12);
c4 += v_load(c + 16);
c5 += v_load(c + 20);
c6 += v_load(c + ldc);
c7 += v_load(c + ldc + 4);
c8 += v_load(c + ldc + 8);
c9 += v_load(c + ldc + 12);
c10 += v_load(c + ldc + 16);
c11 += v_load(c + ldc + 20);
c12 += v_load(c + ldc*2);
c13 += v_load(c + ldc*2 + 4);
c14 += v_load(c + ldc*2 + 8);
c15 += v_load(c + ldc*2 + 12);
c16 += v_load(c + ldc*2 + 16);
c17 += v_load(c + ldc*2 + 20);
c18 += v_load(c + ldc*3);
c19 += v_load(c + ldc*3 + 4);
c20 += v_load(c + ldc*3 + 8);
c21 += v_load(c + ldc*3 + 12);
c22 += v_load(c + ldc*3 + 16);
c23 += v_load(c + ldc*3 + 20);
}
__m256 a0 = _mm256_setzero_ps(), a1 = _mm256_setzero_ps();
__m256 b0 = _mm256_setzero_ps(), b1 = _mm256_setzero_ps(), b2 = _mm256_setzero_ps();
v_store(c, c0);
v_store(c + 4, c1);
v_store(c + 8, c2);
v_store(c + 12, c3);
v_store(c + 16, c4);
v_store(c + 20, c5);
v_store(c + ldc, c6);
v_store(c + ldc + 4, c7);
v_store(c + ldc + 8, c8);
v_store(c + ldc + 12, c9);
v_store(c + ldc + 16, c10);
v_store(c + ldc + 20, c11);
v_store(c + ldc * 2, c12);
v_store(c + ldc * 2 + 4, c13);
v_store(c + ldc * 2 + 8, c14);
v_store(c + ldc * 2 + 12, c15);
v_store(c + ldc * 2 + 16, c16);
v_store(c + ldc * 2 + 20, c17);
v_store(c + ldc * 3, c18);
v_store(c + ldc * 3 + 4, c19);
v_store(c + ldc * 3 + 8, c20);
v_store(c + ldc * 3 + 12, c21);
v_store(c + ldc * 3 + 16, c22);
v_store(c + ldc * 3 + 20, c23);
}
#endif
static void convBlock4x8(int np, const float* a, const float* b, float* c, int ldc, bool init_c)
{
CV_Assert(CONV_NR >= 4);
v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0;
v_float32x4 c4 = c0, c5 = c0, c6 = c0, c7 = c0;
for (int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR)
for (int p = 0; p < np; p++, a += convMR, b += convNR)
{
v_float32x4 a0 = v_setall_f32(a[0]);
v_float32x4 a1 = v_setall_f32(a[1]);
v_float32x4 a2 = v_setall_f32(a[2]);
v_float32x4 a3 = v_setall_f32(a[3]);
a0 = _mm256_set1_ps(a[0]), a1 = _mm256_set1_ps(a[1]);
b0 = _mm256_load_ps(b), b1 = _mm256_load_ps(b + 8), b2 = _mm256_load_ps(b + 16);
v_float32x4 b0 = v_load(b), b1 = v_load(b + 4);
c00 = _mm256_fmadd_ps(b0, a0, c00);
c01 = _mm256_fmadd_ps(b1, a0, c01);
c02 = _mm256_fmadd_ps(b2, a0, c02);
c0 = v_fma(b0, a0, c0);
c1 = v_fma(b1, a0, c1);
c10 = _mm256_fmadd_ps(b0, a1, c10);
c11 = _mm256_fmadd_ps(b1, a1, c11);
c12 = _mm256_fmadd_ps(b2, a1, c12);
c2 = v_fma(b0, a1, c2);
c3 = v_fma(b1, a1, c3);
a0 = _mm256_set1_ps(a[2]), a1 = _mm256_set1_ps(a[3]);
c4 = v_fma(b0, a2, c4);
c5 = v_fma(b1, a2, c5);
c20 = _mm256_fmadd_ps(b0, a0, c20);
c21 = _mm256_fmadd_ps(b1, a0, c21);
c22 = _mm256_fmadd_ps(b2, a0, c22);
c6 = v_fma(b0, a3, c6);
c7 = v_fma(b1, a3, c7);
c30 = _mm256_fmadd_ps(b0, a1, c30);
c31 = _mm256_fmadd_ps(b1, a1, c31);
c32 = _mm256_fmadd_ps(b2, a1, c32);
}
if (!init_c)
{
c0 += v_load(c);
c1 += v_load(c + 4);
c2 += v_load(c + ldc);
c3 += v_load(c + ldc + 4);
c4 += v_load(c + ldc*2);
c5 += v_load(c + ldc*2 + 4);
c6 += v_load(c + ldc*3);
c7 += v_load(c + ldc*3 + 4);
}
v_store(c, c0);
v_store(c + 4, c1);
v_store(c + ldc, c2);
v_store(c + ldc + 4, c3);
v_store(c + ldc * 2, c4);
v_store(c + ldc * 2 + 4, c5);
v_store(c + ldc * 3, c6);
v_store(c + ldc * 3 + 4, c7);
}
static void convBlock4x4(int np, const float* a, const float* b, float* c, int ldc, bool init_c)
{
CV_Assert(CONV_NR >= 4);
v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0;
for (int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR)
{
v_float32x4 a0 = v_setall_f32(a[0]);
v_float32x4 a1 = v_setall_f32(a[1]);
v_float32x4 a2 = v_setall_f32(a[2]);
v_float32x4 a3 = v_setall_f32(a[3]);
c00 = _mm256_add_ps(c00, _mm256_load_ps(c));
c01 = _mm256_add_ps(c01, _mm256_load_ps(c + 8));
c02 = _mm256_add_ps(c02, _mm256_load_ps(c + 16));
v_float32x4 b0 = v_load(b);
c10 = _mm256_add_ps(c10, _mm256_load_ps(c + ldc));
c11 = _mm256_add_ps(c11, _mm256_load_ps(c + ldc + 8));
c12 = _mm256_add_ps(c12, _mm256_load_ps(c + ldc + 16));
c0 = v_fma(b0, a0, c0);
c1 = v_fma(b0, a1, c1);
c2 = v_fma(b0, a2, c2);
c3 = v_fma(b0, a3, c3);
}
c20 = _mm256_add_ps(c20, _mm256_load_ps(c + ldc*2));
c21 = _mm256_add_ps(c21, _mm256_load_ps(c + ldc*2 + 8));
c22 = _mm256_add_ps(c22, _mm256_load_ps(c + ldc*2 + 16));
if (!init_c)
{
c0 += v_load(c);
c1 += v_load(c + ldc);
c2 += v_load(c + ldc*2);
c3 += v_load(c + ldc*3);
c30 = _mm256_add_ps(c30, _mm256_load_ps(c + ldc*3));
c31 = _mm256_add_ps(c31, _mm256_load_ps(c + ldc*3 + 8));
c32 = _mm256_add_ps(c32, _mm256_load_ps(c + ldc*3 + 16));
}
v_store(c, c0);
v_store(c + ldc, c1);
v_store(c + ldc * 2, c2);
v_store(c + ldc * 3, c3);
_mm256_storeu_ps(c, c00), _mm256_storeu_ps(c+8, c01), _mm256_storeu_ps(c+16, c02);
_mm256_storeu_ps(c + ldc, c10), _mm256_storeu_ps(c + ldc + 8, c11), _mm256_storeu_ps(c + ldc + 16, c12);
_mm256_storeu_ps(c + ldc*2, c20), _mm256_storeu_ps(c + ldc*2 + 8, c21), _mm256_storeu_ps(c + ldc*2 + 16, c22);
_mm256_storeu_ps(c + ldc*3, c30), _mm256_storeu_ps(c + ldc*3 + 8, c31), _mm256_storeu_ps(c + ldc*3 + 16, c32);
_mm256_zeroupper();
}
#endif
static void convBlockNoSIMD(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen)
{
std::vector<float> cbuffer(CONV_MR * outLen, 0);
float* cbuf = cbuffer.data();
for( int p = 0; p < np; p++ )
{
for( int i = 0; i < CONV_MR; i++ )
{
float ai = a[CONV_MR*p + i];
for( int j = 0; j < outLen; j++ )
cbuf[i * outLen+j] += b[CONV_NR*p + j] * ai;
}
}
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
if (!init_c)
{
for(int i = 0; i < CONV_MR; i++)
{
for(int j = 0; j < outLen; j++)
c[i*ldc + j] += cbuf[i*outLen + j];
}
}
else
{
for(int i = 0; i < CONV_MR; i++)
{
for(int j = 0; j < outLen; j++)
c[i*ldc + j] = cbuf[i*outLen + j];
}
}
}
void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen)
{
// The possible outLen range is [24, 8~1].
#if CV_SIMD128
#if CONV_MR == 4 && CONV_NR == 24
const int CONV_NRby3 = CONV_NR/3;
if (outLen > CONV_NRby3)
{
convBlock4x24(np, a, b, c, ldc, init_c);
return;
}
#endif
if (outLen <= 8 && outLen > 4)
{
convBlock4x8(np, a, b, c, ldc, init_c);
return;
}
if (outLen <= 4 && outLen > 1)
{
convBlock4x4(np, a, b, c, ldc, init_c);
return;
}
convBlockNoSIMD(np, a, b, c, ldc, init_c, outLen);
#else
convBlockNoSIMD(np, a, b, c, ldc, init_c, outLen);
#endif
}
} // namespace dnn
CV_CPU_OPTIMIZATION_NAMESPACE_END
// NEON code work around.
namespace opt_NEON
{
#if CV_TRY_NEON
void convBlock_NEON(int np, const float* a, const float* b, float* c, int ldc, bool init_c)
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_NEON
void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR)
{
#if CONV_MR == 4 && CONV_NR == 28 // AARCH64
#if CV_NEON_AARCH64
if (convMR == 4 && convNR == 28) // AARCH64
{
float32x4_t c00 = vdupq_n_f32(0.f), c01 = c00, c02 = c00, c03 = c00, c04 = c00, c05 = c00, c06 = c00;
float32x4_t c10 = vdupq_n_f32(0.f), c11 = c10, c12 = c10, c13 = c10, c14 = c10, c15 = c10, c16 = c10;
float32x4_t c20 = vdupq_n_f32(0.f), c21 = c20, c22 = c20, c23 = c20, c24 = c20, c25 = c20, c26 = c20;
float32x4_t c30 = vdupq_n_f32(0.f), c31 = c30, c32 = c30, c33 = c30, c34 = c30, c35 = c30, c36 = c30;
for( int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR )
for( int p = 0; p < np; p++, a += convMR, b += convNR )
{
float32x4_t a0 = vld1q_f32(a), b0, b1, b2;
b0 = vld1q_f32(b); b1 = vld1q_f32(b + 4); b2 = vld1q_f32(b + 8);
@@ -499,18 +192,19 @@ void convBlock_NEON(int np, const float* a, const float* b, float* c, int ldc, b
vst1q_f32(c+ldc*3+16, c34); vst1q_f32(c+ldc*3+20, c35);
vst1q_f32(c+ldc*3+24, c36);
}
#elif CONV_MR == 4 && CONV_NR == 12 // ARMv7
else
#endif
if (convMR == 4 && convNR == 12) // ARMv7
{
float32x4_t c0 = vdupq_n_f32(0.f), c1 = c0, c2 = c0;
float32x4_t c3 = vdupq_n_f32(0.f), c4 = c3, c5 = c3;
float32x4_t c6 = vdupq_n_f32(0.f), c7 = c6, c8 = c6;
float32x4_t c9 = vdupq_n_f32(0.f), c10 = c9, c11 = c9;
float32x2_t a0 = vdup_n_f32(0.0f), a1 = a0;
float32x4_t b0 = vdupq_n_f32(0.0f), b1 = vdupq_n_f32(0.0f), b2 = vdupq_n_f32(0.0f);
for (int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR)
for (int p = 0; p < np; p++, a += convMR, b += convNR)
{
a0 = vld1_f32(a), a1 = vld1_f32(a+2);
b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8);
@@ -556,12 +250,10 @@ void convBlock_NEON(int np, const float* a, const float* b, float* c, int ldc, b
vst1q_f32(c + ldc*2, c6), vst1q_f32(c + ldc*2 + 4, c7), vst1q_f32(c + ldc*2 + 8, c8);
vst1q_f32(c + ldc*3, c9), vst1q_f32(c + ldc*3 + 4, c10), vst1q_f32(c + ldc*3 + 8, c11);
}
//#else
//#error "unsupported CONV_MR and/or CONV_NR in convBlock_NEON."
#endif
else
CV_Error(Error::StsNotImplemented, "Unsupported convMR and/or convNR in opt_NEON::convBlock");
}
#endif
} // namespace opt_NEON
} // namespace cv
#endif //OPENCV_FAST_CONVOLUTION_SIMD_HPP
#endif
}
}} // namespace cv::dnn
@@ -2,20 +2,147 @@
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// This file is modified from the ficus (https://github.com/vpisarev/ficus/blob/master/lib/NN/OpConv.fx).
// Here is the original license:
/*
This file is a part of ficus language project.
See ficus/LICENSE for the licensing terms
*/
#include "../../precomp.hpp"
#include "fast_convolution.hpp"
#include "../layers_common.hpp"
#include "convolution.hpp"
#include "conv_depthwise.simd.hpp"
#include "layers/cpu_kernels/conv_depthwise.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
namespace cv { namespace dnn {
static void depthWiseBlockConv2D(const float* wptr,
void depthWiseBlockConv2D(const float* wptr,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int pad_t, int pad_l,
const float* biasptr, const float* relu,
const float* inptr_,
int height, int width,
float* outptr_,
int out_d, int outH, int outW, bool fusedAdd);
void depthWiseBlockConv1D(const float* wptr,
int kernel_w, int stride_w, int dilation_w, int pad_l,
const float* biasptr, const float* relu,
const float* inptr_, int width,
float* outptr_,
int out_d, int outW, bool fusedAdd);
void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv>& conv, ActivationLayer* activ_,
const std::vector<float>& reluslope, bool fusedAdd)
{
Mat input = _input.getMat();
Mat output = _output.getMat();
MatShape inputShape = shape(input);
MatShape outputShape = shape(output);
CV_Assert(inputShape.size() == 3 || inputShape.size() == 4);
CV_Assert(inputShape.size() == outputShape.size());
int conv_dim = conv->conv_dim;
CV_Assert((conv_dim == CONV_2D || conv_dim == CONV_1D) &&
"DNN: Currently we do not support depth-wise for Convolution 3D!");
ActivationLayer* activ = reluslope.empty() ? activ_ : nullptr;
int N = inputShape[0], C = inputShape[1];
int Hi = conv_dim == CONV_1D ? 1 : inputShape[inputShape.size() - 2];
int Wi = inputShape[inputShape.size() - 1];
int K = conv->K, Hk = conv->Hk, Wk = conv->Wk;
int H0 = conv_dim == CONV_1D ? 1 : outputShape[outputShape.size() - 2];
int W0 = outputShape[outputShape.size() - 1];
int ngroups = conv->ngroups;
const size_t inp_planesize = (size_t) Hi * Wi;
const size_t out_planesize = (size_t) H0 * W0;
CV_Assert(ngroups > 1 && ngroups == K && ngroups == C);
int stride_h = conv->stride_h, stride_w = conv->stride_w;
int dilation_h = conv->dilation_h, dilation_w = conv->dilation_w;
int pad_top = conv->pad_top, pad_bottom = conv->pad_bottom;
int pad_left = conv->pad_left, pad_right = conv->pad_right;
int ksize = Hk * Wk;
const int VEC_NLANES = 32;
int padded_ksize = ((ksize + VEC_NLANES-1) / VEC_NLANES) * VEC_NLANES;
const float *inp = input.ptr<float>();
float *out = output.ptr<float>();
#if CV_TRY_AVX2 || CV_TRY_AVX || CV_TRY_RVV
// TODO: remove the following limitation, need change code in conv_depthwise.simd.hpp.
bool canRunOpt = Wi >= 16 + dilation_w*(Wk - 1) && !fusedAdd;
#endif
std::vector<int> ofstab_(3 * ksize, 0);
int *ofstab = ofstab_.data();
int *yxtab = ofstab + ksize;
for (int k = 0; k < ksize; k++)
{
int y = k < ksize ? k / Wk : 0;
int x = k < ksize ? k % Wk : 0;
int dy = y * dilation_h, dx = x * dilation_w;
yxtab[k * 2] = dy;
yxtab[k * 2 + 1] = dx;
ofstab[k] = dy * Wi + dx;
}
const float *weights0 = conv->weightsBufPtr, *bias = conv->biasBuf.data();
const float* relu = reluslope.data();
CV_Assert(ksize > 1 || (pad_left == 0 && pad_right == 0 && pad_top == 0 && pad_bottom == 0));
parallel_for_(Range(0, N * C), [&](const Range &r0) {
for (int nc = r0.start; nc < r0.end; nc++)
{
int c = nc % C;
const float *inptr0 = inp + inp_planesize * nc;
float *outptr0 = out + out_planesize * nc;
const float *weights = weights0 + c * padded_ksize;
if (conv_dim == CONV_2D)
{
#if CV_TRY_AVX2
if(canRunOpt && conv->useAVX2)
opt_AVX2::fastDepthwiseConv(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w,
pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0);
else
#endif
#if CV_TRY_AVX
if(canRunOpt && conv->useAVX)
opt_AVX::fastDepthwiseConv(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w,
pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0);
else
#endif
#if CV_TRY_RVV
if(canRunOpt && conv->useRVV)
opt_RVV::fastDepthwiseConv(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w,
pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0);
else
#endif
depthWiseBlockConv2D(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w,
pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0, fusedAdd);
}
else // conv_dim == CONV_1D, spatial branch for depth-wise Conv1D.
{
depthWiseBlockConv1D(weights, Wk, stride_w, dilation_w, pad_left, bias, relu, inptr0, Wi, outptr0, c, W0, fusedAdd);
}
if (activ)
activ->forwardSlice(outptr0, outptr0, (int) out_planesize, out_planesize, c, c+1);
}});
}
/****************************************************************************************\
SIMD and no-SIMD code for depthWiseBlockConv
\****************************************************************************************/
void depthWiseBlockConv2D(const float* wptr,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
@@ -199,7 +326,7 @@ static void depthWiseBlockConv2D(const float* wptr,
}
}
static void depthWiseBlockConv1D(const float* wptr,
void depthWiseBlockConv1D(const float* wptr,
int kernel_w, int stride_w, int dilation_w, int pad_l,
const float* biasptr, const float* relu,
const float* inptr_, int width,
@@ -332,114 +459,5 @@ static void depthWiseBlockConv1D(const float* wptr,
}
}
void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv>& conv, ActivationLayer* activ_,
const std::vector<float>& reluslope, bool fusedAdd)
{
Mat input = _input.getMat();
Mat output = _output.getMat();
MatShape inputShape = shape(input);
MatShape outputShape = shape(output);
CV_Assert(inputShape.size() == 3 || inputShape.size() == 4);
CV_Assert(inputShape.size() == outputShape.size());
int conv_dim = conv->conv_dim;
CV_Assert((conv_dim == CONV_2D || conv_dim == CONV_1D) &&
"DNN: Currently we do not support depth-wise for Convolution 3D!");
ActivationLayer* activ = reluslope.empty() ? activ_ : nullptr;
int N = inputShape[0], C = inputShape[1];
int Hi = conv_dim == CONV_1D ? 1 : inputShape[inputShape.size() - 2];
int Wi = inputShape[inputShape.size() - 1];
int K = conv->K, Hk = conv->Hk, Wk = conv->Wk;
int H0 = conv_dim == CONV_1D ? 1 : outputShape[outputShape.size() - 2];
int W0 = outputShape[outputShape.size() - 1];
int ngroups = conv->ngroups;
const size_t inp_planesize = (size_t) Hi * Wi;
const size_t out_planesize = (size_t) H0 * W0;
CV_Assert(ngroups > 1 && ngroups == K && ngroups == C);
int stride_h = conv->stride_h, stride_w = conv->stride_w;
int dilation_h = conv->dilation_h, dilation_w = conv->dilation_w;
int pad_top = conv->pad_top, pad_bottom = conv->pad_bottom;
int pad_left = conv->pad_left, pad_right = conv->pad_right;
int ksize = Hk * Wk;
const int VEC_NLANES = 32;
int padded_ksize = ((ksize + VEC_NLANES-1) / VEC_NLANES) * VEC_NLANES;
const float *inp = input.ptr<float>();
float *out = output.ptr<float>();
#if CV_TRY_AVX2 || CV_TRY_AVX || CV_TRY_RVV
// TODO: remove the following limitation, need change code in layers_common.simd.hpp.
bool canRunOpt = Wi >= 16 + dilation_w*(Wk - 1) && !fusedAdd;
#endif
std::vector<int> ofstab_(3 * ksize, 0);
int *ofstab = ofstab_.data();
int *yxtab = ofstab + ksize;
for (int k = 0; k < ksize; k++)
{
int y = k < ksize ? k / Wk : 0;
int x = k < ksize ? k % Wk : 0;
int dy = y * dilation_h, dx = x * dilation_w;
yxtab[k * 2] = dy;
yxtab[k * 2 + 1] = dx;
ofstab[k] = dy * Wi + dx;
}
const float *weights0 = conv->weightsBufPtr, *bias = conv->biasBuf.data();
const float* relu = reluslope.data();
CV_Assert(ksize > 1 || (pad_left == 0 && pad_right == 0 && pad_top == 0 && pad_bottom == 0));
parallel_for_(Range(0, N * C), [&](const Range &r0) {
for (int nc = r0.start; nc < r0.end; nc++)
{
int c = nc % C;
const float *inptr0 = inp + inp_planesize * nc;
float *outptr0 = out + out_planesize * nc;
const float *weights = weights0 + c * padded_ksize;
if (conv_dim == CONV_2D)
{
#if CV_TRY_AVX2
if(canRunOpt && conv->useAVX2)
opt_AVX2::fastDepthwiseConv(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w,
pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0);
else
#endif
#if CV_TRY_AVX
if(canRunOpt && conv->useAVX)
opt_AVX::fastDepthwiseConv(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w,
pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0);
else
#endif
#if CV_TRY_RVV
if(canRunOpt && conv->useRVV)
opt_RVV::fastDepthwiseConv(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w,
pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0);
else
#endif
depthWiseBlockConv2D(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w,
pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0, fusedAdd);
}
else // conv_dim == CONV_1D, spatial branch for depth-wise Conv1D.
{
depthWiseBlockConv1D(weights, Wk, stride_w, dilation_w, pad_left, bias, relu, inptr0, Wi, outptr0, c, W0, fusedAdd);
}
if (activ)
activ->forwardSlice(outptr0, outptr0, (int) out_planesize, out_planesize, c, c+1);
}});
}
}} // namespace cv::dnn
@@ -22,27 +22,29 @@
// Winograd Params
enum {
_FX_WINO_STEP=6,
_FX_WINO_KSIZE=3,
_FX_WINO_SIZE=_FX_WINO_STEP+_FX_WINO_KSIZE-1,
_FX_WINO_AREA=_FX_WINO_SIZE*_FX_WINO_SIZE,
CONV_WINO_STEP=6,
CONV_WINO_KSIZE=3,
CONV_WINO_SIZE=CONV_WINO_STEP+CONV_WINO_KSIZE-1, // 8
CONV_WINO_AREA=CONV_WINO_SIZE*CONV_WINO_SIZE,
_FX_WINO_KBLOCK = 4,
CONV_WINO_KBLOCK = 4,
#if (CV_NEON && CV_NEON_AARCH64) || CV_TRY_AVX2
_FX_WINO_IBLOCK = 6,
CONV_WINO_IBLOCK = 6,
#else
_FX_WINO_IBLOCK = 3,
CONV_WINO_IBLOCK = 3,
#endif
#if CV_TRY_AVX2
_FX_WINO_ATOM_F32 = 8,
CONV_WINO_ATOM_F32 = 8,
#else
_FX_WINO_ATOM_F32 = 4,
CONV_WINO_ATOM_F32 = 4,
#endif
_FX_WINO_NATOMS_F32 = _FX_WINO_AREA / _FX_WINO_ATOM_F32, // for AVX2, it is 8, otherwise, it's 16.
CONV_WINO_NATOMS_F32 = CONV_WINO_AREA / CONV_WINO_ATOM_F32, // for AVX2, it is 8, otherwise, it's 16.
};
enum { _FX_CONV_TYPE_GENERIC=0, _FX_CONV_TYPE_DEPTHWISE=1, _FX_CONV_TYPE_WINOGRAD3X3=2, _FX_CONV_TYPE_DEPTHWISE_REMAIN=3 };
// NOTE that: CONV_TYPE_DEPTHWISE is for 3x3 depthwise conv, and others depthwise will be set as CONV_TYPE_DEPTHWISE_REMAIN.
enum { CONV_TYPE_GENERIC=0, CONV_TYPE_DEPTHWISE=1, CONV_TYPE_WINOGRAD3X3=2, CONV_TYPE_DEPTHWISE_REMAIN=3 };
enum { CONV_1D = 0, CONV_2D = 1, CONV_3D = 2 };
#endif
@@ -105,22 +107,6 @@ void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv>& c
int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv>& conv, int ntasks,
float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct);
namespace opt_AVX2
{
#if CV_TRY_AVX2
void convBlock_AVX2(int np, const float* a, const float* b, float* c, int ldc, bool init_c);
void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c, const float minval,
const float maxval, bool ifMinMaxAct);
void _fx_winograd_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock);
void _fx_winograd_BtXB_8x8_f32(const float* inptr, int inpstep, float* outptr, int Cg);
void _fx_winograd_AtXA_8x8_f32(const float* inptr, int inpstep, float* bpptr, int bpstep, float* outptr, int outstep,
float bias, float minval, float maxval, bool ifMinMaxAct);
#endif
} // namespace opt_AVX2
} // namespace dnn
} // namespace cv