提交 6b885884 作者: lixian

fix kernel parameter for int8 conv kernel

上级 d46a0e4d
......@@ -127,102 +127,102 @@ ConvDwInt8Center:
subs x20, x20, #1
bne LoopKh16
sqshl v0.4s, v0.4s ,v26.4s
sqshl v1.4s, v1.4s ,v26.4s
sqshl v2.4s, v2.4s ,v26.4s
sqshl v3.4s, v3.4s ,v26.4s
sqshl v4.4s, v4.4s ,v26.4s
sqshl v5.4s, v5.4s ,v26.4s
sqshl v6.4s, v6.4s ,v26.4s
sqshl v7.4s, v7.4s ,v26.4s
sqshl v8.4s, v8.4s ,v26.4s
sqshl v9.4s, v9.4s ,v26.4s
sqshl v10.4s, v10.4s ,v26.4s
sqshl v11.4s, v11.4s ,v26.4s
sqshl v12.4s, v12.4s ,v26.4s
sqshl v13.4s, v13.4s ,v26.4s
sqshl v14.4s, v14.4s ,v26.4s
sqshl v15.4s, v15.4s ,v26.4s
sqrdmulh v0.4s, v0.4s ,v27.4s
sqrdmulh v1.4s, v1.4s ,v27.4s
sqrdmulh v2.4s, v2.4s ,v27.4s
sqrdmulh v3.4s, v3.4s ,v27.4s
sqrdmulh v4.4s, v4.4s ,v27.4s
sqrdmulh v5.4s, v5.4s ,v27.4s
sqrdmulh v6.4s, v6.4s ,v27.4s
sqrdmulh v7.4s, v7.4s ,v27.4s
sqrdmulh v8.4s, v8.4s ,v27.4s
sqrdmulh v9.4s, v9.4s ,v27.4s
sqrdmulh v10.4s, v10.4s ,v27.4s
sqrdmulh v11.4s, v11.4s ,v27.4s
sqrdmulh v12.4s, v12.4s ,v27.4s
sqrdmulh v13.4s, v13.4s ,v27.4s
sqrdmulh v14.4s, v14.4s ,v27.4s
sqrdmulh v15.4s, v15.4s ,v27.4s
sqrshl v0.4s, v0.4s ,v28.4s
sqrshl v1.4s, v1.4s ,v28.4s
sqrshl v2.4s, v2.4s ,v28.4s
sqrshl v3.4s, v3.4s ,v28.4s
sqrshl v4.4s, v4.4s ,v28.4s
sqrshl v5.4s, v5.4s ,v28.4s
sqrshl v6.4s, v6.4s ,v28.4s
sqrshl v7.4s, v7.4s ,v28.4s
sqrshl v8.4s, v8.4s ,v28.4s
sqrshl v9.4s, v9.4s ,v28.4s
sqrshl v10.4s, v10.4s ,v28.4s
sqrshl v11.4s, v11.4s ,v28.4s
sqrshl v12.4s, v12.4s ,v28.4s
sqrshl v13.4s, v13.4s ,v28.4s
sqrshl v14.4s, v14.4s ,v28.4s
sqrshl v15.4s, v15.4s ,v28.4s
add v0.4s, v0.4s ,v29.4s
add v1.4s, v1.4s ,v29.4s
add v2.4s, v2.4s ,v29.4s
add v3.4s, v3.4s ,v29.4s
add v4.4s, v4.4s ,v29.4s
add v5.4s, v5.4s ,v29.4s
add v6.4s, v6.4s ,v29.4s
add v7.4s, v7.4s ,v29.4s
add v8.4s, v8.4s ,v29.4s
add v9.4s, v9.4s ,v29.4s
add v10.4s, v10.4s ,v29.4s
add v11.4s, v11.4s ,v29.4s
add v12.4s, v12.4s ,v29.4s
add v13.4s, v13.4s ,v29.4s
add v14.4s, v14.4s ,v29.4s
add v15.4s, v15.4s ,v29.4s
smax v0.4s, v0.4s ,v30.4s
smax v1.4s, v1.4s ,v30.4s
smax v2.4s, v2.4s ,v30.4s
smax v3.4s, v3.4s ,v30.4s
smax v4.4s, v4.4s ,v30.4s
smax v5.4s, v5.4s ,v30.4s
smax v6.4s, v6.4s ,v30.4s
smax v7.4s, v7.4s ,v30.4s
smax v8.4s, v8.4s ,v30.4s
smax v9.4s, v9.4s ,v30.4s
smax v10.4s, v10.4s ,v30.4s
smax v11.4s, v11.4s ,v30.4s
smax v12.4s, v12.4s ,v30.4s
smax v13.4s, v13.4s ,v30.4s
smax v14.4s, v14.4s ,v30.4s
smax v15.4s, v15.4s ,v30.4s
smin v0.4s, v0.4s ,v31.4s
smin v1.4s, v1.4s ,v31.4s
smin v2.4s, v2.4s ,v31.4s
smin v3.4s, v3.4s ,v31.4s
smin v4.4s, v4.4s ,v31.4s
smin v5.4s, v5.4s ,v31.4s
smin v6.4s, v6.4s ,v31.4s
smin v7.4s, v7.4s ,v31.4s
smin v8.4s, v8.4s ,v31.4s
smin v9.4s, v9.4s ,v31.4s
smin v10.4s, v10.4s ,v31.4s
smin v11.4s, v11.4s ,v31.4s
smin v12.4s, v12.4s ,v31.4s
smin v13.4s, v13.4s ,v31.4s
smin v14.4s, v14.4s ,v31.4s
smin v15.4s, v15.4s ,v31.4s
sqshl v0.4s, v0.4s, v26.4s
sqshl v1.4s, v1.4s, v26.4s
sqshl v2.4s, v2.4s, v26.4s
sqshl v3.4s, v3.4s, v26.4s
sqshl v4.4s, v4.4s, v26.4s
sqshl v5.4s, v5.4s, v26.4s
sqshl v6.4s, v6.4s, v26.4s
sqshl v7.4s, v7.4s, v26.4s
sqshl v8.4s, v8.4s, v26.4s
sqshl v9.4s, v9.4s, v26.4s
sqshl v10.4s, v10.4s, v26.4s
sqshl v11.4s, v11.4s, v26.4s
sqshl v12.4s, v12.4s, v26.4s
sqshl v13.4s, v13.4s, v26.4s
sqshl v14.4s, v14.4s, v26.4s
sqshl v15.4s, v15.4s, v26.4s
sqrdmulh v0.4s, v0.4s, v27.4s
sqrdmulh v1.4s, v1.4s, v27.4s
sqrdmulh v2.4s, v2.4s, v27.4s
sqrdmulh v3.4s, v3.4s, v27.4s
sqrdmulh v4.4s, v4.4s, v27.4s
sqrdmulh v5.4s, v5.4s, v27.4s
sqrdmulh v6.4s, v6.4s, v27.4s
sqrdmulh v7.4s, v7.4s, v27.4s
sqrdmulh v8.4s, v8.4s, v27.4s
sqrdmulh v9.4s, v9.4s, v27.4s
sqrdmulh v10.4s, v10.4s, v27.4s
sqrdmulh v11.4s, v11.4s, v27.4s
sqrdmulh v12.4s, v12.4s, v27.4s
sqrdmulh v13.4s, v13.4s, v27.4s
sqrdmulh v14.4s, v14.4s, v27.4s
sqrdmulh v15.4s, v15.4s, v27.4s
sqrshl v0.4s, v0.4s, v28.4s
sqrshl v1.4s, v1.4s, v28.4s
sqrshl v2.4s, v2.4s, v28.4s
sqrshl v3.4s, v3.4s, v28.4s
sqrshl v4.4s, v4.4s, v28.4s
sqrshl v5.4s, v5.4s, v28.4s
sqrshl v6.4s, v6.4s, v28.4s
sqrshl v7.4s, v7.4s, v28.4s
sqrshl v8.4s, v8.4s, v28.4s
sqrshl v9.4s, v9.4s, v28.4s
sqrshl v10.4s, v10.4s, v28.4s
sqrshl v11.4s, v11.4s, v28.4s
sqrshl v12.4s, v12.4s, v28.4s
sqrshl v13.4s, v13.4s, v28.4s
sqrshl v14.4s, v14.4s, v28.4s
sqrshl v15.4s, v15.4s, v28.4s
add v0.4s, v0.4s, v29.4s
add v1.4s, v1.4s, v29.4s
add v2.4s, v2.4s, v29.4s
add v3.4s, v3.4s, v29.4s
add v4.4s, v4.4s, v29.4s
add v5.4s, v5.4s, v29.4s
add v6.4s, v6.4s, v29.4s
add v7.4s, v7.4s, v29.4s
add v8.4s, v8.4s, v29.4s
add v9.4s, v9.4s, v29.4s
add v10.4s, v10.4s, v29.4s
add v11.4s, v11.4s, v29.4s
add v12.4s, v12.4s, v29.4s
add v13.4s, v13.4s, v29.4s
add v14.4s, v14.4s, v29.4s
add v15.4s, v15.4s, v29.4s
smax v0.4s, v0.4s, v30.4s
smax v1.4s, v1.4s, v30.4s
smax v2.4s, v2.4s, v30.4s
smax v3.4s, v3.4s, v30.4s
smax v4.4s, v4.4s, v30.4s
smax v5.4s, v5.4s, v30.4s
smax v6.4s, v6.4s, v30.4s
smax v7.4s, v7.4s, v30.4s
smax v8.4s, v8.4s, v30.4s
smax v9.4s, v9.4s, v30.4s
smax v10.4s, v10.4s, v30.4s
smax v11.4s, v11.4s, v30.4s
smax v12.4s, v12.4s, v30.4s
smax v13.4s, v13.4s, v30.4s
smax v14.4s, v14.4s, v30.4s
smax v15.4s, v15.4s, v30.4s
smin v0.4s, v0.4s, v31.4s
smin v1.4s, v1.4s, v31.4s
smin v2.4s, v2.4s, v31.4s
smin v3.4s, v3.4s, v31.4s
smin v4.4s, v4.4s, v31.4s
smin v5.4s, v5.4s, v31.4s
smin v6.4s, v6.4s, v31.4s
smin v7.4s, v7.4s, v31.4s
smin v8.4s, v8.4s, v31.4s
smin v9.4s, v9.4s, v31.4s
smin v10.4s, v10.4s, v31.4s
smin v11.4s, v11.4s, v31.4s
smin v12.4s, v12.4s, v31.4s
smin v13.4s, v13.4s, v31.4s
smin v14.4s, v14.4s, v31.4s
smin v15.4s, v15.4s, v31.4s
sqxtn v0.4h, v0.4s
sqxtn v1.4h, v1.4s
......@@ -391,54 +391,54 @@ ConvDwInt8Center:
subs x20, x20, #1
bne LoopKh8
sqshl v0.4s, v0.4s ,v26.4s
sqshl v1.4s, v1.4s ,v26.4s
sqshl v2.4s, v2.4s ,v26.4s
sqshl v3.4s, v3.4s ,v26.4s
sqshl v4.4s, v4.4s ,v26.4s
sqshl v5.4s, v5.4s ,v26.4s
sqshl v6.4s, v6.4s ,v26.4s
sqshl v7.4s, v7.4s ,v26.4s
sqrdmulh v0.4s, v0.4s ,v27.4s
sqrdmulh v1.4s, v1.4s ,v27.4s
sqrdmulh v2.4s, v2.4s ,v27.4s
sqrdmulh v3.4s, v3.4s ,v27.4s
sqrdmulh v4.4s, v4.4s ,v27.4s
sqrdmulh v5.4s, v5.4s ,v27.4s
sqrdmulh v6.4s, v6.4s ,v27.4s
sqrdmulh v7.4s, v7.4s ,v27.4s
sqrshl v0.4s, v0.4s ,v28.4s
sqrshl v1.4s, v1.4s ,v28.4s
sqrshl v2.4s, v2.4s ,v28.4s
sqrshl v3.4s, v3.4s ,v28.4s
sqrshl v4.4s, v4.4s ,v28.4s
sqrshl v5.4s, v5.4s ,v28.4s
sqrshl v6.4s, v6.4s ,v28.4s
sqrshl v7.4s, v7.4s ,v28.4s
add v0.4s, v0.4s ,v29.4s
add v1.4s, v1.4s ,v29.4s
add v2.4s, v2.4s ,v29.4s
add v3.4s, v3.4s ,v29.4s
add v4.4s, v4.4s ,v29.4s
add v5.4s, v5.4s ,v29.4s
add v6.4s, v6.4s ,v29.4s
add v7.4s, v7.4s ,v29.4s
smax v0.4s, v0.4s ,v30.4s
smax v1.4s, v1.4s ,v30.4s
smax v2.4s, v2.4s ,v30.4s
smax v3.4s, v3.4s ,v30.4s
smax v4.4s, v4.4s ,v30.4s
smax v5.4s, v5.4s ,v30.4s
smax v6.4s, v6.4s ,v30.4s
smax v7.4s, v7.4s ,v30.4s
smin v0.4s, v0.4s ,v31.4s
smin v1.4s, v1.4s ,v31.4s
smin v2.4s, v2.4s ,v31.4s
smin v3.4s, v3.4s ,v31.4s
smin v4.4s, v4.4s ,v31.4s
smin v5.4s, v5.4s ,v31.4s
smin v6.4s, v6.4s ,v31.4s
smin v7.4s, v7.4s ,v31.4s
sqshl v0.4s, v0.4s, v26.4s
sqshl v1.4s, v1.4s, v26.4s
sqshl v2.4s, v2.4s, v26.4s
sqshl v3.4s, v3.4s, v26.4s
sqshl v4.4s, v4.4s, v26.4s
sqshl v5.4s, v5.4s, v26.4s
sqshl v6.4s, v6.4s, v26.4s
sqshl v7.4s, v7.4s, v26.4s
sqrdmulh v0.4s, v0.4s, v27.4s
sqrdmulh v1.4s, v1.4s, v27.4s
sqrdmulh v2.4s, v2.4s, v27.4s
sqrdmulh v3.4s, v3.4s, v27.4s
sqrdmulh v4.4s, v4.4s, v27.4s
sqrdmulh v5.4s, v5.4s, v27.4s
sqrdmulh v6.4s, v6.4s, v27.4s
sqrdmulh v7.4s, v7.4s, v27.4s
sqrshl v0.4s, v0.4s, v28.4s
sqrshl v1.4s, v1.4s, v28.4s
sqrshl v2.4s, v2.4s, v28.4s
sqrshl v3.4s, v3.4s, v28.4s
sqrshl v4.4s, v4.4s, v28.4s
sqrshl v5.4s, v5.4s, v28.4s
sqrshl v6.4s, v6.4s, v28.4s
sqrshl v7.4s, v7.4s, v28.4s
add v0.4s, v0.4s, v29.4s
add v1.4s, v1.4s, v29.4s
add v2.4s, v2.4s, v29.4s
add v3.4s, v3.4s, v29.4s
add v4.4s, v4.4s, v29.4s
add v5.4s, v5.4s, v29.4s
add v6.4s, v6.4s, v29.4s
add v7.4s, v7.4s, v29.4s
smax v0.4s, v0.4s, v30.4s
smax v1.4s, v1.4s, v30.4s
smax v2.4s, v2.4s, v30.4s
smax v3.4s, v3.4s, v30.4s
smax v4.4s, v4.4s, v30.4s
smax v5.4s, v5.4s, v30.4s
smax v6.4s, v6.4s, v30.4s
smax v7.4s, v7.4s, v30.4s
smin v0.4s, v0.4s, v31.4s
smin v1.4s, v1.4s, v31.4s
smin v2.4s, v2.4s, v31.4s
smin v3.4s, v3.4s, v31.4s
smin v4.4s, v4.4s, v31.4s
smin v5.4s, v5.4s, v31.4s
smin v6.4s, v6.4s, v31.4s
smin v7.4s, v7.4s, v31.4s
sqxtn v0.4h, v0.4s
sqxtn v1.4h, v1.4s
......@@ -524,12 +524,12 @@ ConvDwInt8Center:
subs x20, x20, #1
bne LoopKh
sqshl v0.4s, v0.4s ,v26.4s
sqrdmulh v0.4s, v0.4s ,v27.4s
sqrshl v0.4s, v0.4s ,v28.4s
add v0.4s, v0.4s ,v29.4s
smax v0.4s, v0.4s ,v30.4s
smin v0.4s, v0.4s ,v31.4s
sqshl v0.4s, v0.4s, v26.4s
sqrdmulh v0.4s, v0.4s, v27.4s
sqrshl v0.4s, v0.4s, v28.4s
add v0.4s, v0.4s, v29.4s
smax v0.4s, v0.4s, v30.4s
smin v0.4s, v0.4s, v31.4s
sqxtn v0.4h, v0.4s
sqxtn v0.8b, v0.8h
......
......@@ -268,40 +268,40 @@ IndirectGemmStart:
Relu6:
movi v1.4s, #6
scvtf v1.4s, v1.4s
fmin v16.4s, v16.4s ,v1.4s
fmin v17.4s, v17.4s ,v1.4s
fmin v18.4s, v18.4s ,v1.4s
fmin v19.4s, v19.4s ,v1.4s
fmin v20.4s, v20.4s ,v1.4s
fmin v21.4s, v21.4s ,v1.4s
fmin v22.4s, v22.4s ,v1.4s
fmin v23.4s, v23.4s ,v1.4s
fmin v24.4s, v24.4s ,v1.4s
fmin v25.4s, v25.4s ,v1.4s
fmin v26.4s, v26.4s ,v1.4s
fmin v27.4s, v27.4s ,v1.4s
fmin v28.4s, v28.4s ,v1.4s
fmin v29.4s, v29.4s ,v1.4s
fmin v30.4s, v30.4s ,v1.4s
fmin v31.4s, v31.4s ,v1.4s
fmin v16.4s, v16.4s, v1.4s
fmin v17.4s, v17.4s, v1.4s
fmin v18.4s, v18.4s, v1.4s
fmin v19.4s, v19.4s, v1.4s
fmin v20.4s, v20.4s, v1.4s
fmin v21.4s, v21.4s, v1.4s
fmin v22.4s, v22.4s, v1.4s
fmin v23.4s, v23.4s, v1.4s
fmin v24.4s, v24.4s, v1.4s
fmin v25.4s, v25.4s, v1.4s
fmin v26.4s, v26.4s, v1.4s
fmin v27.4s, v27.4s, v1.4s
fmin v28.4s, v28.4s, v1.4s
fmin v29.4s, v29.4s, v1.4s
fmin v30.4s, v30.4s, v1.4s
fmin v31.4s, v31.4s, v1.4s
Relu:
dup v0.4s, wzr
fmax v16.4s, v16.4s ,v0.4s
fmax v17.4s, v17.4s ,v0.4s
fmax v18.4s, v18.4s ,v0.4s
fmax v19.4s, v19.4s ,v0.4s
fmax v20.4s, v20.4s ,v0.4s
fmax v21.4s, v21.4s ,v0.4s
fmax v22.4s, v22.4s ,v0.4s
fmax v23.4s, v23.4s ,v0.4s
fmax v24.4s, v24.4s ,v0.4s
fmax v25.4s, v25.4s ,v0.4s
fmax v26.4s, v26.4s ,v0.4s
fmax v27.4s, v27.4s ,v0.4s
fmax v28.4s, v28.4s ,v0.4s
fmax v29.4s, v29.4s ,v0.4s
fmax v30.4s, v30.4s ,v0.4s
fmax v31.4s, v31.4s ,v0.4s
fmax v16.4s, v16.4s, v0.4s
fmax v17.4s, v17.4s, v0.4s
fmax v18.4s, v18.4s, v0.4s
fmax v19.4s, v19.4s, v0.4s
fmax v20.4s, v20.4s, v0.4s
fmax v21.4s, v21.4s, v0.4s
fmax v22.4s, v22.4s, v0.4s
fmax v23.4s, v23.4s, v0.4s
fmax v24.4s, v24.4s, v0.4s
fmax v25.4s, v25.4s, v0.4s
fmax v26.4s, v26.4s, v0.4s
fmax v27.4s, v27.4s, v0.4s
fmax v28.4s, v28.4s, v0.4s
fmax v29.4s, v29.4s, v0.4s
fmax v30.4s, v30.4s, v0.4s
fmax v31.4s, v31.4s, v0.4s
WriteStart:
cbnz x9, WriteC4
......@@ -595,24 +595,24 @@ IndirectGemmStart:
Relu6Half:
movi v1.4s, #6
scvtf v1.4s, v1.4s
fmin v16.4s, v16.4s ,v1.4s
fmin v18.4s, v18.4s ,v1.4s
fmin v20.4s, v20.4s ,v1.4s
fmin v22.4s, v22.4s ,v1.4s
fmin v24.4s, v24.4s ,v1.4s
fmin v26.4s, v26.4s ,v1.4s
fmin v28.4s, v28.4s ,v1.4s
fmin v30.4s, v30.4s ,v1.4s
fmin v16.4s, v16.4s, v1.4s
fmin v18.4s, v18.4s, v1.4s
fmin v20.4s, v20.4s, v1.4s
fmin v22.4s, v22.4s, v1.4s
fmin v24.4s, v24.4s, v1.4s
fmin v26.4s, v26.4s, v1.4s
fmin v28.4s, v28.4s, v1.4s
fmin v30.4s, v30.4s, v1.4s
ReluHalf:
dup v0.4s, wzr
fmax v16.4s, v16.4s ,v0.4s
fmax v18.4s, v18.4s ,v0.4s
fmax v20.4s, v20.4s ,v0.4s
fmax v22.4s, v22.4s ,v0.4s
fmax v24.4s, v24.4s ,v0.4s
fmax v26.4s, v26.4s ,v0.4s
fmax v28.4s, v28.4s ,v0.4s
fmax v30.4s, v30.4s ,v0.4s
fmax v16.4s, v16.4s, v0.4s
fmax v18.4s, v18.4s, v0.4s
fmax v20.4s, v20.4s, v0.4s
fmax v22.4s, v22.4s, v0.4s
fmax v24.4s, v24.4s, v0.4s
fmax v26.4s, v26.4s, v0.4s
fmax v28.4s, v28.4s, v0.4s
fmax v30.4s, v30.4s, v0.4s
WriteStartHalf:
cbnz x9, Write4
......
......@@ -87,14 +87,15 @@ IndirectGemmInt8_4x4:
ld1 {v2.16b, v3.16b}, [x12], #32
smull v10.8h, v0.8b, v6.8b
smull v11.8h, v0.8b, v7.8b
saddlp v16.4s, v8.8h
smlal2 v10.8h, v0.16b, v6.16b
smlal2 v11.8h, v0.16b, v7.16b
saddlp v16.4s, v8.8h
saddlp v17.4s, v9.8h
smull v14.8h, v1.8b, v6.8b
smull v15.8h, v1.8b, v7.8b
saddlp v18.4s, v10.8h
smlal2 v14.8h, v1.16b, v6.16b
smlal2 v15.8h, v1.16b, v7.16b
saddlp v17.4s, v9.8h
subs x13, x5, #1
beq LoopIcEnd
......@@ -102,55 +103,55 @@ IndirectGemmInt8_4x4:
LoopIc:
// load input for output 1-8
ld1 {v0.16b, v1.16b}, [x12], #32
sadalp v18.4s, v10.8h
sadalp v19.4s, v11.8h
smull v8.8h, v2.8b, v4.8b
smull v9.8h, v2.8b, v5.8b
sadalp v19.4s, v11.8h
sadalp v20.4s, v12.8h
smlal2 v8.8h, v2.16b, v4.16b
smlal2 v9.8h, v2.16b, v5.16b
sadalp v20.4s, v12.8h
sadalp v21.4s, v13.8h
smull v10.8h, v2.8b, v6.8b
smull v11.8h, v2.8b, v7.8b
sadalp v21.4s, v13.8h
sadalp v22.4s, v14.8h
smlal2 v10.8h, v2.16b, v6.16b
smlal2 v11.8h, v2.16b, v7.16b
sadalp v22.4s, v14.8h
sadalp v23.4s, v15.8h
smull v12.8h, v3.8b, v4.8b
smull v13.8h, v3.8b, v5.8b
sadalp v23.4s, v15.8h
sadalp v24.4s, v8.8h
smlal2 v12.8h, v3.16b, v4.16b
smlal2 v13.8h, v3.16b, v5.16b
sadalp v24.4s, v8.8h
ld1 {v4.16b, v5.16b}, [x2], #32
sadalp v25.4s, v9.8h
smull v14.8h, v3.8b, v6.8b
smull v15.8h, v3.8b, v7.8b
sadalp v25.4s, v9.8h
sadalp v26.4s, v10.8h
smlal2 v14.8h, v3.16b, v6.16b
smlal2 v15.8h, v3.16b, v7.16b
sadalp v26.4s, v10.8h
ld1 {v6.16b, v7.16b}, [x2], #32
sadalp v27.4s, v11.8h
smull v8.8h, v0.8b, v4.8b
smull v9.8h, v0.8b, v5.8b
sadalp v27.4s, v11.8h
sadalp v28.4s, v12.8h
smlal2 v8.8h, v0.16b, v4.16b
smlal2 v9.8h, v0.16b, v5.16b
sadalp v28.4s, v12.8h
ld1 {v2.16b, v3.16b}, [x12], #32
sadalp v29.4s, v13.8h
smull v12.8h, v1.8b, v4.8b
smull v13.8h, v1.8b, v5.8b
sadalp v29.4s, v13.8h
sadalp v30.4s, v14.8h
smlal2 v12.8h, v1.16b, v4.16b
smlal2 v13.8h, v1.16b, v5.16b
sadalp v30.4s, v14.8h
sadalp v31.4s, v15.8h
smull v10.8h, v0.8b, v6.8b
smull v11.8h, v0.8b, v7.8b
sadalp v31.4s, v15.8h
sadalp v16.4s, v8.8h
smlal2 v10.8h, v0.16b, v6.16b
smlal2 v11.8h, v0.16b, v7.16b
sadalp v16.4s, v8.8h
sadalp v17.4s, v9.8h
smull v14.8h, v1.8b, v6.8b
smull v15.8h, v1.8b, v7.8b
sadalp v17.4s, v9.8h
saddlp v18.4s, v10.8h
smlal2 v14.8h, v1.16b, v6.16b
smlal2 v15.8h, v1.16b, v7.16b
......@@ -158,33 +159,32 @@ IndirectGemmInt8_4x4:
bne LoopIc
LoopIcEnd:
sadalp v18.4s, v10.8h
sadalp v19.4s, v11.8h
smull v8.8h, v2.8b, v4.8b
smull v9.8h, v2.8b, v5.8b
sadalp v19.4s, v11.8h
sadalp v20.4s, v12.8h
smlal2 v8.8h, v2.16b, v4.16b
smlal2 v9.8h, v2.16b, v5.16b
sadalp v20.4s, v12.8h
sadalp v21.4s, v13.8h
smull v10.8h, v2.8b, v6.8b
smull v11.8h, v2.8b, v7.8b
sadalp v21.4s, v13.8h
sadalp v22.4s, v14.8h
smlal2 v10.8h, v2.16b, v6.16b
smlal2 v11.8h, v2.16b, v7.16b
sadalp v22.4s, v14.8h
sadalp v23.4s, v15.8h
smull v12.8h, v3.8b, v4.8b
smull v13.8h, v3.8b, v5.8b
sadalp v23.4s, v15.8h
sadalp v24.4s, v8.8h
smlal2 v12.8h, v3.16b, v4.16b
smlal2 v13.8h, v3.16b, v5.16b
sadalp v24.4s, v8.8h
sadalp v25.4s, v9.8h
smull v14.8h, v3.8b, v6.8b
smull v15.8h, v3.8b, v7.8b
sadalp v25.4s, v9.8h
sadalp v26.4s, v10.8h
smlal2 v14.8h, v3.16b, v6.16b
smlal2 v15.8h, v3.16b, v7.16b
sadalp v26.4s, v10.8h
sadalp v27.4s, v11.8h
sadalp v28.4s ,v12.8h
sadalp v28.4s, v12.8h
sadalp v29.4s, v13.8h
sadalp v30.4s, v14.8h
sadalp v31.4s, v15.8h
......@@ -204,6 +204,7 @@ IndirectGemmInt8_4x4:
addp v26.4s, v26.4s, v27.4s
addp v28.4s, v28.4s, v29.4s
addp v30.4s, v30.4s, v31.4s
dup v12.4s, wzr
cbz x3, NoReadBias
ld1 {v12.4s}, [x3]
NoReadBias:
......@@ -221,40 +222,40 @@ IndirectGemmInt8_4x4:
add v28.4s, v28.4s, v12.4s
dup v2.4s, w18
sqshl v16.4s, v16.4s ,v2.4s
sqshl v20.4s, v20.4s ,v2.4s
sqshl v24.4s, v24.4s ,v2.4s
sqshl v28.4s, v28.4s ,v2.4s
sqshl v16.4s, v16.4s, v2.4s
sqshl v20.4s, v20.4s, v2.4s
sqshl v24.4s, v24.4s, v2.4s
sqshl v28.4s, v28.4s, v2.4s
dup v3.4s, w17
sqrdmulh v16.4s, v16.4s ,v3.4s
sqrdmulh v20.4s, v20.4s ,v3.4s
sqrdmulh v24.4s, v24.4s ,v3.4s
sqrdmulh v28.4s, v28.4s ,v3.4s
sqrdmulh v16.4s, v16.4s, v3.4s
sqrdmulh v20.4s, v20.4s, v3.4s
sqrdmulh v24.4s, v24.4s, v3.4s
sqrdmulh v28.4s, v28.4s, v3.4s
dup v4.4s, w19
sqrshl v16.4s, v16.4s ,v4.4s
sqrshl v20.4s, v20.4s ,v4.4s
sqrshl v24.4s, v24.4s ,v4.4s
sqrshl v28.4s, v28.4s ,v4.4s
sqrshl v16.4s, v16.4s, v4.4s
sqrshl v20.4s, v20.4s, v4.4s
sqrshl v24.4s, v24.4s, v4.4s
sqrshl v28.4s, v28.4s, v4.4s
dup v5.4s, w16
add v16.4s, v16.4s ,v5.4s
add v20.4s, v20.4s ,v5.4s
add v24.4s, v24.4s ,v5.4s
add v28.4s, v28.4s ,v5.4s
add v16.4s, v16.4s, v5.4s
add v20.4s, v20.4s, v5.4s
add v24.4s, v24.4s, v5.4s
add v28.4s, v28.4s, v5.4s
dup v0.4s, w8
smax v16.4s, v16.4s ,v0.4s
smax v20.4s, v20.4s ,v0.4s
smax v24.4s, v24.4s ,v0.4s
smax v28.4s, v28.4s ,v0.4s
smax v16.4s, v16.4s, v0.4s
smax v20.4s, v20.4s, v0.4s
smax v24.4s, v24.4s, v0.4s
smax v28.4s, v28.4s, v0.4s
dup v1.4s, w9
smin v16.4s, v16.4s ,v1.4s
smin v20.4s, v20.4s ,v1.4s
smin v24.4s, v24.4s ,v1.4s
smin v28.4s, v28.4s ,v1.4s
smin v16.4s, v16.4s, v1.4s
smin v20.4s, v20.4s, v1.4s
smin v24.4s, v24.4s, v1.4s
smin v28.4s, v28.4s, v1.4s
sqxtn v13.4h, v16.4s
sqxtn2 v13.8h, v20.4s
......
......@@ -277,160 +277,160 @@ IndirectGemmInt8_24x4_dp:
Quantization:
dup v2.4s, w18
sqshl v8.4s, v8.4s ,v2.4s
sqshl v9.4s, v9.4s ,v2.4s
sqshl v10.4s, v10.4s ,v2.4s
sqshl v11.4s, v11.4s ,v2.4s
sqshl v12.4s, v12.4s ,v2.4s
sqshl v13.4s, v13.4s ,v2.4s
sqshl v14.4s, v14.4s ,v2.4s
sqshl v15.4s, v15.4s ,v2.4s
sqshl v16.4s, v16.4s ,v2.4s
sqshl v17.4s, v17.4s ,v2.4s
sqshl v18.4s, v18.4s ,v2.4s
sqshl v19.4s, v19.4s ,v2.4s
sqshl v20.4s, v20.4s ,v2.4s
sqshl v21.4s, v21.4s ,v2.4s
sqshl v22.4s, v22.4s ,v2.4s
sqshl v23.4s, v23.4s ,v2.4s
sqshl v24.4s, v24.4s ,v2.4s
sqshl v25.4s, v25.4s ,v2.4s
sqshl v26.4s, v26.4s ,v2.4s
sqshl v27.4s, v27.4s ,v2.4s
sqshl v28.4s, v28.4s ,v2.4s
sqshl v29.4s, v29.4s ,v2.4s
sqshl v30.4s, v30.4s ,v2.4s
sqshl v31.4s, v31.4s ,v2.4s
sqshl v8.4s, v8.4s, v2.4s
sqshl v9.4s, v9.4s, v2.4s
sqshl v10.4s, v10.4s, v2.4s
sqshl v11.4s, v11.4s, v2.4s
sqshl v12.4s, v12.4s, v2.4s
sqshl v13.4s, v13.4s, v2.4s
sqshl v14.4s, v14.4s, v2.4s
sqshl v15.4s, v15.4s, v2.4s
sqshl v16.4s, v16.4s, v2.4s
sqshl v17.4s, v17.4s, v2.4s
sqshl v18.4s, v18.4s, v2.4s
sqshl v19.4s, v19.4s, v2.4s
sqshl v20.4s, v20.4s, v2.4s
sqshl v21.4s, v21.4s, v2.4s
sqshl v22.4s, v22.4s, v2.4s
sqshl v23.4s, v23.4s, v2.4s
sqshl v24.4s, v24.4s, v2.4s
sqshl v25.4s, v25.4s, v2.4s
sqshl v26.4s, v26.4s, v2.4s
sqshl v27.4s, v27.4s, v2.4s
sqshl v28.4s, v28.4s, v2.4s
sqshl v29.4s, v29.4s, v2.4s
sqshl v30.4s, v30.4s, v2.4s
sqshl v31.4s, v31.4s, v2.4s
dup v3.4s, w17
sqrdmulh v8.4s, v8.4s ,v3.4s
sqrdmulh v9.4s, v9.4s ,v3.4s
sqrdmulh v10.4s, v10.4s ,v3.4s
sqrdmulh v11.4s, v11.4s ,v3.4s
sqrdmulh v12.4s, v12.4s ,v3.4s
sqrdmulh v13.4s, v13.4s ,v3.4s
sqrdmulh v14.4s, v14.4s ,v3.4s
sqrdmulh v15.4s, v15.4s ,v3.4s
sqrdmulh v16.4s, v16.4s ,v3.4s
sqrdmulh v17.4s, v17.4s ,v3.4s
sqrdmulh v18.4s, v18.4s ,v3.4s
sqrdmulh v19.4s, v19.4s ,v3.4s
sqrdmulh v20.4s, v20.4s ,v3.4s
sqrdmulh v21.4s, v21.4s ,v3.4s
sqrdmulh v22.4s, v22.4s ,v3.4s
sqrdmulh v23.4s, v23.4s ,v3.4s
sqrdmulh v24.4s, v24.4s ,v3.4s
sqrdmulh v25.4s, v25.4s ,v3.4s
sqrdmulh v26.4s, v26.4s ,v3.4s
sqrdmulh v27.4s, v27.4s ,v3.4s
sqrdmulh v28.4s, v28.4s ,v3.4s
sqrdmulh v29.4s, v29.4s ,v3.4s
sqrdmulh v30.4s, v30.4s ,v3.4s
sqrdmulh v31.4s, v31.4s ,v3.4s
sqrdmulh v8.4s, v8.4s, v3.4s
sqrdmulh v9.4s, v9.4s, v3.4s
sqrdmulh v10.4s, v10.4s, v3.4s
sqrdmulh v11.4s, v11.4s, v3.4s
sqrdmulh v12.4s, v12.4s, v3.4s
sqrdmulh v13.4s, v13.4s, v3.4s
sqrdmulh v14.4s, v14.4s, v3.4s
sqrdmulh v15.4s, v15.4s, v3.4s
sqrdmulh v16.4s, v16.4s, v3.4s
sqrdmulh v17.4s, v17.4s, v3.4s
sqrdmulh v18.4s, v18.4s, v3.4s
sqrdmulh v19.4s, v19.4s, v3.4s
sqrdmulh v20.4s, v20.4s, v3.4s
sqrdmulh v21.4s, v21.4s, v3.4s
sqrdmulh v22.4s, v22.4s, v3.4s
sqrdmulh v23.4s, v23.4s, v3.4s
sqrdmulh v24.4s, v24.4s, v3.4s
sqrdmulh v25.4s, v25.4s, v3.4s
sqrdmulh v26.4s, v26.4s, v3.4s
sqrdmulh v27.4s, v27.4s, v3.4s
sqrdmulh v28.4s, v28.4s, v3.4s
sqrdmulh v29.4s, v29.4s, v3.4s
sqrdmulh v30.4s, v30.4s, v3.4s
sqrdmulh v31.4s, v31.4s, v3.4s
dup v4.4s, w19
sqrshl v8.4s, v8.4s ,v4.4s
sqrshl v9.4s, v9.4s ,v4.4s
sqrshl v10.4s, v10.4s ,v4.4s
sqrshl v11.4s, v11.4s ,v4.4s
sqrshl v12.4s, v12.4s ,v4.4s
sqrshl v13.4s, v13.4s ,v4.4s
sqrshl v14.4s, v14.4s ,v4.4s
sqrshl v15.4s, v15.4s ,v4.4s
sqrshl v16.4s, v16.4s ,v4.4s
sqrshl v17.4s, v17.4s ,v4.4s
sqrshl v18.4s, v18.4s ,v4.4s
sqrshl v19.4s, v19.4s ,v4.4s
sqrshl v20.4s, v20.4s ,v4.4s
sqrshl v21.4s, v21.4s ,v4.4s
sqrshl v22.4s, v22.4s ,v4.4s
sqrshl v23.4s, v23.4s ,v4.4s
sqrshl v24.4s, v24.4s ,v4.4s
sqrshl v25.4s, v25.4s ,v4.4s
sqrshl v26.4s, v26.4s ,v4.4s
sqrshl v27.4s, v27.4s ,v4.4s
sqrshl v28.4s, v28.4s ,v4.4s
sqrshl v29.4s, v29.4s ,v4.4s
sqrshl v30.4s, v30.4s ,v4.4s
sqrshl v31.4s, v31.4s ,v4.4s
sqrshl v8.4s, v8.4s, v4.4s
sqrshl v9.4s, v9.4s, v4.4s
sqrshl v10.4s, v10.4s, v4.4s
sqrshl v11.4s, v11.4s, v4.4s
sqrshl v12.4s, v12.4s, v4.4s
sqrshl v13.4s, v13.4s, v4.4s
sqrshl v14.4s, v14.4s, v4.4s
sqrshl v15.4s, v15.4s, v4.4s
sqrshl v16.4s, v16.4s, v4.4s
sqrshl v17.4s, v17.4s, v4.4s
sqrshl v18.4s, v18.4s, v4.4s
sqrshl v19.4s, v19.4s, v4.4s
sqrshl v20.4s, v20.4s, v4.4s
sqrshl v21.4s, v21.4s, v4.4s
sqrshl v22.4s, v22.4s, v4.4s
sqrshl v23.4s, v23.4s, v4.4s
sqrshl v24.4s, v24.4s, v4.4s
sqrshl v25.4s, v25.4s, v4.4s
sqrshl v26.4s, v26.4s, v4.4s
sqrshl v27.4s, v27.4s, v4.4s
sqrshl v28.4s, v28.4s, v4.4s
sqrshl v29.4s, v29.4s, v4.4s
sqrshl v30.4s, v30.4s, v4.4s
sqrshl v31.4s, v31.4s, v4.4s
dup v5.4s, w16
add v8.4s, v8.4s ,v5.4s
add v9.4s, v9.4s ,v5.4s
add v10.4s, v10.4s ,v5.4s
add v11.4s, v11.4s ,v5.4s
add v12.4s, v12.4s ,v5.4s
add v13.4s, v13.4s ,v5.4s
add v14.4s, v14.4s ,v5.4s
add v15.4s, v15.4s ,v5.4s
add v16.4s, v16.4s ,v5.4s
add v17.4s, v17.4s ,v5.4s
add v18.4s, v18.4s ,v5.4s
add v19.4s, v19.4s ,v5.4s
add v20.4s, v20.4s ,v5.4s
add v21.4s, v21.4s ,v5.4s
add v22.4s, v22.4s ,v5.4s
add v23.4s, v23.4s ,v5.4s
add v24.4s, v24.4s ,v5.4s
add v25.4s, v25.4s ,v5.4s
add v26.4s, v26.4s ,v5.4s
add v27.4s, v27.4s ,v5.4s
add v28.4s, v28.4s ,v5.4s
add v29.4s, v29.4s ,v5.4s
add v30.4s, v30.4s ,v5.4s
add v31.4s, v31.4s ,v5.4s
add v8.4s, v8.4s, v5.4s
add v9.4s, v9.4s, v5.4s
add v10.4s, v10.4s, v5.4s
add v11.4s, v11.4s, v5.4s
add v12.4s, v12.4s, v5.4s
add v13.4s, v13.4s, v5.4s
add v14.4s, v14.4s, v5.4s
add v15.4s, v15.4s, v5.4s
add v16.4s, v16.4s, v5.4s
add v17.4s, v17.4s, v5.4s
add v18.4s, v18.4s, v5.4s
add v19.4s, v19.4s, v5.4s
add v20.4s, v20.4s, v5.4s
add v21.4s, v21.4s, v5.4s
add v22.4s, v22.4s, v5.4s
add v23.4s, v23.4s, v5.4s
add v24.4s, v24.4s, v5.4s
add v25.4s, v25.4s, v5.4s
add v26.4s, v26.4s, v5.4s
add v27.4s, v27.4s, v5.4s
add v28.4s, v28.4s, v5.4s
add v29.4s, v29.4s, v5.4s
add v30.4s, v30.4s, v5.4s
add v31.4s, v31.4s, v5.4s
dup v0.4s, w8
smax v8.4s, v8.4s ,v0.4s
smax v9.4s, v9.4s ,v0.4s
smax v10.4s, v10.4s ,v0.4s
smax v11.4s, v11.4s ,v0.4s
smax v12.4s, v12.4s ,v0.4s
smax v13.4s, v13.4s ,v0.4s
smax v14.4s, v14.4s ,v0.4s
smax v15.4s, v15.4s ,v0.4s
smax v16.4s, v16.4s ,v0.4s
smax v17.4s, v17.4s ,v0.4s
smax v18.4s, v18.4s ,v0.4s
smax v19.4s, v19.4s ,v0.4s
smax v20.4s, v20.4s ,v0.4s
smax v21.4s, v21.4s ,v0.4s
smax v22.4s, v22.4s ,v0.4s
smax v23.4s, v23.4s ,v0.4s
smax v24.4s, v24.4s ,v0.4s
smax v25.4s, v25.4s ,v0.4s
smax v26.4s, v26.4s ,v0.4s
smax v27.4s, v27.4s ,v0.4s
smax v28.4s, v28.4s ,v0.4s
smax v29.4s, v29.4s ,v0.4s
smax v30.4s, v30.4s ,v0.4s
smax v31.4s, v31.4s ,v0.4s
smax v8.4s, v8.4s, v0.4s
smax v9.4s, v9.4s, v0.4s
smax v10.4s, v10.4s, v0.4s
smax v11.4s, v11.4s, v0.4s
smax v12.4s, v12.4s, v0.4s
smax v13.4s, v13.4s, v0.4s
smax v14.4s, v14.4s, v0.4s
smax v15.4s, v15.4s, v0.4s
smax v16.4s, v16.4s, v0.4s
smax v17.4s, v17.4s, v0.4s
smax v18.4s, v18.4s, v0.4s
smax v19.4s, v19.4s, v0.4s
smax v20.4s, v20.4s, v0.4s
smax v21.4s, v21.4s, v0.4s
smax v22.4s, v22.4s, v0.4s
smax v23.4s, v23.4s, v0.4s
smax v24.4s, v24.4s, v0.4s
smax v25.4s, v25.4s, v0.4s
smax v26.4s, v26.4s, v0.4s
smax v27.4s, v27.4s, v0.4s
smax v28.4s, v28.4s, v0.4s
smax v29.4s, v29.4s, v0.4s
smax v30.4s, v30.4s, v0.4s
smax v31.4s, v31.4s, v0.4s
dup v1.4s, w9
smin v8.4s, v8.4s ,v1.4s
smin v9.4s, v9.4s ,v1.4s
smin v10.4s, v10.4s ,v1.4s
smin v11.4s, v11.4s ,v1.4s
smin v12.4s, v12.4s ,v1.4s
smin v13.4s, v13.4s ,v1.4s
smin v14.4s, v14.4s ,v1.4s
smin v15.4s, v15.4s ,v1.4s
smin v16.4s, v16.4s ,v1.4s
smin v17.4s, v17.4s ,v1.4s
smin v18.4s, v18.4s ,v1.4s
smin v19.4s, v19.4s ,v1.4s
smin v20.4s, v20.4s ,v1.4s
smin v21.4s, v21.4s ,v1.4s
smin v22.4s, v22.4s ,v1.4s
smin v23.4s, v23.4s ,v1.4s
smin v24.4s, v24.4s ,v1.4s
smin v25.4s, v25.4s ,v1.4s
smin v26.4s, v26.4s ,v1.4s
smin v27.4s, v27.4s ,v1.4s
smin v28.4s, v28.4s ,v1.4s
smin v29.4s, v29.4s ,v1.4s
smin v30.4s, v30.4s ,v1.4s
smin v31.4s, v31.4s ,v1.4s
smin v8.4s, v8.4s, v1.4s
smin v9.4s, v9.4s, v1.4s
smin v10.4s, v10.4s, v1.4s
smin v11.4s, v11.4s, v1.4s
smin v12.4s, v12.4s, v1.4s
smin v13.4s, v13.4s, v1.4s
smin v14.4s, v14.4s, v1.4s
smin v15.4s, v15.4s, v1.4s
smin v16.4s, v16.4s, v1.4s
smin v17.4s, v17.4s, v1.4s
smin v18.4s, v18.4s, v1.4s
smin v19.4s, v19.4s, v1.4s
smin v20.4s, v20.4s, v1.4s
smin v21.4s, v21.4s, v1.4s
smin v22.4s, v22.4s, v1.4s
smin v23.4s, v23.4s, v1.4s
smin v24.4s, v24.4s, v1.4s
smin v25.4s, v25.4s, v1.4s
smin v26.4s, v26.4s, v1.4s
smin v27.4s, v27.4s, v1.4s
smin v28.4s, v28.4s, v1.4s
smin v29.4s, v29.4s, v1.4s
smin v30.4s, v30.4s, v1.4s
smin v31.4s, v31.4s, v1.4s
sqxtn v6.4h, v8.4s
sqxtn2 v6.8h, v9.4s
......
......@@ -29,11 +29,13 @@ void IndirectGemmInt8(int8_t *dst, int32_t *tmp_dst, const int8_t *src, const in
int32_t act_min = conv_param->conv_quant_arg_.out_act_min_[0];
int32_t act_max = conv_param->conv_quant_arg_.out_act_max_[0];
#ifdef __aarch64__
IndirectGemmInt8_4x4(dst, src, weight, bias, kernel_plane, ic4, output_channel, output_channel * sizeof(int8_t),
input_sum, act_min, act_max, out_zp, out_multiplier, shift_before, shift_after);
IndirectGemmInt8_4x4(dst, src, weight, bias, UP_DIV(kernel_plane, C4NUM), ic4, output_channel,
output_channel * sizeof(int8_t), input_sum, act_min, act_max, out_zp, out_multiplier,
shift_before, shift_after);
#elif defined(ENABLE_ARM32)
IndirectGemmInt8_2x4(dst, src, weight, bias, kernel_plane, ic4, output_channel, output_channel * sizeof(int8_t),
input_sum, act_min, act_max, out_zp, out_multiplier, shift_before, shift_after);
IndirectGemmInt8_2x4(dst, src, weight, bias, UP_DIV(kernel_plane, C4NUM), ic4, output_channel,
output_channel * sizeof(int8_t), input_sum, act_min, act_max, out_zp, out_multiplier,
shift_before, shift_after);
#else
int tile_num = conv_param->tile_num_;
int plane_c4 = UP_DIV(kernel_plane, C4NUM);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册