Commit 73e8a5c8 authored by Andy Polyakov

poly1305/asm/poly1305-x86_64.pl: switch to vpermd in table expansion.

Effectively it's a minor size optimization, 5-6% per affected subroutine.
Reviewed-by: Rich Salz <rsalz@openssl.org>
Parent c1e1fc50
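The change folds the two-shuffle table expansion (vpermq \$0x15, turning the stored 00003412 dword layout into 12343434, then vpshufd \$0xc8, turning that into 14243444) into a single vpermd whose dword indices come from the new .Lpermd_avx2 constant, .long 2,2,2,3,2,0,2,1, loaded once into $T0 on the AVX2 path and $T2 on the AVX-512 path. The C sketch below is illustrative only and not part of the commit: it uses AVX2 intrinsics to check that the one-instruction and two-instruction expansions yield the same dword layout. The sample limb values and the file name permd_check.c are placeholders; only the index vector and shuffle immediates are taken from the diff.

    /* permd_check.c (hypothetical): verify that one vpermd with the
     * .Lpermd_avx2 indices matches the old vpermq+vpshufd pair.
     * Build with: cc -O2 -mavx2 permd_check.c */
    #include <immintrin.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        /* A 128-bit table entry zero-extended to 256 bits.  Dwords are
         * numbered by the power of r they hold, so low..high = 2,1,4,3,0,...,
         * i.e. the "00003412" layout (read high-to-low) of the perlasm
         * comments.  The values are placeholders, not real limbs. */
        __m256i src = _mm256_setr_epi32(2, 1, 4, 3, 0, 0, 0, 0);

        /* old path: vpermq $0x15 (00003412 -> 12343434),
         * then vpshufd $0xc8 (12343434 -> 14243444) */
        __m256i two_shuffles = _mm256_shuffle_epi32(
                                   _mm256_permute4x64_epi64(src, 0x15), 0xc8);

        /* new path: a single vpermd with indices from .Lpermd_avx2 */
        __m256i idx        = _mm256_setr_epi32(2, 2, 2, 3, 2, 0, 2, 1);
        __m256i one_vpermd = _mm256_permutevar8x32_epi32(src, idx);

        int a[8], b[8];
        _mm256_storeu_si256((__m256i *)a, two_shuffles);
        _mm256_storeu_si256((__m256i *)b, one_vpermd);
        printf("layouts match: %s\n", memcmp(a, b, sizeof a) ? "no" : "yes");
        return 0;
    }

On AVX2-capable hardware this prints "layouts match: yes": both sequences end with the 14243444 layout, but each table entry is now expanded with one shuffle instead of two, which appears to be the instruction-count reduction behind the quoted 5-6% size saving.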
@@ -1631,8 +1631,9 @@ $code.=<<___ if ($win64);
 .Ldo_avx2_body:
 ___
 $code.=<<___;
-	lea	48+64($ctx),$ctx	# size optimization
 	lea	.Lconst(%rip),%rcx
+	lea	48+64($ctx),$ctx	# size optimization
+	vmovdqa	96(%rcx),$T0		# .Lpermd_avx2

 	# expand and copy pre-calculated table to stack
 	vmovdqu	`16*0-64`($ctx),%x#$T2
@@ -1642,36 +1643,28 @@ $code.=<<___;
 	vmovdqu	`16*3-64`($ctx),%x#$D0
 	vmovdqu	`16*4-64`($ctx),%x#$D1
 	vmovdqu	`16*5-64`($ctx),%x#$D2
+	lea	0x90(%rsp),%rax		# size optimization
 	vmovdqu	`16*6-64`($ctx),%x#$D3
-	vpermq	\$0x15,$T2,$T2		# 00003412 -> 12343434
+	vpermd	$T2,$T0,$T2		# 00003412 -> 14243444
 	vmovdqu	`16*7-64`($ctx),%x#$D4
-	vpermq	\$0x15,$T3,$T3
-	vpshufd	\$0xc8,$T2,$T2		# 12343434 -> 14243444
+	vpermd	$T3,$T0,$T3
 	vmovdqu	`16*8-64`($ctx),%x#$MASK
-	vpermq	\$0x15,$T4,$T4
-	vpshufd	\$0xc8,$T3,$T3
+	vpermd	$T4,$T0,$T4
 	vmovdqa	$T2,0x00(%rsp)
-	vpermq	\$0x15,$D0,$D0
-	vpshufd	\$0xc8,$T4,$T4
-	vmovdqa	$T3,0x20(%rsp)
-	vpermq	\$0x15,$D1,$D1
-	vpshufd	\$0xc8,$D0,$D0
-	vmovdqa	$T4,0x40(%rsp)
-	vpermq	\$0x15,$D2,$D2
-	vpshufd	\$0xc8,$D1,$D1
-	vmovdqa	$D0,0x60(%rsp)
-	vpermq	\$0x15,$D3,$D3
-	vpshufd	\$0xc8,$D2,$D2
-	vmovdqa	$D1,0x80(%rsp)
-	vpermq	\$0x15,$D4,$D4
-	vpshufd	\$0xc8,$D3,$D3
-	vmovdqa	$D2,0xa0(%rsp)
-	vpermq	\$0x15,$MASK,$MASK
-	vpshufd	\$0xc8,$D4,$D4
-	vmovdqa	$D3,0xc0(%rsp)
-	vpshufd	\$0xc8,$MASK,$MASK
-	vmovdqa	$D4,0xe0(%rsp)
-	vmovdqa	$MASK,0x100(%rsp)
+	vpermd	$D0,$T0,$D0
+	vmovdqa	$T3,0x20-0x90(%rax)
+	vpermd	$D1,$T0,$D1
+	vmovdqa	$T4,0x40-0x90(%rax)
+	vpermd	$D2,$T0,$D2
+	vmovdqa	$D0,0x60-0x90(%rax)
+	vpermd	$D3,$T0,$D3
+	vmovdqa	$D1,0x80-0x90(%rax)
+	vpermd	$D4,$T0,$D4
+	vmovdqa	$D2,0xa0-0x90(%rax)
+	vpermd	$MASK,$T0,$MASK
+	vmovdqa	$D3,0xc0-0x90(%rax)
+	vmovdqa	$D4,0xe0-0x90(%rax)
+	vmovdqa	$MASK,0x100-0x90(%rax)
 	vmovdqa	64(%rcx),$MASK		# .Lmask26

 	################################################################
@@ -1698,7 +1691,6 @@ $code.=<<___;
 	vpand	$MASK,$T3,$T3		# 3
 	vpor	32(%rcx),$T4,$T4	# padbit, yes, always

-	lea	0x90(%rsp),%rax		# size optimization
 	vpaddq	$H2,$T2,$H2		# accumulate input
 	sub	\$64,$len
 	jz	.Ltail_avx2
@@ -2055,8 +2047,9 @@ $code.=<<___ if ($win64);
 .Ldo_avx512_body:
 ___
 $code.=<<___;
-	lea	48+64($ctx),$ctx	# size optimization
 	lea	.Lconst(%rip),%rcx
+	lea	48+64($ctx),$ctx	# size optimization
+	vmovdqa	96(%rcx),$T2		# .Lpermd_avx2

 	# expand pre-calculated table
 	vmovdqu32	`16*0-64`($ctx),%x#$R0
@@ -2069,33 +2062,23 @@ $code.=<<___;
 	vmovdqu32	`16*6-64`($ctx),%x#$S3
 	vmovdqu32	`16*7-64`($ctx),%x#$R4
 	vmovdqu32	`16*8-64`($ctx),%x#$S4
-	vpermq	\$0x15,$R0,$R0		# 00003412 -> 12343434
+	vpermd	$R0,$T2,$R0		# 00003412 -> 14243444
 	vmovdqa64	64(%rcx),$MASK	# .Lmask26
-	vpermq	\$0x15,$R1,$R1
-	vmovdqa32	128(%rcx),$GATHER	# .Lgather
-	vpermq	\$0x15,$S1,$S1
-	vpshufd	\$0xc8,$R0,$R0		# 12343434 -> 14243444
-	vpermq	\$0x15,$R2,$R2
-	vpshufd	\$0xc8,$R1,$R1
+	vpermd	$R1,$T2,$R1
+	vpermd	$S1,$T2,$S1
+	vpermd	$R2,$T2,$R2
 	vmovdqa32	$R0,0x00(%rsp)	# save in case $len%128 != 0
 	vpsrlq	\$32,$R0,$T0		# 14243444 -> 01020304
-	vpermq	\$0x15,$S2,$S2
-	vpshufd	\$0xc8,$S1,$S1
+	vpermd	$S2,$T2,$S2
 	vmovdqa32	$R1,0x20(%rsp)
 	vpsrlq	\$32,$R1,$T1
-	vpermq	\$0x15,$R3,$R3
-	vpshufd	\$0xc8,$R2,$R2
+	vpermd	$R3,$T2,$R3
 	vmovdqa32	$S1,0x40(%rsp)
-	vpermq	\$0x15,$S3,$S3
-	vpshufd	\$0xc8,$S2,$S2
-	vpermq	\$0x15,$R4,$R4
-	vpshufd	\$0xc8,$R3,$R3
+	vpermd	$S3,$T2,$S3
+	vpermd	$R4,$T2,$R4
 	vmovdqa32	$R2,0x60(%rsp)
-	vpermq	\$0x15,$S4,$S4
-	vpshufd	\$0xc8,$S3,$S3
+	vpermd	$S4,$T2,$S4
 	vmovdqa32	$S2,0x80(%rsp)
-	vpshufd	\$0xc8,$R4,$R4
-	vpshufd	\$0xc8,$S4,$S4
 	vmovdqa32	$R3,0xa0(%rsp)
 	vmovdqa32	$S3,0xc0(%rsp)
 	vmovdqa32	$R4,0xe0(%rsp)
@@ -2275,14 +2258,16 @@ $code.=<<___;
 	vpandq	$MASK,$T2,$T2		# 2
 	vpandq	$MASK,$T0,$T0		# 0
 	vpandq	$MASK,$T1,$T1		# 1
-	#vpandq	$MASK,$T3,$T3		# 3
+	vpandq	$MASK,$T3,$T3		# 3
 	#vporq	$PADBIT,$T4,$T4		# padbit, yes, always

 	vpaddq	$H2,$T2,$H2		# accumulate input
 	mov	\$0x0f,%eax
 	sub	\$192,$len
 	jbe	.Ltail_avx512
+	jmp	.Loop_avx512

+.align	32
 .Loop_avx512:
 	################################################################
 	# ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
@@ -2316,7 +2301,6 @@ $code.=<<___;
 	vpaddq	$H0,$T0,$H0
 	vpmuludq	$H2,$R2,$D4	# d4 = h2*r2
 	vpmuludq	$H2,$S3,$D0	# d0 = h2*s3
-	vpandq	$MASK,$T3,$T3		# 3, module-scheduled
 	vpmuludq	$H2,$S4,$D1	# d1 = h2*s4
 	vporq	$PADBIT,$T4,$T4		# padbit, yes, always
 	vpmuludq	$H2,$R0,$D2	# d2 = h2*r0
@@ -2425,7 +2409,7 @@ $code.=<<___;
 	vpandq	$MASK,$T0,$T0		# 0
 	vpandq	$MASK,$T1,$T1		# 1
-	#vpandq	$MASK,$T3,$T3		# 3
+	vpandq	$MASK,$T3,$T3		# 3
 	#vporq	$PADBIT,$T4,$T4		# padbit, yes, always

 	sub	\$128,$len
@@ -2459,7 +2443,6 @@ $code.=<<___;
 	vpmuludq	$H2,$S3,$D0	# d0 = h2*s3
 	vpmuludq	$H2,$S4,$D1	# d1 = h2*s4
 	vpmuludq	$H2,$R0,$D2	# d2 = h2*r0
-	vpandq	$MASK,$T3,$T3		# 3, module-scheduled
 	vporq	$PADBIT,$T4,$T4		# padbit, yes, always
 	vpaddq	$H1,$T1,$H1		# accumulate input
 	vpaddq	$H3,$T3,$H3
@@ -2647,10 +2630,8 @@ $code.=<<___;
 .long	`1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
 .Lmask26:
 .long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
-.Lfive:
-.long	5,0,5,0,5,0,5,0
-.Lgather:
-.long	0,8, 32,40, 64,72, 96,104
+.Lpermd_avx2:
+.long	2,2,2,3,2,0,2,1
 ___
 }