提交 1ea8ae50 编写于 作者: A Andy Polyakov

poly1305/asm/poly1305-*.pl: flip horizontal add and reduction.

Formally only 32-bit AVX2 code path needs this, but I choose to
harmonize all vector code paths.

RT#4346
Reviewed-by: NRichard Levitte <levitte@openssl.org>
上级 bdbd3aea
......@@ -1056,6 +1056,15 @@ poly1305_blocks_neon:
vmlal.u32 $D2,$H3#lo,$S4
.Lshort_tail:
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ horizontal addition
vadd.i64 $D3#lo,$D3#lo,$D3#hi
vadd.i64 $D0#lo,$D0#lo,$D0#hi
vadd.i64 $D4#lo,$D4#lo,$D4#hi
vadd.i64 $D1#lo,$D1#lo,$D1#hi
vadd.i64 $D2#lo,$D2#lo,$D2#hi
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ lazy reduction, but without narrowing
......@@ -1086,15 +1095,6 @@ poly1305_blocks_neon:
vadd.i64 $D1,$D1,$T0 @ h0 -> h1
vadd.i64 $D4,$D4,$T1 @ h3 -> h4
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ horizontal addition
vadd.i64 $D2#lo,$D2#lo,$D2#hi
vadd.i64 $D0#lo,$D0#lo,$D0#hi
vadd.i64 $D3#lo,$D3#lo,$D3#hi
vadd.i64 $D1#lo,$D1#lo,$D1#hi
vadd.i64 $D4#lo,$D4#lo,$D4#hi
cmp $len,#0
bne .Leven
......
......@@ -790,6 +790,19 @@ poly1305_blocks_neon:
umlal $ACC2,$IN01_4,${S3}
.Lshort_tail:
////////////////////////////////////////////////////////////////
// horizontal add
addp $ACC3,$ACC3,$ACC3
ldp d8,d9,[sp,#16] // meet ABI requirements
addp $ACC0,$ACC0,$ACC0
ldp d10,d11,[sp,#32]
addp $ACC4,$ACC4,$ACC4
ldp d12,d13,[sp,#48]
addp $ACC1,$ACC1,$ACC1
ldp d14,d15,[sp,#64]
addp $ACC2,$ACC2,$ACC2
////////////////////////////////////////////////////////////////
// lazy reduction, but without narrowing
......@@ -821,19 +834,6 @@ poly1305_blocks_neon:
add $ACC1,$ACC1,$T0.2d // h0 -> h1
add $ACC4,$ACC4,$T1.2d // h3 -> h4
////////////////////////////////////////////////////////////////
// horizontal add
addp $ACC2,$ACC2,$ACC2
ldp d8,d9,[sp,#16] // meet ABI requirements
addp $ACC0,$ACC0,$ACC0
ldp d10,d11,[sp,#32]
addp $ACC1,$ACC1,$ACC1
ldp d12,d13,[sp,#48]
addp $ACC3,$ACC3,$ACC3
ldp d14,d15,[sp,#64]
addp $ACC4,$ACC4,$ACC4
////////////////////////////////////////////////////////////////
// write the result, can be partially reduced
......
......@@ -536,6 +536,8 @@ my $base = shift; $base = "esp" if (!defined($base));
},"edx");
sub lazy_reduction {
my $extra = shift;
################################################################
# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
# and P. Schwabe
......@@ -543,6 +545,7 @@ sub lazy_reduction {
&movdqa ($T0,$D3);
&pand ($D3,$MASK);
&psrlq ($T0,26);
&$extra () if (defined($extra));
&paddq ($T0,$D4); # h3 -> h4
&movdqa ($T1,$D0);
&pand ($D0,$MASK);
......@@ -1091,21 +1094,21 @@ my $addr = shift;
&set_label("short_tail");
&lazy_reduction ();
################################################################
# horizontal addition
&pshufd ($T1,$D4,0b01001110);
&pshufd ($T0,$D3,0b01001110);
&paddq ($D4,$T1);
&paddq ($D3,$T0);
&pshufd ($T1,$D0,0b01001110);
&pshufd ($T0,$D1,0b01001110);
&paddd ($D0,$T1);
&paddq ($D0,$T1);
&paddq ($D1,$T0);
&pshufd ($T1,$D2,0b01001110);
&paddd ($D1,$T0);
&pshufd ($T0,$D3,0b01001110);
&paddd ($D2,$T1);
&pshufd ($T1,$D4,0b01001110);
&paddd ($D3,$T0);
&paddd ($D4,$T1);
#&paddq ($D2,$T1);
&lazy_reduction (sub { &paddq ($D2,$T1) });
&set_label("done");
&movd (&DWP(-16*3+4*0,"edi"),$D0); # store hash value
......@@ -1113,8 +1116,8 @@ my $addr = shift;
&movd (&DWP(-16*3+4*2,"edi"),$D2);
&movd (&DWP(-16*3+4*3,"edi"),$D3);
&movd (&DWP(-16*3+4*4,"edi"),$D4);
&set_label("nodata");
&mov ("esp","ebp");
&set_label("nodata");
&function_end("_poly1305_blocks_sse2");
&align (32);
......@@ -1435,7 +1438,7 @@ sub X { my $reg=shift; $reg=~s/^ymm/xmm/; $reg; }
&test ("eax","eax"); # is_base2_26?
&jz (&label("enter_blocks"));
&set_label("enter_avx2",16);
&set_label("enter_avx2");
&vzeroupper ();
&call (&label("pic_point"));
......@@ -1731,31 +1734,31 @@ sub vlazy_reduction {
&vpmuladd (sub { my $i=shift; &QWP(4+32*$i-128,"edx"); });
&vlazy_reduction();
################################################################
# horizontal addition
&vpsrldq ($T0,$D4,8);
&vpsrldq ($T1,$D3,8);
&vpaddq ($D4,$D4,$T0);
&vpsrldq ($T0,$D0,8);
&vpaddq ($D3,$D3,$T1);
&vpsrldq ($T1,$D1,8);
&vpaddq ($D0,$D0,$T0);
&vpsrldq ($T0,$D2,8);
&vpaddq ($D1,$D1,$T1);
&vpsrldq ($T1,$D3,8);
&vpermq ($T1,$D4,2); # keep folding
&vpaddq ($D2,$D2,$T0);
&vpsrldq ($T0,$D4,8);
&vpaddq ($D3,$D3,$T1);
&vpermq ($T1,$D0,2); # keep folding
&vpaddq ($D4,$D4,$T0);
&vpermq ($T0,$D3,2);
&vpaddq ($D4,$D4,$T1);
&vpermq ($T1,$D0,2);
&vpaddq ($D3,$D3,$T0);
&vpermq ($T0,$D1,2);
&vpaddq ($D0,$D0,$T1);
&vpermq ($T1,$D2,2);
&vpaddq ($D1,$D1,$T0);
&vpermq ($T0,$D3,2);
&vpaddq ($D2,$D2,$T1);
&vpermq ($T1,$D4,2);
&vpaddq ($D3,$D3,$T0);
&vpaddq ($D4,$D4,$T1);
&vlazy_reduction();
&cmp ("ecx",0);
&je (&label("done"));
......@@ -1772,14 +1775,14 @@ sub vlazy_reduction {
&jmp (&label("even"));
&set_label("done",16);
&vmovd (&DWP(-16*3+4*0,"edi"),"xmm0"); # store hash value
&vmovd (&DWP(-16*3+4*1,"edi"),"xmm1");
&vmovd (&DWP(-16*3+4*2,"edi"),"xmm2");
&vmovd (&DWP(-16*3+4*3,"edi"),"xmm3");
&vmovd (&DWP(-16*3+4*4,"edi"),"xmm4");
&vmovd (&DWP(-16*3+4*0,"edi"),&X($D0));# store hash value
&vmovd (&DWP(-16*3+4*1,"edi"),&X($D1));
&vmovd (&DWP(-16*3+4*2,"edi"),&X($D2));
&vmovd (&DWP(-16*3+4*3,"edi"),&X($D3));
&vmovd (&DWP(-16*3+4*4,"edi"),&X($D4));
&vzeroupper ();
&set_label("nodata");
&mov ("esp","ebp");
&set_label("nodata");
&function_end("_poly1305_blocks_avx2");
}
&set_label("const_sse2",64);
......
......@@ -1197,6 +1197,20 @@ $code.=<<___;
vpaddq $T3,$D0,$D0 # d0 += h1*s4
.Lshort_tail_avx:
################################################################
# horizontal addition
vpsrldq \$8,$D4,$T4
vpsrldq \$8,$D3,$T3
vpsrldq \$8,$D1,$T1
vpsrldq \$8,$D0,$T0
vpsrldq \$8,$D2,$T2
vpaddq $T3,$D3,$D3
vpaddq $T4,$D4,$D4
vpaddq $T0,$D0,$D0
vpaddq $T1,$D1,$D1
vpaddq $T2,$D2,$D2
################################################################
# lazy reduction
......@@ -1231,25 +1245,11 @@ $code.=<<___;
vpand $MASK,$D3,$D3
vpaddq $H3,$D4,$D4 # h3 -> h4
################################################################
# horizontal addition
vpsrldq \$8,$D2,$T2
vpsrldq \$8,$D0,$T0
vpsrldq \$8,$D1,$T1
vpsrldq \$8,$D3,$T3
vpsrldq \$8,$D4,$T4
vpaddq $T2,$D2,$H2
vpaddq $T0,$D0,$H0
vpaddq $T1,$D1,$H1
vpaddq $T3,$D3,$H3
vpaddq $T4,$D4,$H4
vmovd $H0,`4*0-48-64`($ctx) # save partially reduced
vmovd $H1,`4*1-48-64`($ctx)
vmovd $H2,`4*2-48-64`($ctx)
vmovd $H3,`4*3-48-64`($ctx)
vmovd $H4,`4*4-48-64`($ctx)
vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
vmovd $D1,`4*1-48-64`($ctx)
vmovd $D2,`4*2-48-64`($ctx)
vmovd $D3,`4*3-48-64`($ctx)
vmovd $D4,`4*4-48-64`($ctx)
___
$code.=<<___ if ($win64);
vmovdqa 0x50(%r11),%xmm6
......@@ -1887,6 +1887,31 @@ $code.=<<___;
vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
################################################################
# horizontal addition
vpsrldq \$8,$D1,$T1
vpsrldq \$8,$H2,$T2
vpsrldq \$8,$H3,$T3
vpsrldq \$8,$H4,$T4
vpsrldq \$8,$H0,$T0
vpaddq $T1,$D1,$D1
vpaddq $T2,$H2,$H2
vpaddq $T3,$H3,$H3
vpaddq $T4,$H4,$H4
vpaddq $T0,$H0,$H0
vpermq \$0x2,$H3,$T3
vpermq \$0x2,$H4,$T4
vpermq \$0x2,$H0,$T0
vpermq \$0x2,$D1,$T1
vpermq \$0x2,$H2,$T2
vpaddq $T3,$H3,$H3
vpaddq $T4,$H4,$H4
vpaddq $T0,$H0,$H0
vpaddq $T1,$D1,$D1
vpaddq $T2,$H2,$H2
################################################################
# lazy reduction
......@@ -1921,31 +1946,6 @@ $code.=<<___;
vpand $MASK,$H3,$H3
vpaddq $D3,$H4,$H4 # h3 -> h4
################################################################
# horizontal addition
vpsrldq \$8,$H2,$T2
vpsrldq \$8,$H0,$T0
vpsrldq \$8,$H1,$T1
vpsrldq \$8,$H3,$T3
vpsrldq \$8,$H4,$T4
vpaddq $T2,$H2,$H2
vpaddq $T0,$H0,$H0
vpaddq $T1,$H1,$H1
vpaddq $T3,$H3,$H3
vpaddq $T4,$H4,$H4
vpermq \$0x2,$H2,$T2
vpermq \$0x2,$H0,$T0
vpermq \$0x2,$H1,$T1
vpermq \$0x2,$H3,$T3
vpermq \$0x2,$H4,$T4
vpaddq $T2,$H2,$H2
vpaddq $T0,$H0,$H0
vpaddq $T1,$H1,$H1
vpaddq $T3,$H3,$H3
vpaddq $T4,$H4,$H4
vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
vmovd %x#$H1,`4*1-48-64`($ctx)
vmovd %x#$H2,`4*2-48-64`($ctx)
......
......@@ -667,6 +667,20 @@ static const struct poly1305_test poly1305_tests[] = {
"95d5c005503e510d8cd0aa072c4a4d06""6eabc52d11653df47fbf63ab198bcc26",
"f248312e578d9d58f8b7bb4d19105431"
},
/*
* AVX2 in poly1305-x86.pl failed this with 176+32 split
*/
{
"248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd"
"2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e8"
"74cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c"
"8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936a"
"ff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a37"
"09894e4eb0a4eedc4ae19468e66b81f2"
"71351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb",
"000102030405060708090a0b0c0d0e0f""00000000000000000000000000000000",
"bc939bc5281480fa99c6d68c258ec42f"
},
/*
* test vectors from Google
*/
......@@ -844,6 +858,23 @@ int main()
printf("\n");
return 1;
}
for (half = 16; half < inlen; half += 16) {
Poly1305_Init(&poly1305, key);
Poly1305_Update(&poly1305, in, half);
Poly1305_Update(&poly1305, in+half, inlen-half);
Poly1305_Final(&poly1305, out);
if (memcmp(out, expected, sizeof(expected)) != 0) {
printf("Poly1305 test #%d/%d failed.\n", i, half);
printf("got: ");
hexdump(out, sizeof(out));
printf("\nexpected: ");
hexdump(expected, sizeof(expected));
printf("\n");
return 1;
}
}
}
free(in);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册