Commit 053fa39a authored by Richard Levitte

Conversion to UTF-8 where needed

This leaves behind files with names ending with '.iso-8859-1'.  These
should be safe to remove.  If something went wrong when re-encoding,
there will be some files with names ending with '.utf8' left behind.
Reviewed-by: Rich Salz <rsalz@openssl.org>
Parent f608b406
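For illustration only, a re-encoding pass of the kind the commit message describes might look roughly like the Perl sketch below. This is a hypothetical helper, not the script actually used for this commit; the function name and error handling are assumptions.

# Hypothetical sketch of a re-encoding pass like the one described above;
# not the script actually used for this commit.
use strict;
use warnings;
use Encode qw(decode encode);
use File::Copy qw(copy);

sub reencode_to_utf8 {
    my ($path) = @_;

    # Keep the original bytes around as "<name>.iso-8859-1".
    copy($path, "$path.iso-8859-1") or die "copy $path: $!";

    open my $in, '<:raw', "$path.iso-8859-1" or die "open: $!";
    local $/;                       # slurp the whole file
    my $bytes = <$in>;
    close $in;

    # Write the transcoded content to a temporary "<name>.utf8" file first.
    open my $out, '>:raw', "$path.utf8" or die "open: $!";
    print $out encode('UTF-8', decode('ISO-8859-1', $bytes));
    close $out or die "close: $!";

    # Replace the original only once the transcode succeeded; if the step
    # above fails part-way, a stray ".utf8" file is left behind, matching
    # what the commit message describes.
    rename "$path.utf8", $path or die "rename: $!";
}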
This diff is collapsed.
......@@ -45,7 +45,7 @@
# the undertaken effort was that it appeared that in tight IA-32
# register window little-endian flavor could achieve slightly higher
# Instruction Level Parallelism, and it indeed resulted in up to 15%
# better performance on most recent µ-archs...
# better performance on most recent µ-archs...
#
# Third version adds AES_cbc_encrypt implementation, which resulted in
# up to 40% performance improvement of CBC benchmark results. 40% was
......@@ -224,7 +224,7 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
$speed_limit=512; # chunks smaller than $speed_limit are
# processed with compact routine in CBC mode
$small_footprint=1; # $small_footprint=1 code is ~5% slower [on
# recent µ-archs], but ~5 times smaller!
# recent µ-archs], but ~5 times smaller!
# I favor compact code to minimize cache
# contention and in hope to "collect" 5% back
# in real-life applications...
......@@ -565,7 +565,7 @@ sub enctransform()
# Performance is not actually extraordinary in comparison to pure
# x86 code. In particular encrypt performance is virtually the same.
# Decrypt performance on the other hand is 15-20% better on newer
# µ-archs [but we're thankful for *any* improvement here], and ~50%
# µ-archs [but we're thankful for *any* improvement here], and ~50%
# better on PIII:-) And additionally on the pros side this code
# eliminates redundant references to stack and thus relieves/
# minimizes the pressure on the memory bus.
......
......@@ -891,7 +891,7 @@ ret?: ; B0 holds rounds or zero
MVC B0,ILC
|| SUB B0,1,B0
GMPY4 $K[0],A24,$Kx9[0] ; 0x09
GMPY4 $K[0],A24,$Kx9[0] ; ·0x09
|| GMPY4 $K[1],A24,$Kx9[1]
|| MVK 0x00000D0D,A25
|| MVK 0x00000E0E,B25
......@@ -900,14 +900,14 @@ ret?: ; B0 holds rounds or zero
|| MVKH 0x0D0D0000,A25
|| MVKH 0x0E0E0000,B25
GMPY4 $K[0],B24,$KxB[0] ; 0x0B
GMPY4 $K[0],B24,$KxB[0] ; ·0x0B
|| GMPY4 $K[1],B24,$KxB[1]
GMPY4 $K[2],B24,$KxB[2]
|| GMPY4 $K[3],B24,$KxB[3]
SPLOOP 11 ; InvMixColumns
;;====================================================================
GMPY4 $K[0],A25,$KxD[0] ; 0x0D
GMPY4 $K[0],A25,$KxD[0] ; ·0x0D
|| GMPY4 $K[1],A25,$KxD[1]
|| SWAP2 $Kx9[0],$Kx9[0] ; rotate by 16
|| SWAP2 $Kx9[1],$Kx9[1]
......@@ -924,7 +924,7 @@ ret?: ; B0 holds rounds or zero
|| [B0] LDW *${KPA}[6],$K[2]
|| [B0] LDW *${KPB}[7],$K[3]
GMPY4 $s[0],B25,$KxE[0] ; 0x0E
GMPY4 $s[0],B25,$KxE[0] ; ·0x0E
|| GMPY4 $s[1],B25,$KxE[1]
|| XOR $Kx9[0],$KxB[0],$KxB[0]
|| XOR $Kx9[1],$KxB[1],$KxB[1]
......@@ -944,7 +944,7 @@ ret?: ; B0 holds rounds or zero
XOR $KxE[0],$KxD[0],$KxE[0]
|| XOR $KxE[1],$KxD[1],$KxE[1]
|| [B0] GMPY4 $K[0],A24,$Kx9[0] ; 0x09
|| [B0] GMPY4 $K[0],A24,$Kx9[0] ; ·0x09
|| [B0] GMPY4 $K[1],A24,$Kx9[1]
|| ADDAW $KPA,4,$KPA
XOR $KxE[2],$KxD[2],$KxE[2]
......@@ -955,7 +955,7 @@ ret?: ; B0 holds rounds or zero
XOR $KxB[0],$KxE[0],$KxE[0]
|| XOR $KxB[1],$KxE[1],$KxE[1]
|| [B0] GMPY4 $K[0],B24,$KxB[0] ; 0x0B
|| [B0] GMPY4 $K[0],B24,$KxB[0] ; ·0x0B
|| [B0] GMPY4 $K[1],B24,$KxB[1]
XOR $KxB[2],$KxE[2],$KxE[2]
|| XOR $KxB[3],$KxE[3],$KxE[3]
......
......@@ -27,7 +27,7 @@
# referred below, which improves ECDH and ECDSA verify benchmarks
# by 18-40%.
#
# Cmara, D.; Gouva, C. P. L.; Lpez, J. & Dahab, R.: Fast Software
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
......@@ -148,7 +148,7 @@ ___
################
# void bn_GF2m_mul_2x2(BN_ULONG *r,
# BN_ULONG a1,BN_ULONG a0,
# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0b1b0
# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0
{
$code.=<<___;
.global bn_GF2m_mul_2x2
......@@ -171,7 +171,7 @@ $code.=<<___;
mov $mask,#7<<2
sub sp,sp,#32 @ allocate tab[8]
bl mul_1x1_ialu @ a1b1
bl mul_1x1_ialu @ a1·b1
str $lo,[$ret,#8]
str $hi,[$ret,#12]
......@@ -181,13 +181,13 @@ $code.=<<___;
eor r2,r2,$a
eor $b,$b,r3
eor $a,$a,r2
bl mul_1x1_ialu @ a0b0
bl mul_1x1_ialu @ a0·b0
str $lo,[$ret]
str $hi,[$ret,#4]
eor $a,$a,r2
eor $b,$b,r3
bl mul_1x1_ialu @ (a1+a0)(b1+b0)
bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
___
@r=map("r$_",(6..9));
$code.=<<___;
......
......@@ -120,26 +120,26 @@ _bn_GF2m_mul_2x2:
.asmfunc
MVK 0xFF,$xFF
___
&mul_1x1_upper($a0,$b0); # a0b0
&mul_1x1_upper($a0,$b0); # a0·b0
$code.=<<___;
|| MV $b1,$B
MV $a1,$A
___
&mul_1x1_merged("A28","B28",$A,$B); # a0b0/a1b1
&mul_1x1_merged("A28","B28",$A,$B); # a0·b0/a1·b1
$code.=<<___;
|| XOR $b0,$b1,$B
XOR $a0,$a1,$A
___
&mul_1x1_merged("A31","B31",$A,$B); # a1b1/(a0+a1)(b0+b1)
&mul_1x1_merged("A31","B31",$A,$B); # a1·b1/(a0+a1)·(b0+b1)
$code.=<<___;
XOR A28,A31,A29
|| XOR B28,B31,B29 ; a0b0+a1b1
|| XOR B28,B31,B29 ; a0·b0+a1·b1
___
&mul_1x1_lower("A30","B30"); # (a0+a1)(b0+b1)
&mul_1x1_lower("A30","B30"); # (a0+a1)·(b0+b1)
$code.=<<___;
|| BNOP B3
XOR A29,A30,A30
|| XOR B29,B30,B30 ; (a0+a1)(b0+b1)-a0b0-a1b1
|| XOR B29,B30,B30 ; (a0+a1)·(b0+b1)-a0·b0-a1·b1
XOR B28,A30,A30
|| STW A28,*${rp}[0]
XOR B30,A31,A31
......
......@@ -568,7 +568,7 @@ bn_sqr_comba8:
// I've estimated this routine to run in ~120 ticks, but in reality
// (i.e. according to ar.itc) it takes ~160 ticks. Are those extra
// cycles consumed for instructions fetch? Or did I misinterpret some
// clause in Itanium µ-architecture manual? Comments are welcomed and
// clause in Itanium µ-architecture manual? Comments are welcomed and
// highly appreciated.
//
// On Itanium 2 it takes ~190 ticks. This is because of stalls on
......
......@@ -172,19 +172,19 @@ ___
if ($SIZE_T==8) {
my @r=map("%r$_",(6..9));
$code.=<<___;
bras $ra,_mul_1x1 # a1b1
bras $ra,_mul_1x1 # a1·b1
stmg $lo,$hi,16($rp)
lg $a,`$stdframe+128+4*$SIZE_T`($sp)
lg $b,`$stdframe+128+6*$SIZE_T`($sp)
bras $ra,_mul_1x1 # a0b0
bras $ra,_mul_1x1 # a0·b0
stmg $lo,$hi,0($rp)
lg $a,`$stdframe+128+3*$SIZE_T`($sp)
lg $b,`$stdframe+128+5*$SIZE_T`($sp)
xg $a,`$stdframe+128+4*$SIZE_T`($sp)
xg $b,`$stdframe+128+6*$SIZE_T`($sp)
bras $ra,_mul_1x1 # (a0+a1)(b0+b1)
bras $ra,_mul_1x1 # (a0+a1)·(b0+b1)
lmg @r[0],@r[3],0($rp)
xgr $lo,$hi
......
......@@ -14,7 +14,7 @@
# the time being... Except that it has three code paths: pure integer
# code suitable for any x86 CPU, MMX code suitable for PIII and later
# and PCLMULQDQ suitable for Westmere and later. Improvement varies
# from one benchmark and -arch to another. Below are interval values
# from one benchmark and µ-arch to another. Below are interval values
# for 163- and 571-bit ECDH benchmarks relative to compiler-generated
# code:
#
......@@ -226,22 +226,22 @@ if ($sse2) {
&push ("edi");
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
&call ("_mul_1x1_mmx"); # a1b1
&call ("_mul_1x1_mmx"); # a1·b1
&movq ("mm7",$R);
&mov ($a,&wparam(2));
&mov ($b,&wparam(4));
&call ("_mul_1x1_mmx"); # a0b0
&call ("_mul_1x1_mmx"); # a0·b0
&movq ("mm6",$R);
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
&xor ($a,&wparam(2));
&xor ($b,&wparam(4));
&call ("_mul_1x1_mmx"); # (a0+a1)(b0+b1)
&call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1)
&pxor ($R,"mm7");
&mov ($a,&wparam(0));
&pxor ($R,"mm6"); # (a0+a1)(b0+b1)-a1b1-a0b0
&pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0
&movq ($A,$R);
&psllq ($R,32);
......@@ -266,13 +266,13 @@ if ($sse2) {
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
&call ("_mul_1x1_ialu"); # a1b1
&call ("_mul_1x1_ialu"); # a1·b1
&mov (&DWP(8,"esp"),$lo);
&mov (&DWP(12,"esp"),$hi);
&mov ($a,&wparam(2));
&mov ($b,&wparam(4));
&call ("_mul_1x1_ialu"); # a0b0
&call ("_mul_1x1_ialu"); # a0·b0
&mov (&DWP(0,"esp"),$lo);
&mov (&DWP(4,"esp"),$hi);
......@@ -280,7 +280,7 @@ if ($sse2) {
&mov ($b,&wparam(3));
&xor ($a,&wparam(2));
&xor ($b,&wparam(4));
&call ("_mul_1x1_ialu"); # (a0+a1)(b0+b1)
&call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1)
&mov ("ebp",&wparam(0));
@r=("ebx","ecx","edi","esi");
......
......@@ -65,7 +65,7 @@
# undef mul_add
/*-
* "m"(a), "+m"(r) is the way to favor DirectPath -code;
* "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
* "g"(0) let the compiler to decide where does it
* want to keep the value of zero;
*/
......
......@@ -13,7 +13,7 @@
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
# the time being... Except that it has two code paths: code suitable
# for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and
# later. Improvement varies from one benchmark and -arch to another.
# later. Improvement varies from one benchmark and µ-arch to another.
# Vanilla code path is at most 20% faster than compiler-generated code
# [not very impressive], while PCLMULQDQ - whole 85%-160% better on
# 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in mind that
......@@ -184,13 +184,13 @@ ___
$code.=<<___;
movdqa %xmm0,%xmm4
movdqa %xmm1,%xmm5
pclmulqdq \$0,%xmm1,%xmm0 # a1b1
pclmulqdq \$0,%xmm1,%xmm0 # a1·b1
pxor %xmm2,%xmm4
pxor %xmm3,%xmm5
pclmulqdq \$0,%xmm3,%xmm2 # a0b0
pclmulqdq \$0,%xmm5,%xmm4 # (a0+a1)(b0+b1)
pclmulqdq \$0,%xmm3,%xmm2 # a0·b0
pclmulqdq \$0,%xmm5,%xmm4 # (a0+a1)·(b0+b1)
xorps %xmm0,%xmm4
xorps %xmm2,%xmm4 # (a0+a1)(b0+b1)-a0b0-a1b1
xorps %xmm2,%xmm4 # (a0+a1)·(b0+b1)-a0·b0-a1·b1
movdqa %xmm4,%xmm5
pslldq \$8,%xmm4
psrldq \$8,%xmm5
......@@ -225,13 +225,13 @@ $code.=<<___;
mov \$0xf,$mask
mov $a1,$a
mov $b1,$b
call _mul_1x1 # a1b1
call _mul_1x1 # a1·b1
mov $lo,16(%rsp)
mov $hi,24(%rsp)
mov 48(%rsp),$a
mov 64(%rsp),$b
call _mul_1x1 # a0b0
call _mul_1x1 # a0·b0
mov $lo,0(%rsp)
mov $hi,8(%rsp)
......@@ -239,7 +239,7 @@ $code.=<<___;
mov 56(%rsp),$b
xor 48(%rsp),$a
xor 64(%rsp),$b
call _mul_1x1 # (a0+a1)(b0+b1)
call _mul_1x1 # (a0+a1)·(b0+b1)
___
@r=("%rbx","%rcx","%rdi","%rsi");
$code.=<<___;
......
......@@ -45,7 +45,7 @@
# processes one byte in 8.45 cycles, A9 - in 10.2, A15 - in 7.63,
# Snapdragon S4 - in 9.33.
#
# Cmara, D.; Gouva, C. P. L.; Lpez, J. & Dahab, R.: Fast Software
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
......@@ -449,12 +449,12 @@ gcm_ghash_neon:
veor $IN,$Xl @ inp^=Xi
.Lgmult_neon:
___
&clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.loXi.lo
&clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo
$code.=<<___;
veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing
___
&clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)(Xi.lo+Xi.hi)
&clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hiXi.hi
&clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi)
&clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi
$code.=<<___;
veor $Xm,$Xm,$Xl @ Karatsuba post-processing
veor $Xm,$Xm,$Xh
......
......@@ -153,7 +153,7 @@ ___
# 8/2 S1 L1x S2 | ....
#####... ................|............
$code.=<<___;
XORMPY $H0,$xia,$H0x ; 0 ; H(Xi[i]<<1)
XORMPY $H0,$xia,$H0x ; 0 ; H·(Xi[i]<<1)
|| XORMPY $H01u,$xib,$H01y
|| [A0] LDBU *--${xip},$x0
XORMPY $H1,$xia,$H1x ; 1
......@@ -162,7 +162,7 @@ $code.=<<___;
XORMPY $H3,$xia,$H3x ; 3
|| XORMPY $H3u,$xib,$H3y
||[!A0] MVK.D 15,A0 ; *--${xip} counter
XOR.L $H0x,$Z0,$Z0 ; 4 ; Z^=H(Xi[i]<<1)
XOR.L $H0x,$Z0,$Z0 ; 4 ; Z^=H·(Xi[i]<<1)
|| [A0] SUB.S A0,1,A0
XOR.L $H1x,$Z1,$Z1 ; 5
|| AND.D $H01y,$FF000000,$H0z
......
......@@ -379,7 +379,7 @@ gcm_init_vis3:
or $V,%lo(0xA0406080),$V
or %l0,%lo(0x20C0E000),%l0
sllx $V,32,$V
or %l0,$V,$V ! (0xE0i)&0xff=0xA040608020C0E000
or %l0,$V,$V ! (0xE0·i)&0xff=0xA040608020C0E000
stx $V,[%i0+16]
ret
......@@ -399,7 +399,7 @@ gcm_gmult_vis3:
mov 0xE1,%l7
sllx %l7,57,$xE1 ! 57 is not a typo
ldx [$Htable+16],$V ! (0xE0i)&0xff=0xA040608020C0E000
ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000
xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing
xmulx $Xlo,$Hlo,$C0
......@@ -411,9 +411,9 @@ gcm_gmult_vis3:
xmulx $Xhi,$Hhi,$Xhi
sll $C0,3,$sqr
srlx $V,$sqr,$sqr ! 0xE0 [implicit &(7<<3)]
srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)]
xor $C0,$sqr,$sqr
sllx $sqr,57,$sqr ! ($C00xE1)<<1<<56 [implicit &0x7f]
sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
xor $C0,$C1,$C1 ! Karatsuba post-processing
xor $Xlo,$C2,$C2
......@@ -423,7 +423,7 @@ gcm_gmult_vis3:
xor $Xhi,$C2,$C2
xor $Xhi,$C1,$C1
xmulxhi $C0,$xE1,$Xlo ! 0xE1<<1<<56
xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56
xor $C0,$C2,$C2
xmulx $C1,$xE1,$C0
xor $C1,$C3,$C3
......@@ -453,7 +453,7 @@ gcm_ghash_vis3:
mov 0xE1,%l7
sllx %l7,57,$xE1 ! 57 is not a typo
ldx [$Htable+16],$V ! (0xE0i)&0xff=0xA040608020C0E000
ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000
and $inp,7,$shl
andn $inp,7,$inp
......@@ -490,9 +490,9 @@ gcm_ghash_vis3:
xmulx $Xhi,$Hhi,$Xhi
sll $C0,3,$sqr
srlx $V,$sqr,$sqr ! 0xE0 [implicit &(7<<3)]
srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)]
xor $C0,$sqr,$sqr
sllx $sqr,57,$sqr ! ($C00xE1)<<1<<56 [implicit &0x7f]
sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
xor $C0,$C1,$C1 ! Karatsuba post-processing
xor $Xlo,$C2,$C2
......@@ -502,7 +502,7 @@ gcm_ghash_vis3:
xor $Xhi,$C2,$C2
xor $Xhi,$C1,$C1
xmulxhi $C0,$xE1,$Xlo ! 0xE1<<1<<56
xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56
xor $C0,$C2,$C2
xmulx $C1,$xE1,$C0
xor $C1,$C3,$C3
......
......@@ -358,7 +358,7 @@ $S=12; # shift factor for rem_4bit
# effective address calculation and finally merge of value to Z.hi.
# Reference to rem_4bit is scheduled so late that I had to >>4
# rem_4bit elements. This resulted in 20-45% improvement
# on contemporary µ-archs.
# on contemporary µ-archs.
{
my $cnt;
my $rem_4bit = "eax";
......
......@@ -576,15 +576,15 @@ $code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
# experimental alternative. special thing about it is that there is
# no dependency between the two multiplications...
mov \$`0xE1<<1`,%eax
mov \$0xA040608020C0E000,%r10 # ((7..0)0xE0)&0xff
mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
mov \$0x07,%r11d
movq %rax,$T1
movq %r10,$T2
movq %r11,$T3 # borrow $T3
pand $Xi,$T3
pshufb $T3,$T2 # ($Xi&7)0xE0
pshufb $T3,$T2 # ($Xi&7)·0xE0
movq %rax,$T3
pclmulqdq \$0x00,$Xi,$T1 # (0xE1<<1)
pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1)
pxor $Xi,$T2
pslldq \$15,$T2
paddd $T2,$T2 # <<(64+56+1)
......@@ -657,7 +657,7 @@ $code.=<<___;
je .Lskip4x
sub \$0x30,$len
mov \$0xA040608020C0E000,%rax # ((7..0)0xE0)&0xff
mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
movdqu 0x30($Htbl),$Hkey3
movdqu 0x40($Htbl),$Hkey4
......
......@@ -118,9 +118,9 @@ $code=<<___;
le?vperm $IN,$IN,$IN,$lemask
vxor $zero,$zero,$zero
vpmsumd $Xl,$IN,$Hl # H.loXi.lo
vpmsumd $Xm,$IN,$H # H.hiXi.lo+H.loXi.hi
vpmsumd $Xh,$IN,$Hh # H.hiXi.hi
vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
vpmsumd $t2,$Xl,$xC2 # 1st phase
......@@ -178,11 +178,11 @@ $code=<<___;
.align 5
Loop:
subic $len,$len,16
vpmsumd $Xl,$IN,$Hl # H.loXi.lo
vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
subfe. r0,r0,r0 # borrow?-1:0
vpmsumd $Xm,$IN,$H # H.hiXi.lo+H.loXi.hi
vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
and r0,r0,$len
vpmsumd $Xh,$IN,$Hh # H.hiXi.hi
vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
add $inp,$inp,r0
vpmsumd $t2,$Xl,$xC2 # 1st phase
......
......@@ -144,10 +144,10 @@ gcm_gmult_v8:
#endif
vext.8 $IN,$t1,$t1,#8
vpmull.p64 $Xl,$H,$IN @ H.loXi.lo
vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
veor $t1,$t1,$IN @ Karatsuba pre-processing
vpmull2.p64 $Xh,$H,$IN @ H.hiXi.hi
vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)(Xi.lo+Xi.hi)
vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
......@@ -235,7 +235,7 @@ $code.=<<___;
#endif
vext.8 $In,$t1,$t1,#8
veor $IN,$IN,$Xl @ I[i]^=Xi
vpmull.p64 $Xln,$H,$In @ HIi+1
vpmull.p64 $Xln,$H,$In @ H·Ii+1
veor $t1,$t1,$In @ Karatsuba pre-processing
vpmull2.p64 $Xhn,$H,$In
b .Loop_mod2x_v8
......@@ -244,14 +244,14 @@ $code.=<<___;
.Loop_mod2x_v8:
vext.8 $t2,$IN,$IN,#8
subs $len,$len,#32 @ is there more data?
vpmull.p64 $Xl,$H2,$IN @ H^2.loXi.lo
vpmull.p64 $Xl,$H2,$IN @ H^2.lo·Xi.lo
cclr $inc,lo @ is it time to zero $inc?
vpmull.p64 $Xmn,$Hhl,$t1
veor $t2,$t2,$IN @ Karatsuba pre-processing
vpmull2.p64 $Xh,$H2,$IN @ H^2.hiXi.hi
vpmull2.p64 $Xh,$H2,$IN @ H^2.hi·Xi.hi
veor $Xl,$Xl,$Xln @ accumulate
vpmull2.p64 $Xm,$Hhl,$t2 @ (H^2.lo+H^2.hi)(Xi.lo+Xi.hi)
vpmull2.p64 $Xm,$Hhl,$t2 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
vld1.64 {$t0},[$inp],$inc @ load [rotated] I[i+2]
veor $Xh,$Xh,$Xhn
......@@ -276,7 +276,7 @@ $code.=<<___;
vext.8 $In,$t1,$t1,#8
vext.8 $IN,$t0,$t0,#8
veor $Xl,$Xm,$t2
vpmull.p64 $Xln,$H,$In @ HIi+1
vpmull.p64 $Xln,$H,$In @ H·Ii+1
veor $IN,$IN,$Xh @ accumulate $IN early
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
......@@ -300,10 +300,10 @@ $code.=<<___;
veor $IN,$IN,$Xl @ inp^=Xi
veor $t1,$t0,$t2 @ $t1 is rotated inp^Xi
vpmull.p64 $Xl,$H,$IN @ H.loXi.lo
vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
veor $t1,$t1,$IN @ Karatsuba pre-processing
vpmull2.p64 $Xh,$H,$IN @ H.hiXi.hi
vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)(Xi.lo+Xi.hi)
vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
......
......@@ -44,7 +44,7 @@
# Sandy Bridge 5.0/+8%
# Atom 12.6/+6%
# VIA Nano 6.4/+9%
# Ivy Bridge 4.9/0%
# Ivy Bridge 4.9/±0%
# Bulldozer 4.9/+15%
#
# (*) PIII can actually deliver 6.6 cycles per byte with MMX code,
......
......@@ -56,7 +56,7 @@
# achieves respectable 432MBps on 2.8GHz processor now. For reference.
# If executed on Xeon, current RC4_CHAR code-path is 2.7x faster than
# RC4_INT code-path. While if executed on Opteron, it's only 25%
# slower than the RC4_INT one [meaning that if CPU -arch detection
# slower than the RC4_INT one [meaning that if CPU µ-arch detection
# is not implemented, then this final RC4_CHAR code-path should be
# preferred, as it provides better *all-round* performance].
......
......@@ -66,9 +66,9 @@
# switch to AVX alone improves performance by as little as 4% in
# comparison to SSSE3 code path. But below result doesn't look like
# 4% improvement... Trouble is that Sandy Bridge decodes 'ro[rl]' as
# pair of µ-ops, and it's the additional µ-ops, two per round, that
# pair of µ-ops, and it's the additional µ-ops, two per round, that
# make it run slower than Core2 and Westmere. But 'sh[rl]d' is decoded
# as single µ-op by Sandy Bridge and it's replacing 'ro[rl]' with
# as single µ-op by Sandy Bridge and it's replacing 'ro[rl]' with
# equivalent 'sh[rl]d' that is responsible for the impressive 5.1
# cycles per processed byte. But 'sh[rl]d' is not something that used
# to be fast, nor does it appear to be fast in upcoming Bulldozer
......
......@@ -10,7 +10,7 @@
# SHA256 block transform for x86. September 2007.
#
# Performance improvement over compiler generated code varies from
# 10% to 40% [see below]. Not very impressive on some -archs, but
# 10% to 40% [see below]. Not very impressive on some µ-archs, but
# it's 5 times smaller and optimizes the amount of writes.
#
# May 2012.
......
......@@ -37,7 +37,7 @@
#
# IALU code-path is optimized for elder Pentiums. On vanilla Pentium
# performance improvement over compiler generated code reaches ~60%,
# while on PIII - ~35%. On newer -archs improvement varies from 15%
# while on PIII - ~35%. On newer µ-archs improvement varies from 15%
# to 50%, but it's less important as they are expected to execute SSE2
# code-path, which is commonly ~2-3x faster [than compiler generated
# code]. SSE2 code-path is as fast as original sha512-sse2.pl, even
......
......@@ -127,7 +127,7 @@ OPENSSL_wipe_cpu:
fmovs %f1,%f3
fmovs %f0,%f2
add %fp,BIAS,%i0 ! return pointer to callers top of stack
add %fp,BIAS,%i0 ! return pointer to caller´s top of stack
ret
restore
......
......@@ -16,7 +16,7 @@
# table]. I stick to value of 2 for two reasons: 1. smaller table
# minimizes cache trashing and thus mitigates the hazard of side-
# channel leakage similar to AES cache-timing one; 2. performance
# gap among different -archs is smaller.
# gap among different µ-archs is smaller.
#
# Performance table lists rounded amounts of CPU cycles spent by
# whirlpool_block_mmx routine on single 64 byte input block, i.e.
......
......@@ -3,7 +3,7 @@
* Contributed to the OpenSSL Project 2004 by Richard Levitte
* (richard@levitte.org)
*/
/* Copyright (c) 2004 Kungliga Tekniska Hgskolan
/* Copyright (c) 2004 Kungliga Tekniska Högskolan
* (Royal Institute of Technology, Stockholm, Sweden).
* All rights reserved.
*
......
......@@ -3,7 +3,7 @@
* Contributed to the OpenSSL Project 2004 by Richard Levitte
* (richard@levitte.org)
*/
/* Copyright (c) 2004 Kungliga Tekniska Högskolan
/* Copyright (c) 2004 Kungliga Tekniska Högskolan
* (Royal Institute of Technology, Stockholm, Sweden).
* All rights reserved.
*
......
......@@ -62,4 +62,4 @@ As noted above, easy_tls.c will be changed to become a library one
day, which means that future revisions will not be fully compatible to
the current version.
Bodo Möller <bodo@openssl.org>
Bodo Möller <bodo@openssl.org>
......@@ -57,7 +57,7 @@ BEGIN
VALUE "ProductVersion", "$version\\0"
// Optional:
//VALUE "Comments", "\\0"
VALUE "LegalCopyright", "Copyright 1998-2006 The OpenSSL Project. Copyright 1995-1998 Eric A. Young, Tim J. Hudson. All rights reserved.\\0"
VALUE "LegalCopyright", "Copyright © 1998-2006 The OpenSSL Project. Copyright © 1995-1998 Eric A. Young, Tim J. Hudson. All rights reserved.\\0"
//VALUE "LegalTrademarks", "\\0"
//VALUE "PrivateBuild", "\\0"
//VALUE "SpecialBuild", "\\0"
......