OpenHarmony / Third Party Openssl

Commit 273a8081
Authored by Andy Polyakov, Feb 14, 2013
ghash-x86[_64].pl: code refresh.
Parent: 7c9e81be

Showing 3 changed files with 499 additions and 154 deletions (+499 -154):

crypto/modes/asm/ghash-x86.pl     +104  -61
crypto/modes/asm/ghash-x86_64.pl  +379  -93
crypto/modes/gcm128.c              +16   -0
crypto/modes/asm/ghash-x86.pl
...
...
@@ -119,6 +119,12 @@
 # For reference, AMD Bulldozer processes one byte in 1.98 cycles in
 # 32-bit mode and 1.89 in 64-bit.
+
+# February 2013
+#
+# Overhaul: aggregate Karatsuba post-processing, improve ILP in
+# reduction_alg9. Resulting performance is 1.96 cycles per byte on
+# Westmere, 1.95 - on Sandy/Ivy Bridge, 1.76 - on Bulldozer.

 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
...
...
@@ -828,17 +834,18 @@ $len="ebx";
 &static_label("bswap");

 sub clmul64x64_T2 {	# minimal "register" pressure
-my ($Xhi,$Xi,$Hkey)=@_;
+my ($Xhi,$Xi,$Hkey,$HK)=@_;

 	&movdqa		($Xhi,$Xi);		#
 	&pshufd		($T1,$Xi,0b01001110);
-	&pshufd		($T2,$Hkey,0b01001110);
+	&pshufd		($T2,$Hkey,0b01001110)	if (!defined($HK));
 	&pxor		($T1,$Xi);		#
-	&pxor		($T2,$Hkey);
+	&pxor		($T2,$Hkey)		if (!defined($HK));
+	$HK=$T2				if (!defined($HK));

 	&pclmulqdq	($Xi,$Hkey,0x00);	#######
 	&pclmulqdq	($Xhi,$Hkey,0x11);	#######
-	&pclmulqdq	($T1,$T2,0x00);		#######
+	&pclmulqdq	($T1,$HK,0x00);		#######
 	&xorps		($T1,$Xi);		#
 	&xorps		($T1,$Xhi);		#
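clmul64x64_T2 performs one 128x128 carry-less multiplication with three pclmulqdq's (Karatsuba). The change adds an optional $HK argument carrying a precomputed H.lo^H.hi, so the pshufd/pxor pair is emitted only when no precomputed value is supplied. A rough scalar equivalent (clmul64 spells out what a single pclmulqdq computes; all names are illustrative):

#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128;

/* Scalar meaning of a single pclmulqdq: carry-less 64x64 -> 128 multiply. */
static u128 clmul64(uint64_t a, uint64_t b)
{
    u128 r = {0, 0};
    for (int i = 0; i < 64; i++)
        if ((b >> i) & 1) {
            r.lo ^= a << i;
            if (i) r.hi ^= a >> (64 - i);
        }
    return r;
}

/* Karatsuba 128x128 with three clmuls, as in clmul64x64_T2.  hk is the
 * precomputed h.lo^h.hi (the role of the new $HK argument); supplying it
 * saves recomputing that value on every call. */
static void clmul128(u128 x, u128 h, uint64_t hk, u128 *lo, u128 *hi, u128 *mid)
{
    *lo  = clmul64(x.lo, h.lo);            /* pclmulqdq 0x00 */
    *hi  = clmul64(x.hi, h.hi);            /* pclmulqdq 0x11 */
    *mid = clmul64(x.lo ^ x.hi, hk);       /* the $HK multiplication */
}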
...
...
@@ -885,31 +892,32 @@ if (1) {		# Algorithm 9 with <<1 twist.
 				# below. Algorithm 9 was therefore chosen for
 				# further optimization...

-sub reduction_alg9 {	# 17/13 times faster than Intel version
+sub reduction_alg9 {	# 17/11 times faster than Intel version
 my ($Xhi,$Xi) = @_;

 	# 1st phase
-	&movdqa		($T1,$Xi);		#
+	&movdqa		($T2,$Xi);		#
+	&movdqa		($T1,$Xi);
+	&psllq		($Xi,5);
+	&pxor		($T1,$Xi);		#
 	&psllq		($Xi,1);
 	&pxor		($Xi,$T1);		#
-	&psllq		($Xi,5);		#
-	&pxor		($Xi,$T1);		#
 	&psllq		($Xi,57);		#
-	&movdqa		($T2,$Xi);		#
+	&movdqa		($T1,$Xi);		#
 	&pslldq		($Xi,8);
-	&psrldq		($T2,8);		#
-	&pxor		($Xi,$T1);
-	&pxor		($Xhi,$T2);		#
+	&psrldq		($T1,8);		#
+	&pxor		($Xi,$T2);
+	&pxor		($Xhi,$T1);		#

 	# 2nd phase
 	&movdqa		($T2,$Xi);
+	&psrlq		($Xi,1);
+	&pxor		($Xhi,$T2);		#
+	&pxor		($T2,$Xi);
 	&psrlq		($Xi,5);
 	&pxor		($Xi,$T2);		#
 	&psrlq		($Xi,1);		#
-	&pxor		($Xi,$T2);		#
-	&pxor		($T2,$Xhi);
-	&psrlq		($Xi,1);		#
-	&pxor		($Xi,$T2);		#
+	&pxor		($Xi,$Xhi)		#
 }

 &function_begin_B("gcm_init_clmul");
...
...
@@ -943,8 +951,14 @@ my ($Xhi,$Xi) = @_;
 	&clmul64x64_T2	($Xhi,$Xi,$Hkey);
 	&reduction_alg9	($Xhi,$Xi);

+	&pshufd		($T1,$Hkey,0b01001110);
+	&pshufd		($T2,$Xi,0b01001110);
+	&pxor		($T1,$Hkey);		# Karatsuba pre-processing
 	&movdqu		(&QWP(0,$Htbl),$Hkey);	# save H
+	&pxor		($T2,$Xi);		# Karatsuba pre-processing
 	&movdqu		(&QWP(16,$Htbl),$Xi);	# save H^2
+	&palignr	($T2,$T1,8);		# low part is H.lo^H.hi
+	&movdqu		(&QWP(32,$Htbl),$T2);	# save Karatsuba "salt"

 	&ret		();
 &function_end_B("gcm_init_clmul");
...
...
@@ -962,8 +976,9 @@ my ($Xhi,$Xi) = @_;
 	&movdqa		($T3,&QWP(0,$const));
 	&movups		($Hkey,&QWP(0,$Htbl));
 	&pshufb		($Xi,$T3);
+	&movups		($T2,&QWP(32,$Htbl));

-	&clmul64x64_T2	($Xhi,$Xi,$Hkey);
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$T2);
 	&reduction_alg9	($Xhi,$Xi);

 	&pshufb		($Xi,$T3);
...
...
@@ -1000,79 +1015,107 @@ my ($Xhi,$Xi) = @_;
 	&movdqu		($Xn,&QWP(16,$inp));	# Ii+1
 	&pshufb		($T1,$T3);
 	&pshufb		($Xn,$T3);
+	&movdqu		($T3,&QWP(32,$Htbl));
 	&pxor		($Xi,$T1);		# Ii+Xi

-	&clmul64x64_T2	($Xhn,$Xn,$Hkey);	# H*Ii+1
+	&pshufd		($T1,$Xn,0b01001110);	# H*Ii+1
+	&movdqa		($Xhn,$Xn);
+	&pxor		($T1,$Xn);		#
+	&pclmulqdq	($Xn,$Hkey,0x00);	#######
+	&pclmulqdq	($Xhn,$Hkey,0x11);	#######
 	&movups		($Hkey,&QWP(16,$Htbl));	# load H^2
+	&pclmulqdq	($T1,$T3,0x00);		#######

 	&lea		($inp,&DWP(32,$inp));	# i+=2
 	&sub		($len,0x20);
 	&jbe		(&label("even_tail"));
+	&jmp		(&label("mod_loop"));

-&set_label("mod_loop");
-	&clmul64x64_T2	($Xhi,$Xi,$Hkey);	# H^2*(Ii+Xi)
-	&movdqu		($T1,&QWP(0,$inp));	# Ii
-	&movups		($Hkey,&QWP(0,$Htbl));	# load H
+&set_label("mod_loop",32);
+	&pshufd		($T2,$Xi,0b01001110);	# H^2*(Ii+Xi)
+	&movdqa		($Xhi,$Xi);
+	&pxor		($T2,$Xi);		#

-	&pxor		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi)
-	&pxor		($Xhi,$Xhn);
+	&pclmulqdq	($Xi,$Hkey,0x00);	#######
+	&pclmulqdq	($Xhi,$Hkey,0x11);	#######
+	&movups		($Hkey,&QWP(0,$Htbl));	# load H
+	&pclmulqdq	($T2,$T3,0x10);		#######
+	&movdqa		($T3,&QWP(0,$const));

-	&movdqu		($Xn,&QWP(16,$inp));	# Ii+1
-	&pshufb		($T1,$T3);
-	&pshufb		($Xn,$T3);
+	&xorps		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi)
+	&xorps		($Xhi,$Xhn);
+	&movdqu		($Xhn,&QWP(0,$inp));	# Ii
+	&pxor		($T1,$Xi);		# aggregated Karatsuba post-processing
+	&movdqu		($Xn,&QWP(16,$inp));	# Ii+1
+	&pxor		($T1,$Xhi);		#

-	&movdqa		($T3,$Xn);		#&clmul64x64_TX	($Xhn,$Xn,$Hkey); H*Ii+1
-	&movdqa		($Xhn,$Xn);
-	&pxor		($Xhi,$T1);		# "Ii+Xi", consume early
+	&pxor		($T2,$T1);		#
+	&pshufb		($Xhn,$T3);

-	&movdqa		($T1,$Xi);		#&reduction_alg9($Xhi,$Xi); 1st phase
+	&movdqa		($T1,$T2);		#
+	&psrldq		($T2,8);
+	&pslldq		($T1,8);		#
+	&pxor		($Xhi,$T2);
+	&pxor		($Xi,$T1);		#
+	&pshufb		($Xn,$T3);
+	&pxor		($Xhi,$Xhn);		# "Ii+Xi", consume early
+	&movdqa		($Xhn,$Xn);		#&clmul64x64_TX	($Xhn,$Xn,$Hkey); H*Ii+1
+	&movdqa		($T2,$Xi);		#&reduction_alg9($Xhi,$Xi); 1st phase
+	&movdqa		($T1,$Xi);
+	&psllq		($Xi,5);
+	&pxor		($T1,$Xi);		#
 	&psllq		($Xi,1);
 	&pxor		($Xi,$T1);		#
-	&psllq		($Xi,5);		#
-	&pxor		($Xi,$T1);		#
+	&movups		($T3,&QWP(32,$Htbl));
 	&pclmulqdq	($Xn,$Hkey,0x00);	#######
 	&psllq		($Xi,57);		#
-	&movdqa		($T2,$Xi);		#
+	&movdqa		($T1,$Xi);		#
 	&pslldq		($Xi,8);
-	&psrldq		($T2,8);		#
-	&pxor		($Xi,$T1);
-	&pshufd		($T1,$T3,0b01001110);
-	&pxor		($Xhi,$T2);		#
-	&pxor		($T1,$T3);
-	&pshufd		($T3,$Hkey,0b01001110);
-	&pxor		($T3,$Hkey);		#
-	&pclmulqdq	($Xhn,$Hkey,0x11);	#######
+	&psrldq		($T1,8);		#
+	&pxor		($Xi,$T2);
+	&pxor		($Xhi,$T1);		#
+	&pshufd		($T1,$Xhn,0b01001110);
 	&movdqa		($T2,$Xi);		# 2nd phase
 	&psrlq		($Xi,1);
+	&pxor		($T1,$Xhn);
+	&pclmulqdq	($Xhn,$Hkey,0x11);	#######
+	&movups		($Hkey,&QWP(16,$Htbl));	# load H^2
+	&pxor		($Xhi,$T2);		#
+	&pxor		($T2,$Xi);
 	&psrlq		($Xi,5);
 	&pxor		($Xi,$T2);		#
 	&psrlq		($Xi,1);		#
-	&pxor		($Xi,$T2);		#
-	&pxor		($T2,$Xhi);
-	&psrlq		($Xi,1);		#
-	&pxor		($Xi,$T2);		#
+	&pxor		($Xi,$Xhi);		#

 	&pclmulqdq	($T1,$T3,0x00);		#######
-	&movups		($Hkey,&QWP(16,$Htbl));	# load H^2
-	&xorps		($T1,$Xn);		#
-	&xorps		($T1,$Xhn);		#
-
-	&movdqa		($T3,$T1);		#
-	&psrldq		($T1,8);
-	&pslldq		($T3,8);		#
-	&pxor		($Xhn,$T1);
-	&pxor		($Xn,$T3);		#
-	&movdqa		($T3,&QWP(0,$const));

 	&lea		($inp,&DWP(32,$inp));
 	&sub		($len,0x20);
 	&ja		(&label("mod_loop"));

&set_label("even_tail");
-	&clmul64x64_T2	($Xhi,$Xi,$Hkey);	# H^2*(Ii+Xi)
+	&pshufd		($T2,$Xi,0b01001110);	# H^2*(Ii+Xi)
+	&movdqa		($Xhi,$Xi);
+	&pxor		($T2,$Xi);		#

-	&pxor		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi)
-	&pxor		($Xhi,$Xhn);
+	&pclmulqdq	($Xi,$Hkey,0x00);	#######
+	&pclmulqdq	($Xhi,$Hkey,0x11);	#######
+	&pclmulqdq	($T2,$T3,0x10);		#######
+	&movdqa		($T3,&QWP(0,$const));

+	&xorps		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi)
+	&xorps		($Xhi,$Xhn);
+	&pxor		($T1,$Xi);		# aggregated Karatsuba post-processing
+	&pxor		($T1,$Xhi);		#
+	&pxor		($T2,$T1);		#
+	&movdqa		($T1,$T2);		#
+	&psrldq		($T2,8);
+	&pslldq		($T1,8);		#
+	&pxor		($Xhi,$T2);
+	&pxor		($Xi,$T1);		#

 	&reduction_alg9	($Xhi,$Xi);
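The rewritten loop keeps the recurrence Xi+2 = [(H*Ii+1) + H^2*(Ii+Xi)] mod P but software-pipelines it: the pclmulqdq triplet for the next pair is issued while the previous reduction is still in flight. The data flow in scalar C (self-contained sketch with illustrative helpers; the assembly XOR-sums the unreduced 256-bit products and reduces once, where this sketch reduces each product for clarity):

#include <stddef.h>
#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128;   /* hi = first 8 bytes of a block */

static u128 x128(u128 a, u128 b) { a.hi ^= b.hi; a.lo ^= b.lo; return a; }

static u128 load_be(const unsigned char *p)  /* 16-byte big-endian load (pshufb) */
{
    u128 r = {0, 0};
    for (int i = 0; i < 8; i++)  r.hi = (r.hi << 8) | p[i];
    for (int i = 8; i < 16; i++) r.lo = (r.lo << 8) | p[i];
    return r;
}

static u128 gf128_mul(u128 X, u128 Y)        /* bit-serial reference multiply */
{
    u128 Z = {0, 0}, V = Y;
    for (int i = 0; i < 128; i++) {
        uint64_t b = (i < 64) ? (X.hi >> (63 - i)) & 1 : (X.lo >> (127 - i)) & 1;
        if (b) Z = x128(Z, V);
        uint64_t c = V.lo & 1;
        V.lo = (V.lo >> 1) | (V.hi << 63);
        V.hi = (V.hi >> 1) ^ (c ? 0xE100000000000000ULL : 0);
    }
    return Z;
}

/* Two blocks per iteration: Xi+2 = (H*Ii+1) + H^2*(Ii+Xi) mod P.
 * The two products are independent, which is what the loop interleaves. */
static u128 ghash2x(u128 Xi, u128 H, u128 H2, const unsigned char *inp, size_t len)
{
    for (; len >= 32; inp += 32, len -= 32) {
        u128 t = gf128_mul(load_be(inp + 16), H);    /* H * Ii+1      */
        Xi = gf128_mul(x128(Xi, load_be(inp)), H2);  /* H^2 * (Ii+Xi) */
        Xi = x128(Xi, t);
    }
    return Xi;   /* 0 or 1 blocks left for the tail code */
}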
...
...
crypto/modes/asm/ghash-x86_64.pl
...
...
@@ -41,6 +41,29 @@
 # providing access to a Westmere-based system on behalf of Intel
 # Open Source Technology Centre.

+# December 2012
+#
+# Overhaul: aggregate Karatsuba post-processing, improve ILP in
+# reduction_alg9, increase reduction aggregate factor to 4x. As for
+# the latter. ghash-x86.pl discusses that it makes lesser sense to
+# increase aggregate factor. Then why increase here? Critical path
+# consists of 3 independent pclmulqdq instructions, Karatsuba post-
+# processing and reduction. "On top" of this we lay down aggregated
+# multiplication operations, triplets of independent pclmulqdq's. As
+# issue rate for pclmulqdq is limited, it makes lesser sense to
+# aggregate more multiplications than it takes to perform remaining
+# non-multiplication operations. 2x is near-optimal coefficient for
+# contemporary Intel CPUs (therefore modest improvement coefficient),
+# but not for Bulldozer. Latter is because logical SIMD operations
+# are twice as slow in comparison to Intel, so that critical path is
+# longer. A CPU with higher pclmulqdq issue rate would also benefit
+# from higher aggregate factor...
+#
+# Westmere	1.76(+14%)
+# Sandy Bridge	1.79(+9%)
+# Ivy Bridge	1.79(+8%)
+# Bulldozer	1.52(+25%)

 $flavour = shift;
 $output  = shift;
 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
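With the aggregate factor raised to 4x, four blocks are absorbed per reduction: Xi+4 = [(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P, the same formula quoted inside gcm_ghash_clmul below. A scalar sketch (illustrative helpers; the real code issues the multiplications as independent pclmulqdq triplets and reduces once per iteration):

#include <stddef.h>
#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128;

static u128 x128(u128 a, u128 b) { a.hi ^= b.hi; a.lo ^= b.lo; return a; }

static u128 load_be(const unsigned char *p)  /* 16-byte big-endian load */
{
    u128 r = {0, 0};
    for (int i = 0; i < 8; i++)  r.hi = (r.hi << 8) | p[i];
    for (int i = 8; i < 16; i++) r.lo = (r.lo << 8) | p[i];
    return r;
}

static u128 gf128_mul(u128 X, u128 Y)        /* bit-serial reference multiply */
{
    u128 Z = {0, 0}, V = Y;
    for (int i = 0; i < 128; i++) {
        uint64_t b = (i < 64) ? (X.hi >> (63 - i)) & 1 : (X.lo >> (127 - i)) & 1;
        if (b) Z = x128(Z, V);
        uint64_t c = V.lo & 1;
        V.lo = (V.lo >> 1) | (V.hi << 63);
        V.hi = (V.hi >> 1) ^ (c ? 0xE100000000000000ULL : 0);
    }
    return Z;
}

/* Four blocks per reduction; Hpow = {H, H^2, H^3, H^4} as stored in Htbl. */
static u128 ghash4x(u128 Xi, const u128 Hpow[4], const unsigned char *inp, size_t len)
{
    for (; len >= 64; inp += 64, len -= 64) {
        u128 acc = gf128_mul(x128(Xi, load_be(inp)), Hpow[3]);  /* H^4*(Ii+Xi) */
        acc = x128(acc, gf128_mul(load_be(inp + 16), Hpow[2])); /* H^3*Ii+1    */
        acc = x128(acc, gf128_mul(load_be(inp + 32), Hpow[1])); /* H^2*Ii+2    */
        acc = x128(acc, gf128_mul(load_be(inp + 48), Hpow[0])); /* H  *Ii+3    */
        Xi = acc;
    }
    return Xi;   /* remaining blocks fall through to the 2x/odd tail paths */
}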
...
...
@@ -55,6 +78,8 @@ die "can't locate x86_64-xlate.pl";
 open OUT,"| \"$^X\" $xlate $flavour $output";
 *STDOUT=*OUT;

+$do4xaggr=1;
+
 # common register layout
 $nlo="%rax";
 $nhi="%rbx";
...
...
@@ -354,19 +379,27 @@ ___
 ($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");

 sub clmul64x64_T2 {	# minimal register pressure
-my ($Xhi,$Xi,$Hkey,$modulo)=@_;
+my ($Xhi,$Xi,$Hkey,$HK)=@_;

-$code.=<<___ if (!defined($modulo));
+if (!defined($HK)) {	$HK = $T2;
+$code.=<<___;
 	movdqa		$Xi,$Xhi		#
 	pshufd		\$0b01001110,$Xi,$T1
 	pshufd		\$0b01001110,$Hkey,$T2
 	pxor		$Xi,$T1			#
 	pxor		$Hkey,$T2
 ___
+} else {
+$code.=<<___;
+	movdqa		$Xi,$Xhi		#
+	pshufd		\$0b01001110,$Xi,$T1
+	pxor		$Xi,$T1			#
+___
+}
 $code.=<<___;
 	pclmulqdq	\$0x00,$Hkey,$Xi	#######
 	pclmulqdq	\$0x11,$Hkey,$Xhi	#######
-	pclmulqdq	\$0x00,$T2,$T1		#######
+	pclmulqdq	\$0x00,$HK,$T1		#######
 	pxor		$Xi,$T1			#
 	pxor		$Xhi,$T1		#
...
...
@@ -378,32 +411,33 @@ $code.=<<___;
 ___
 }

-sub reduction_alg9 {	# 17/13 times faster than Intel version
+sub reduction_alg9 {	# 17/11 times faster than Intel version
 my ($Xhi,$Xi) = @_;

 $code.=<<___;
 	# 1st phase
-	movdqa		$Xi,$T1			#
+	movdqa		$Xi,$T2			#
+	movdqa		$Xi,$T1
+	psllq		\$5,$Xi
+	pxor		$Xi,$T1			#
 	psllq		\$1,$Xi
 	pxor		$T1,$Xi			#
-	psllq		\$5,$Xi			#
-	pxor		$T1,$Xi			#
 	psllq		\$57,$Xi		#
-	movdqa		$Xi,$T2			#
+	movdqa		$Xi,$T1			#
 	pslldq		\$8,$Xi
-	psrldq		\$8,$T2			#
-	pxor		$T1,$Xi
-	pxor		$T2,$Xhi		#
+	psrldq		\$8,$T1			#
+	pxor		$T2,$Xi
+	pxor		$T1,$Xhi		#

 	# 2nd phase
 	movdqa		$Xi,$T2
 	psrlq		\$1,$Xi
+	pxor		$T2,$Xhi		#
+	pxor		$Xi,$T2
 	psrlq		\$5,$Xi
 	pxor		$T2,$Xi			#
 	psrlq		\$1,$Xi			#
-	pxor		$T2,$Xi			#
-	pxor		$Xhi,$T2
-	psrlq		\$1,$Xi			#
-	pxor		$T2,$Xi			#
+	pxor		$Xhi,$Xi		#
 ___
 }
...
...
@@ -437,8 +471,35 @@ ___
 &clmul64x64_T2	($Xhi,$Xi,$Hkey);
 &reduction_alg9	($Xhi,$Xi);

 $code.=<<___;
-	movdqu		$Hkey,($Htbl)		# save H
-	movdqu		$Xi,16($Htbl)		# save H^2
+	pshufd		\$0b01001110,$Hkey,$T1
+	pshufd		\$0b01001110,$Xi,$T2
+	pxor		$Hkey,$T1		# Karatsuba pre-processing
+	movdqu		$Hkey,0x00($Htbl)	# save H
+	pxor		$Xi,$T2			# Karatsuba pre-processing
+	movdqu		$Xi,0x10($Htbl)		# save H^2
+	palignr		\$8,$T1,$T2		# low part is H.lo^H.hi...
+	movdqu		$T2,0x20($Htbl)		# save Karatsuba "salt"
 ___
+if ($do4xaggr) {
+&clmul64x64_T2	($Xhi,$Xi,$Hkey);	# H^3
+&reduction_alg9	($Xhi,$Xi);
+
+$code.=<<___;
+	movdqa		$Xi,$T3
+___
+&clmul64x64_T2	($Xhi,$Xi,$Hkey);	# H^4
+&reduction_alg9	($Xhi,$Xi);
+
+$code.=<<___;
+	pshufd		\$0b01001110,$T3,$T1
+	pshufd		\$0b01001110,$Xi,$T2
+	pxor		$T3,$T1			# Karatsuba pre-processing
+	movdqu		$T3,0x30($Htbl)		# save H^3
+	pxor		$Xi,$T2			# Karatsuba pre-processing
+	movdqu		$Xi,0x40($Htbl)		# save H^4
+	palignr		\$8,$T1,$T2		# low part is H.lo^H.hi...
+	movdqu		$T2,0x50($Htbl)		# save Karatsuba "salt"
+___
+}
 $code.=<<___;
 	ret
 .size	gcm_init_clmul,.-gcm_init_clmul
 ___
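Under $do4xaggr the table grows to six 16-byte entries: H, H^2 and their Karatsuba salt at offsets 0x00-0x20, then H^3, H^4 and their salt at 0x30-0x50, matching the movdqu offsets above. Scalar sketch of the layout (mul is an assumed multiply-and-reduce callback):

#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128;

/* Six-entry Htbl written when $do4xaggr is set (offsets in bytes):
 *   0x00 H    0x10 H^2   0x20 salt(H,H^2)
 *   0x30 H^3  0x40 H^4   0x50 salt(H^3,H^4)
 * where salt(A,B) packs A.lo^A.hi in the low half and B.lo^B.hi in the
 * high half -- the precomputed Karatsuba middle operands. */
static void init_htbl_4x(u128 Htbl[6], u128 H, u128 (*mul)(u128, u128))
{
    u128 H2 = mul(H, H), H3 = mul(H2, H), H4 = mul(H3, H);
    Htbl[0] = H;   Htbl[1] = H2;
    Htbl[2].lo = H.lo ^ H.hi;    Htbl[2].hi = H2.lo ^ H2.hi;
    Htbl[3] = H3;  Htbl[4] = H4;
    Htbl[5].lo = H3.lo ^ H3.hi;  Htbl[5].hi = H4.lo ^ H4.hi;
}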
...
...
@@ -454,10 +515,34 @@ gcm_gmult_clmul:
 	movdqu		($Xip),$Xi
 	movdqa		.Lbswap_mask(%rip),$T3
 	movdqu		($Htbl),$Hkey
+	movdqu		0x20($Htbl),$T2
 	pshufb		$T3,$Xi
 ___
-&clmul64x64_T2	($Xhi,$Xi,$Hkey);
-&reduction_alg9	($Xhi,$Xi);
+&clmul64x64_T2	($Xhi,$Xi,$Hkey,$T2);
+$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
+	# experimental alternative. special thing about is that there
+	# no dependency between the two multiplications...
+	mov		\$`0xE1<<1`,%eax
+	mov		\$0xA040608020C0E000,%r10	# ((7..0)·0xE0)&0xff
+	mov		\$0x07,%r11d
+	movq		%rax,$T1
+	movq		%r10,$T2
+	movq		%r11,$T3		# borrow $T3
+	pand		$Xi,$T3
+	pshufb		$T3,$T2			# ($Xi&7)·0xE0
+	movq		%rax,$T3
+	pclmulqdq	\$0x00,$Xi,$T1		# ·(0xE1<<1)
+	pxor		$Xi,$T2
+	pslldq		\$15,$T2
+	paddd		$T2,$T2			# <<(64+56+1)
+	pxor		$T2,$Xi
+	pclmulqdq	\$0x01,$T3,$Xi
+	movdqa		.Lbswap_mask(%rip),$T3	# reload $T3
+	psrldq		\$1,$T1
+	pxor		$T1,$Xhi
+	pslldq		\$7,$Xi
+	pxor		$Xhi,$Xi
+___
 $code.=<<___;
 	pshufb		$T3,$Xi
 	movdqu		$Xi,($Xip)
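A note on the magic constant used above and in gcm_ghash_clmul: reading the "·" in "((7..0)·0xE0)&0xff" as carry-less multiplication, byte i of 0xA040608020C0E000 is the low byte of i·0xE0, which is what lets the pand $0x07 plus pshufb pair act as a per-byte lookup of ($Xi&7)·0xE0 during the first reduction phase. A quick check under that reading (this interpretation is an assumption, not stated in the diff):

#include <stdio.h>
#include <stdint.h>

/* low 8 bits of the carry-less product a (x) b */
static uint8_t clmul8(uint8_t a, uint16_t b)
{
    uint16_t r = 0;
    for (int i = 0; i < 8; i++)
        if ((a >> i) & 1) r ^= (uint16_t)(b << i);
    return (uint8_t)r;
}

int main(void)
{
    uint64_t k = 0;
    for (int i = 7; i >= 0; i--)                  /* bytes 7 down to 0 */
        k = (k << 8) | clmul8((uint8_t)i, 0xE0);
    printf("%016llx\n", (unsigned long long)k);   /* prints a040608020c0e000 */
    return 0;
}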
...
...
@@ -467,129 +552,316 @@ ___
 }

 { my ($Xip,$Htbl,$inp,$len)=@_4args;
-  my $Xn="%xmm6";
-  my $Xhn="%xmm7";
-  my $Hkey2="%xmm8";
-  my $T1n="%xmm9";
-  my $T2n="%xmm10";
+  my ($Xln,$Xmn,$Xhn,$Hkey2,$HK)=map("%xmm$_",(6..10));

 $code.=<<___;
 .globl	gcm_ghash_clmul
 .type	gcm_ghash_clmul,\@abi-omnipotent
-.align	16
+.align	32
 gcm_ghash_clmul:
 ___
 $code.=<<___ if ($win64);
+	lea	-0x88(%rsp),%rax
 .LSEH_begin_gcm_ghash_clmul:
 	# I can't trust assembler to use specific encoding:-(
-	.byte	0x48,0x83,0xec,0x58		#sub	\$0x58,%rsp
-	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
-	.byte	0x0f,0x29,0x7c,0x24,0x10	#movdqa	%xmm7,0x10(%rsp)
-	.byte	0x44,0x0f,0x29,0x44,0x24,0x20	#movaps	%xmm8,0x20(%rsp)
-	.byte	0x44,0x0f,0x29,0x4c,0x24,0x30	#movaps	%xmm9,0x30(%rsp)
-	.byte	0x44,0x0f,0x29,0x54,0x24,0x40	#movaps	%xmm10,0x40(%rsp)
+	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax),%rsp
+	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6,-0x20(%rax)
+	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7,-0x10(%rax)
+	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8,0(%rax)
+	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9,0x10(%rax)
+	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10,0x20(%rax)
+	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11,0x30(%rax)
+	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12,0x40(%rax)
+	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13,0x50(%rax)
+	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14,0x60(%rax)
+	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15,0x70(%rax)
 ___
 $code.=<<___;
 	movdqa		.Lbswap_mask(%rip),$T3
+	mov		\$0xA040608020C0E000,%rax	# ((7..0)·0xE0)&0xff

 	movdqu		($Xip),$Xi
 	movdqu		($Htbl),$Hkey
+	movdqu		0x20($Htbl),$HK
 	pshufb		$T3,$Xi

 	sub		\$0x10,$len
 	jz		.Lodd_tail

-	movdqu		16($Htbl),$Hkey2
+	movdqu		0x10($Htbl),$Hkey2
 ___
+if ($do4xaggr) {
+my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
+
+$code.=<<___;
+	cmp		\$0x30,$len
+	jb		.Lskip4x
+
+	sub		\$0x30,$len
+	movdqu		0x30($Htbl),$Hkey3
+	movdqu		0x40($Htbl),$Hkey4
+
+	#######
+	# Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
+	#
+	movdqu		0x30($inp),$Xln
+	movdqu		0x20($inp),$Xl
+	pshufb		$T3,$Xln
+	pshufb		$T3,$Xl
+	movdqa		$Xln,$Xhn
+	pshufd		\$0b01001110,$Xln,$Xmn
+	pxor		$Xln,$Xmn
+	pclmulqdq	\$0x00,$Hkey,$Xln
+	pclmulqdq	\$0x11,$Hkey,$Xhn
+	pclmulqdq	\$0x00,$HK,$Xmn
+
+	movdqa		$Xl,$Xh
+	pshufd		\$0b01001110,$Xl,$Xm
+	pxor		$Xl,$Xm
+	pclmulqdq	\$0x00,$Hkey2,$Xl
+	pclmulqdq	\$0x11,$Hkey2,$Xh
+	xorps		$Xl,$Xln
+	pclmulqdq	\$0x10,$HK,$Xm
+	xorps		$Xh,$Xhn
+	movups		0x50($Htbl),$HK
+	xorps		$Xm,$Xmn
+
+	movdqu		0x10($inp),$Xl
+	movdqu		0($inp),$T1
+	pshufb		$T3,$Xl
+	pshufb		$T3,$T1
+	movdqa		$Xl,$Xh
+	pshufd		\$0b01001110,$Xl,$Xm
+	pxor		$T1,$Xi
+	pxor		$Xl,$Xm
+	pclmulqdq	\$0x00,$Hkey3,$Xl
+	movdqa		$Xi,$Xhi
+	pshufd		\$0b01001110,$Xi,$T1
+	pxor		$Xi,$T1
+	pclmulqdq	\$0x11,$Hkey3,$Xh
+	xorps		$Xl,$Xln
+	pclmulqdq	\$0x00,$HK,$Xm
+	xorps		$Xh,$Xhn
+
+	lea		0x40($inp),$inp
+	sub		\$0x40,$len
+	jc		.Ltail4x
+
+	jmp		.Lmod4_loop
+.align	32
+.Lmod4_loop:
+	pclmulqdq	\$0x00,$Hkey4,$Xi
+	xorps		$Xm,$Xmn
+	movdqu		0x30($inp),$Xl
+	pshufb		$T3,$Xl
+	pclmulqdq	\$0x11,$Hkey4,$Xhi
+	xorps		$Xln,$Xi
+	movdqu		0x20($inp),$Xln
+	movdqa		$Xl,$Xh
+	pshufd		\$0b01001110,$Xl,$Xm
+	pclmulqdq	\$0x10,$HK,$T1
+	xorps		$Xhn,$Xhi
+	pxor		$Xl,$Xm
+	pshufb		$T3,$Xln
+	movups		0x20($Htbl),$HK
+	pclmulqdq	\$0x00,$Hkey,$Xl
+	xorps		$Xmn,$T1
+	movdqa		$Xln,$Xhn
+	pshufd		\$0b01001110,$Xln,$Xmn
+
+	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
+	pxor		$Xln,$Xmn
+	pxor		$Xhi,$T1		#
+	movdqa		$T1,$T2			#
+	pslldq		\$8,$T1
+	pclmulqdq	\$0x11,$Hkey,$Xh
+	psrldq		\$8,$T2			#
+	pxor		$T1,$Xi
+	movdqa		.L7_mask(%rip),$T1
+	pxor		$T2,$Xhi		#
+	movq		%rax,$T2
+
+	pand		$Xi,$T1			# 1st phase
+	pshufb		$T1,$T2			#
+	pclmulqdq	\$0x00,$HK,$Xm
+	pxor		$Xi,$T2			#
+	psllq		\$57,$T2		#
+	movdqa		$T2,$T1			#
+	pslldq		\$8,$T2
+	pclmulqdq	\$0x00,$Hkey2,$Xln
+	psrldq		\$8,$T1			#
+	pxor		$T2,$Xi
+	pxor		$T1,$Xhi		#
+	movdqu		0($inp),$T1
+
+	movdqa		$Xi,$T2			# 2nd phase
+	psrlq		\$1,$Xi
+	pclmulqdq	\$0x11,$Hkey2,$Xhn
+	xorps		$Xl,$Xln
+	movdqu		0x10($inp),$Xl
+	pshufb		$T3,$Xl
+	pclmulqdq	\$0x10,$HK,$Xmn
+	xorps		$Xh,$Xhn
+	movups		0x50($Htbl),$HK
+	pshufb		$T3,$T1
+	pxor		$T2,$Xhi		#
+	pxor		$Xi,$T2
+	psrlq		\$5,$Xi
+
+	movdqa		$Xl,$Xh
+	pxor		$Xm,$Xmn
+	pshufd		\$0b01001110,$Xl,$Xm
+	pxor		$Xl,$Xm
+	pclmulqdq	\$0x00,$Hkey3,$Xl
+	pxor		$T2,$Xi			#
+	pxor		$T1,$Xhi
+	psrlq		\$1,$Xi			#
+	pclmulqdq	\$0x11,$Hkey3,$Xh
+	xorps		$Xl,$Xln
+	pxor		$Xhi,$Xi		#
+	pclmulqdq	\$0x00,$HK,$Xm
+	xorps		$Xh,$Xhn
+
+	movdqa		$Xi,$Xhi
+	pshufd		\$0b01001110,$Xi,$T1
+	pxor		$Xi,$T1
+
+	lea		0x40($inp),$inp
+	sub		\$0x40,$len
+	jnc		.Lmod4_loop
+
+.Ltail4x:
+	pclmulqdq	\$0x00,$Hkey4,$Xi
+	xorps		$Xm,$Xmn
+	pclmulqdq	\$0x11,$Hkey4,$Xhi
+	xorps		$Xln,$Xi
+	pclmulqdq	\$0x10,$HK,$T1
+	xorps		$Xhn,$Xhi
+	pxor		$Xi,$Xhi		# aggregated Karatsuba post-processing
+	pxor		$Xmn,$T1
+
+	pxor		$Xhi,$T1		#
+	pxor		$Xi,$Xhi
+
+	movdqa		$T1,$T2			#
+	psrldq		\$8,$T1
+	pslldq		\$8,$T2			#
+	pxor		$T1,$Xhi
+	pxor		$T2,$Xi			#
+___
+&reduction_alg9($Xhi,$Xi);
+$code.=<<___;
+	add		\$0x40,$len
+	jz		.Ldone
+	sub		\$0x10,$len
+	movdqu		0x20($Htbl),$HK
+.Lskip4x:
+___
+}
 $code.=<<___;
 	#######
 	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
 	#	[(H*Ii+1) + (H*Xi+1)] mod P =
 	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
 	#
 	movdqu		($inp),$T1		# Ii
-	movdqu		16($inp),$Xn		# Ii+1
+	movdqu		16($inp),$Xln		# Ii+1
 	pshufb		$T3,$T1
-	pshufb		$T3,$Xn
+	pshufb		$T3,$Xln
 	pxor		$T1,$Xi			# Ii+Xi
 ___
-&clmul64x64_T2	($Xhn,$Xn,$Hkey);	# H*Ii+1
 $code.=<<___;
-	movdqa		$Xi,$Xhi		#
-	pshufd		\$0b01001110,$Xi,$T1
-	pshufd		\$0b01001110,$Hkey2,$T2
+	movdqa		$Xln,$Xhn
+	pshufd		\$0b01001110,$Xln,$Xmn
+	pxor		$Xln,$Xmn
+	pclmulqdq	\$0x00,$Hkey,$Xln
+	pclmulqdq	\$0x11,$Hkey,$Xhn
+	pclmulqdq	\$0x00,$HK,$Xmn
+
+	movdqa		$Xi,$Xhi
+	pshufd		\$0b01001110,$Xi,$T1	#
 	pxor		$Xi,$T1			#
-	pxor		$Hkey2,$T2

 	lea		32($inp),$inp		# i+=2
 	sub		\$0x20,$len
 	jbe		.Leven_tail
+	jmp		.Lmod_loop

+.align	32
 .Lmod_loop:
 ___
-&clmul64x64_T2	($Xhi,$Xi,$Hkey2,1);	# H^2*(Ii+Xi)
 $code.=<<___;
-	movdqu		($inp),$T1		# Ii
-	pxor		$Xn,$Xi			# (H*Ii+1) + H^2*(Ii+Xi)
+	pclmulqdq	\$0x00,$Hkey2,$Xi
+	pclmulqdq	\$0x11,$Hkey2,$Xhi
+	movdqu		($inp),$T2		# Ii
+	pclmulqdq	\$0x10,$HK,$T1
+	pshufb		$T3,$T2
+	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
+	movdqu		16($inp),$Xln		# Ii+1
 	pxor		$Xhn,$Xhi

-	movdqu		16($inp),$Xn		# Ii+1
-	pshufb		$T3,$T1
-	pshufb		$T3,$Xn
+	pxor		$Xi,$Xmn		# aggregated Karatsuba post-processing
+	pxor		$Xhi,$Xmn
+	pxor		$T2,$Xhi		# "Ii+Xi", consume early
+	pxor		$Xmn,$T1
+	pshufb		$T3,$Xln
+	movdqa		$T1,$T2			#
+	psrldq		\$8,$T1
+	pslldq		\$8,$T2			#
+	pxor		$T1,$Xhi
+	pxor		$T2,$Xi			#

-	movdqa		$Xn,$Xhn		#
-	pshufd		\$0b01001110,$Xn,$T1n
-	pshufd		\$0b01001110,$Hkey,$T2n
-	pxor		$Xn,$T1n		#
-	pxor		$Hkey,$T2n
-	pxor		$T1,$Xhi		# "Ii+Xi", consume early
+	movdqa		$Xln,$Xhn		#
+	pshufd		\$0b01001110,$Xln,$Xmn
+	pxor		$Xln,$Xmn		#

-	movdqa		$Xi,$T1			# 1st phase
+	movdqa		$Xi,$T2			# 1st phase
+	movdqa		$Xi,$T1
+	psllq		\$5,$Xi
+	pclmulqdq	\$0x00,$Hkey,$Xln	#######
+	pxor		$Xi,$T1			#
 	psllq		\$1,$Xi
 	pxor		$T1,$Xi			#
-	psllq		\$5,$Xi			#
-	pxor		$T1,$Xi			#
-	pclmulqdq	\$0x00,$Hkey,$Xn	#######
 	psllq		\$57,$Xi		#
-	movdqa		$Xi,$T2			#
+	movdqa		$Xi,$T1			#
 	pslldq		\$8,$Xi
-	psrldq		\$8,$T2			#
-	pxor		$T1,$Xi
-	pxor		$T2,$Xhi		#
+	psrldq		\$8,$T1			#
+	pxor		$T2,$Xi
+	pxor		$T1,$Xhi		#
 	pclmulqdq	\$0x11,$Hkey,$Xhn	#######

 	movdqa		$Xi,$T2			# 2nd phase
 	psrlq		\$1,$Xi
+	pxor		$T2,$Xhi		#
+	pxor		$Xi,$T2
 	psrlq		\$5,$Xi
 	pxor		$T2,$Xi			#
 	psrlq		\$1,$Xi			#
-	pxor		$T2,$Xi			#
-	pxor		$Xhi,$T2
-	psrlq		\$1,$Xi			#
-	pxor		$T2,$Xi			#
+	pclmulqdq	\$0x00,$HK,$Xmn		#######
+	pxor		$Xhi,$Xi		#
-	pclmulqdq	\$0x00,$T2n,$T1n	#######

-	movdqa		$Xi,$Xhi		#
-	pshufd		\$0b01001110,$Xi,$T1
-	pshufd		\$0b01001110,$Hkey2,$T2
+	movdqa		$Xi,$Xhi
+	pshufd		\$0b01001110,$Xi,$T1	#
 	pxor		$Xi,$T1			#
-	pxor		$Hkey2,$T2

-	pxor		$Xn,$T1n		#
-	pxor		$Xhn,$T1n		#
-	movdqa		$T1n,$T2n		#
-	psrldq		\$8,$T1n
-	pslldq		\$8,$T2n		#
-	pxor		$T1n,$Xhn
-	pxor		$T2n,$Xn		#

 	lea		32($inp),$inp
 	sub		\$0x20,$len
 	ja		.Lmod_loop

 .Leven_tail:
 ___
-&clmul64x64_T2	($Xhi,$Xi,$Hkey2,1);	# H^2*(Ii+Xi)
 $code.=<<___;
-	pxor		$Xn,$Xi			# (H*Ii+1) + H^2*(Ii+Xi)
+	pclmulqdq	\$0x00,$Hkey2,$Xi
+	pclmulqdq	\$0x11,$Hkey2,$Xhi
+	pclmulqdq	\$0x10,$HK,$T1
+	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
 	pxor		$Xhn,$Xhi

+	pxor		$Xi,$Xmn
+	pxor		$Xhi,$Xmn
+	pxor		$Xmn,$T1
+	movdqa		$T1,$T2			#
+	psrldq		\$8,$T1
+	pslldq		\$8,$T2			#
+	pxor		$T1,$Xhi
+	pxor		$T2,$Xi			#
 ___
 &reduction_alg9	($Xhi,$Xi);
 $code.=<<___;
...
...
@@ -601,7 +873,7 @@ $code.=<<___;
 	pshufb		$T3,$T1
 	pxor		$T1,$Xi			# Ii+Xi
 ___
-&clmul64x64_T2	($Xhi,$Xi,$Hkey);	# H*(Ii+Xi)
+&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H*(Ii+Xi)
 &reduction_alg9	($Xhi,$Xi);
 $code.=<<___;
 .Ldone:
...
...
@@ -614,7 +886,12 @@ $code.=<<___ if ($win64);
 	movaps	0x20(%rsp),%xmm8
 	movaps	0x30(%rsp),%xmm9
 	movaps	0x40(%rsp),%xmm10
-	add	\$0x58,%rsp
+	movaps	0x50(%rsp),%xmm11
+	movaps	0x60(%rsp),%xmm12
+	movaps	0x70(%rsp),%xmm13
+	movaps	0x80(%rsp),%xmm14
+	movaps	0x90(%rsp),%xmm15
+	lea	0xa8(%rsp),%rsp
 ___
 $code.=<<___;
 	ret
...
...
@@ -629,6 +906,10 @@ $code.=<<___;
 .byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
 .L0x1c2_polynomial:
 .byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.L7_mask:
+.long	7,0,7,0
+.L7_mask_poly:
+.long	7,0,`0xE1<<1`,0
 .align	64
 .type	.Lrem_4bit,\@object
 .Lrem_4bit:
...
@@ -791,13 +1072,18 @@ se_handler:
 	.rva	se_handler
 	.rva	.Lghash_prologue,.Lghash_epilogue	# HandlerData
 .LSEH_info_gcm_ghash_clmul:
-	.byte	0x01,0x1f,0x0b,0x00
-	.byte	0x1f,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10
-	.byte	0x19,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9
-	.byte	0x13,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8
-	.byte	0x0d,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7
-	.byte	0x08,0x68,0x00,0x00	#movaps (rsp),xmm6
-	.byte	0x04,0xa2,0x00,0x00	#sub rsp,0x58
+	.byte	0x01,0x33,0x16,0x00
+	.byte	0x33,0xf8,0x09,0x00	#movaps 0x90(rsp),xmm15
+	.byte	0x2e,0xe8,0x08,0x00	#movaps 0x80(rsp),xmm14
+	.byte	0x29,0xd8,0x07,0x00	#movaps 0x70(rsp),xmm13
+	.byte	0x24,0xc8,0x06,0x00	#movaps 0x60(rsp),xmm12
+	.byte	0x1f,0xb8,0x05,0x00	#movaps 0x50(rsp),xmm11
+	.byte	0x1a,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10
+	.byte	0x15,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9
+	.byte	0x10,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8
+	.byte	0x0c,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7
+	.byte	0x08,0x68,0x00,0x00	#movaps 0x00(rsp),xmm6
+	.byte	0x04,0x01,0x15,0x00	#sub 0xa8,rsp
 ___
 }
...
...
crypto/modes/gcm128.c
...
...
@@ -1703,6 +1703,21 @@ static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0
 			0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
 	T18[]=  {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,
 		 0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};

+/* Test Case 19 */
+#define K19 K1
+#define P19 P1
+#define IV19 IV1
+#define C19 C1
+static const u8 A19[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,
+			0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,
+			0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,
+			0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,
+			0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55,
+			0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,
+			0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
+			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,
+			0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
+			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,
+			0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
+			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,
+			0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
+	T19[]=  {0x5f,0xea,0x79,0x3a,0x2d,0x6f,0x97,0x4d,
+		 0x37,0xe6,0x8e,0x0c,0xb8,0xff,0x94,0x92};
+
 #define TEST_CASE(n)	do {					\
 	u8 out[sizeof(P##n)];					\
 	AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);		\
...
...
@@ -1747,6 +1762,7 @@ int main()
 	TEST_CASE(16);
 	TEST_CASE(17);
 	TEST_CASE(18);
+	TEST_CASE(19);

 #ifdef OPENSSL_CPUID_OBJ
 	{
...
...