OpenHarmony / Third Party Openssl

Commit 1da5d302
Authored Mar 24, 2013 by Andy Polyakov

ghash-x86_64.pl: add AVX code path.
Parent: 1bc4d009

Showing 2 changed files with 692 additions and 9 deletions:

    crypto/modes/asm/ghash-x86_64.pl    +647  -6
    crypto/modes/gcm128.c               +45   -3

crypto/modes/asm/ghash-x86_64.pl
@@ -64,6 +64,18 @@
# Ivy Bridge 1.79(+8%)
# Bulldozer	1.52(+25%)
#
# March 2013
#
# ... 8x aggregate factor AVX code path is using reduction algorithm
# suggested by Shay Gueron[1]. Even though contemporary AVX-capable
# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
# sub-optimally in comparison to above mentioned version. But thanks
# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we know that
# it will perform better on upcoming Haswell processor. [Exact
# performance numbers to be added at launch.]
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
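The "8x aggregate factor" above means that eight ciphertext blocks are folded against the precomputed powers H^1..H^8 and reduced once, instead of multiplying by H and reducing after every block. The plain-C sketch below illustrates only that identity; it is not part of the patch, and the names blk128, gf128_mul and ghash_8x are invented for this illustration (gf128_mul is the bit-serial multiply specified in NIST SP 800-38D).

/* Reference sketch only -- NOT part of this commit. */
#include <stdint.h>

typedef struct { uint64_t hi, lo; } blk128;     /* one 128-bit GHASH block, big-endian halves */

/* Bit-serial GF(2^128) multiply as specified in NIST SP 800-38D. */
static blk128 gf128_mul(blk128 X, blk128 Y)
{
    blk128 Z = { 0, 0 }, V = Y;
    for (int i = 0; i < 128; i++) {
        uint64_t bit = (i < 64) ? (X.hi >> (63 - i)) & 1 : (X.lo >> (127 - i)) & 1;
        if (bit) { Z.hi ^= V.hi; Z.lo ^= V.lo; }
        uint64_t carry = V.lo & 1;              /* V >>= 1, then conditionally fold in R */
        V.lo = (V.lo >> 1) | (V.hi << 63);
        V.hi >>= 1;
        if (carry) V.hi ^= 0xe100000000000000ULL;
    }
    return Z;
}

/* Eight blocks folded against H^1..H^8, equivalent to eight per-block steps:
 *   X' = (X ^ C[0])*H^8 ^ C[1]*H^7 ^ ... ^ C[7]*H
 */
static blk128 ghash_8x(blk128 X, const blk128 C[8], const blk128 Hpow[8]) /* Hpow[i] = H^(i+1) */
{
    blk128 acc = { 0, 0 };
    for (int i = 0; i < 8; i++) {
        blk128 t = C[i];
        if (i == 0) { t.hi ^= X.hi; t.lo ^= X.lo; }
        blk128 p = gf128_mul(t, Hpow[7 - i]);   /* C[i] * H^(8-i) */
        acc.hi ^= p.hi;
        acc.lo ^= p.lo;
    }
    return acc;
}

gcm_init_avx below precomputes exactly these powers (interleaved with their Karatsuba-folded halves) into $Htbl so that the 8x loop in gcm_ghash_avx can consume them.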
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

...
@@ -75,6 +87,21 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
($xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

...
@@ -442,12 +469,22 @@ ___
}

{ my ($Htbl,$Xip)=@_4args;
  my $HK="%xmm6";

$code.=<<___;
.globl	gcm_init_clmul
.type	gcm_init_clmul,\@abi-omnipotent
.align	16
gcm_init_clmul:
.L_init_clmul:
___
$code.=<<___ if ($win64);
.LSEH_begin_gcm_init_clmul:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x83,0xec,0x18		#sub	$0x18,%rsp
	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
___
$code.=<<___;
	movdqu	($Xip),$Hkey
	pshufd	\$0b01001110,$Hkey,$Hkey	# dword swap

...
@@ -466,9 +503,11 @@ gcm_init_clmul:
	pxor	$T3,$Hkey		# if(carry) H^=0x1c2_polynomial

	# calculate H^2
	pshufd	\$0b01001110,$Hkey,$HK
	movdqa	$Hkey,$Xi
	pxor	$Hkey,$HK
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey);
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	pshufd	\$0b01001110,$Hkey,$T1

...
@@ -481,12 +520,12 @@ $code.=<<___;
	movdqu	$T2,0x20($Htbl)		# save Karatsuba "salt"
___
if ($do4xaggr) {
	&clmul64x64_T2	($Xhi,$Xi,$Hkey);	# H^3
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^3
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	movdqa	$Xi,$T3
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey);	# H^4
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^4
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	pshufd	\$0b01001110,$T3,$T1

...
@@ -495,10 +534,15 @@ $code.=<<___;
	movdqu	$T3,0x30($Htbl)		# save H^3
	pxor	$Xi,$T2			# Karatsuba pre-processing
	movdqu	$Xi,0x40($Htbl)		# save H^4
	palignr	\$8,$T1,$T2		# low part is H.lo^H.hi...
	palignr	\$8,$T1,$T2		# low part is H^3.lo^H^3.hi...
	movdqu	$T2,0x50($Htbl)		# save Karatsuba "salt"
___
}
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	lea	0x18(%rsp),%rsp
.LSEH_end_gcm_init_clmul:
___
$code.=<<___;
	ret
.size	gcm_init_clmul,.-gcm_init_clmul

...
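The stores above give gcm_init_clmul's 4x-aggregated table its fixed layout; gcm_init_avx later in this patch extends the same pattern up to H^8. A small mock-up of the offsets, with invented names, purely for orientation (the 0x00/0x10 slots are inferred, the rest follow the comments above):

/* Orientation only -- offsets of the 16-byte slots written by gcm_init_clmul. */
enum {
    HTBL_H1     = 0x00,   /* H                                     */
    HTBL_H2     = 0x10,   /* H^2                                   */
    HTBL_SALT12 = 0x20,   /* H.lo^H.hi | H^2.lo^H^2.hi ("salt")    */
    HTBL_H3     = 0x30,   /* H^3                                   */
    HTBL_H4     = 0x40,   /* H^4                                   */
    HTBL_SALT34 = 0x50    /* H^3.lo^H^3.hi | H^4.lo^H^4.hi ("salt") */
};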
@@ -512,6 +556,7 @@ $code.=<<___;
.type gcm_gmult_clmul,\@abi-omnipotent
.align 16
gcm_gmult_clmul:
.L_gmult_clmul:
movdqu ($Xip),$Xi
movdqa .Lbswap_mask(%rip),$T3
movdqu ($Htbl),$Hkey
...
@@ -559,6 +604,7 @@ $code.=<<___;
.type gcm_ghash_clmul,\@abi-omnipotent
.align 32
gcm_ghash_clmul:
.L_ghash_clmul:
___
$code.=<<___ if ($win64);
	lea	-0x88(%rsp),%rax

...
@@ -893,14 +939,591 @@ $code.=<<___ if ($win64);
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	lea	0xa8(%rsp),%rsp
.LSEH_end_gcm_ghash_clmul:
___
$code.=<<___;
	ret
.LSEH_end_gcm_ghash_clmul:
.size	gcm_ghash_clmul,.-gcm_ghash_clmul
___
}

$code.=<<___;
.globl	gcm_init_avx
.type	gcm_init_avx,\@abi-omnipotent
.align	32
gcm_init_avx:
___
if ($avx) {
my ($Htbl,$Xip)=@_4args;
my $HK="%xmm6";

$code.=<<___ if ($win64);
.LSEH_begin_gcm_init_avx:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x83,0xec,0x18		#sub	$0x18,%rsp
	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
___
$code.=<<___;
vzeroupper
vmovdqu ($Xip),$Hkey
vpshufd \$0b01001110,$Hkey,$Hkey # dword swap
# <<1 twist
vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
vpsrlq \$63,$Hkey,$T1
vpsllq \$1,$Hkey,$Hkey
vpxor $T3,$T3,$T3 #
vpcmpgtd $T2,$T3,$T3 # broadcast carry bit
vpslldq \$8,$T1,$T1
vpor $T1,$Hkey,$Hkey # H<<=1
# magic reduction
vpand .L0x1c2_polynomial(%rip),$T3,$T3
vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial
vpunpckhqdq $Hkey,$Hkey,$HK
vmovdqa $Hkey,$Xi
vpxor $Hkey,$HK,$HK
mov \$4,%r10 # up to H^8
jmp .Linit_start_avx
___
sub clmul64x64_avx {
my ($Xhi,$Xi,$Hkey,$HK)=@_;

if (!defined($HK)) {	$HK = $T2;
$code.=<<___;
	vpunpckhqdq	$Xi,$Xi,$T1
	vpunpckhqdq	$Hkey,$Hkey,$T2
	vpxor		$Xi,$T1,$T1		#
	vpxor		$Hkey,$T2,$T2
___
} else {
$code.=<<___;
	vpunpckhqdq	$Xi,$Xi,$T1
	vpxor		$Xi,$T1,$T1		#
___
}
$code.=<<___;
	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xhi	#######
	vpclmulqdq	\$0x00,$Hkey,$Xi,$Xi	#######
	vpclmulqdq	\$0x00,$HK,$T1,$T1	#######
	vpxor		$Xi,$Xhi,$T2		#
	vpxor		$T2,$T1,$T1		#

	vpslldq		\$8,$T1,$T2		#
	vpsrldq		\$8,$T1,$T1
	vpxor		$T2,$Xi,$Xi		#
	vpxor		$T1,$Xhi,$Xhi
___
}

sub reduction_avx {
my ($Xhi,$Xi) = @_;

$code.=<<___;
	vpsllq		\$57,$Xi,$T1		# 1st phase
	vpsllq		\$62,$Xi,$T2
	vpxor		$T1,$T2,$T2		#
	vpsllq		\$63,$Xi,$T1
	vpxor		$T1,$T2,$T2		#
	vpslldq		\$8,$T2,$T1		#
	vpsrldq		\$8,$T2,$T2
	vpxor		$T1,$Xi,$Xi		#
	vpxor		$T2,$Xhi,$Xhi

	vpsrlq		\$1,$Xi,$T2		# 2nd phase
	vpxor		$Xi,$Xhi,$Xhi
	vpxor		$T2,$Xi,$Xi		#
	vpsrlq		\$5,$T2,$T2
	vpxor		$T2,$Xi,$Xi		#
	vpsrlq		\$1,$Xi,$Xi		#
	vpxor		$Xhi,$Xi,$Xi		#
___
}
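The three vpclmulqdq instructions per block in clmul64x64_avx form one Karatsuba carry-less 128x128-bit multiply: lo = a.lo*b.lo, hi = a.hi*b.hi, and a middle term derived from (a.lo^a.hi)*(b.lo^b.hi). A rough C intrinsics sketch of the same arithmetic follows; karatsuba_clmul128 is an invented name, and the sketch tracks the register choreography above only loosely, not line for line.

/* Illustration only -- not the code generated by this file. */
#include <wmmintrin.h>          /* _mm_clmulepi64_si128 (PCLMULQDQ) */

static void karatsuba_clmul128(__m128i a, __m128i b, __m128i *hi, __m128i *lo)
{
    /* a.lo^a.hi and b.lo^b.hi in the low 64 bits (cf. vpunpckhqdq/vpxor above) */
    __m128i a_fold = _mm_xor_si128(a, _mm_shuffle_epi32(a, 0x4e));
    __m128i b_fold = _mm_xor_si128(b, _mm_shuffle_epi32(b, 0x4e));

    __m128i t_lo  = _mm_clmulepi64_si128(a, b, 0x00);          /* a.lo * b.lo */
    __m128i t_hi  = _mm_clmulepi64_si128(a, b, 0x11);          /* a.hi * b.hi */
    __m128i t_mid = _mm_clmulepi64_si128(a_fold, b_fold, 0x00);
    t_mid = _mm_xor_si128(t_mid, _mm_xor_si128(t_lo, t_hi));   /* Karatsuba middle term */

    /* fold the middle term into the 256-bit product (hi:lo), still unreduced */
    *lo = _mm_xor_si128(t_lo, _mm_slli_si128(t_mid, 8));
    *hi = _mm_xor_si128(t_hi, _mm_srli_si128(t_mid, 8));
}

reduction_avx then brings that 256-bit product back into GF(2^128) with the two-phase shift-and-xor reduction, playing the same role as reduction_alg9 in the CLMUL path.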
$code.=<<___;
.align	32
.Linit_loop_avx:
	vpalignr	\$8,$T1,$T2,$T3		# low part is H.lo^H.hi...
	vmovdqu		$T3,-0x10($Htbl)	# save Karatsuba "salt"
___
	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^3,5,7
	&reduction_avx	($Xhi,$Xi);
$code.=<<___;
.Linit_start_avx:
	vmovdqa		$Xi,$T3
___
	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^2,4,6,8
	&reduction_avx	($Xhi,$Xi);
$code.=<<___;
vpshufd \$0b01001110,$T3,$T1
vpshufd \$0b01001110,$Xi,$T2
vpxor $T3,$T1,$T1 # Karatsuba pre-processing
vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7
vpxor $Xi,$T2,$T2 # Karatsuba pre-processing
vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8
lea 0x30($Htbl),$Htbl
sub \$1,%r10
jnz .Linit_loop_avx
vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped
vmovdqu $T3,-0x10($Htbl)
vzeroupper
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	lea	0x18(%rsp),%rsp
.LSEH_end_gcm_init_avx:
___
$code.=<<___;
	ret
.size	gcm_init_avx,.-gcm_init_avx
___
} else {
$code.=<<___;
	jmp	.L_init_clmul
.size	gcm_init_avx,.-gcm_init_avx
___
}

$code.=<<___;
.globl gcm_gmult_avx
.type gcm_gmult_avx,\@abi-omnipotent
.align 32
gcm_gmult_avx:
jmp .L_gmult_clmul
.size gcm_gmult_avx,.-gcm_gmult_avx
___
$code.=<<___;
.globl gcm_ghash_avx
.type gcm_ghash_avx,\@abi-omnipotent
.align 32
gcm_ghash_avx:
___
if ($avx) {
my ($Xip,$Htbl,$inp,$len)=@_4args;
my ($Xlo,$Xhi,$Xmi,
    $Zlo,$Zhi,$Zmi,
    $Hkey,$HK,$T1,$T2,
    $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));

$code.=<<___ if ($win64);
	lea	-0x88(%rsp),%rax
.LSEH_begin_gcm_ghash_avx:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax),%rsp
	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6,-0x20(%rax)
	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7,-0x10(%rax)
	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8,0(%rax)
	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9,0x10(%rax)
	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10,0x20(%rax)
	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11,0x30(%rax)
	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12,0x40(%rax)
	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13,0x50(%rax)
	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14,0x60(%rax)
	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15,0x70(%rax)
___
$code.=<<___;
vzeroupper
vmovdqu ($Xip),$Xi # load $Xi
lea .L0x1c2_polynomial(%rip),%r10
lea 0x40($Htbl),$Htbl # size optimization
vmovdqu .Lbswap_mask(%rip),$bswap
vpshufb $bswap,$Xi,$Xi
cmp \$0x80,$len
jb .Lshort_avx
sub \$0x80,$len
vmovdqu 0x70($inp),$Ii # I[7]
vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
vpshufb $bswap,$Ii,$Ii
vmovdqu 0x20-0x40($Htbl),$HK
vpunpckhqdq $Ii,$Ii,$T2
vmovdqu 0x60($inp),$Ij # I[6]
vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
vpxor $Ii,$T2,$T2
vpshufb $bswap,$Ij,$Ij
vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
vpunpckhqdq $Ij,$Ij,$T1
vmovdqu 0x50($inp),$Ii # I[5]
vpclmulqdq \$0x00,$HK,$T2,$Xmi
vpxor $Ij,$T1,$T1
vpshufb $bswap,$Ii,$Ii
vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
vpunpckhqdq $Ii,$Ii,$T2
vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
vpxor $Ii,$T2,$T2
vmovdqu 0x40($inp),$Ij # I[4]
vpclmulqdq \$0x10,$HK,$T1,$Zmi
vmovdqu 0x50-0x40($Htbl),$HK
vpshufb $bswap,$Ij,$Ij
vpxor $Xlo,$Zlo,$Zlo
vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
vpxor $Xhi,$Zhi,$Zhi
vpunpckhqdq $Ij,$Ij,$T1
vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
vpxor $Xmi,$Zmi,$Zmi
vpclmulqdq \$0x00,$HK,$T2,$Xmi
vpxor $Ij,$T1,$T1
vmovdqu 0x30($inp),$Ii # I[3]
vpxor $Zlo,$Xlo,$Xlo
vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
vpxor $Zhi,$Xhi,$Xhi
vpshufb $bswap,$Ii,$Ii
vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
vpxor $Zmi,$Xmi,$Xmi
vpunpckhqdq $Ii,$Ii,$T2
vpclmulqdq \$0x10,$HK,$T1,$Zmi
vmovdqu 0x80-0x40($Htbl),$HK
vpxor $Ii,$T2,$T2
vmovdqu 0x20($inp),$Ij # I[2]
vpxor $Xlo,$Zlo,$Zlo
vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
vpxor $Xhi,$Zhi,$Zhi
vpshufb $bswap,$Ij,$Ij
vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
vpxor $Xmi,$Zmi,$Zmi
vpunpckhqdq $Ij,$Ij,$T1
vpclmulqdq \$0x00,$HK,$T2,$Xmi
vpxor $Ij,$T1,$T1
vmovdqu 0x10($inp),$Ii # I[1]
vpxor $Zlo,$Xlo,$Xlo
vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
vpxor $Zhi,$Xhi,$Xhi
vpshufb $bswap,$Ii,$Ii
vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
vpxor $Zmi,$Xmi,$Xmi
vpunpckhqdq $Ii,$Ii,$T2
vpclmulqdq \$0x10,$HK,$T1,$Zmi
vmovdqu 0xb0-0x40($Htbl),$HK
vpxor $Ii,$T2,$T2
vmovdqu ($inp),$Ij # I[0]
vpxor $Xlo,$Zlo,$Zlo
vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
vpxor $Xhi,$Zhi,$Zhi
vpshufb $bswap,$Ij,$Ij
vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
vpxor $Xmi,$Zmi,$Zmi
vpclmulqdq \$0x10,$HK,$T2,$Xmi
lea 0x80($inp),$inp
cmp \$0x80,$len
jb .Ltail_avx
vpxor $Xi,$Ij,$Ij # accumulate $Xi
sub \$0x80,$len
jmp .Loop8x_avx
.align 32
.Loop8x_avx:
vpunpckhqdq $Ij,$Ij,$T1
vmovdqu 0x70($inp),$Ii # I[7]
vpxor $Xlo,$Zlo,$Zlo
vpxor $Ij,$T1,$T1
vpclmulqdq \$0x00,$Hkey,$Ij,$Xi
vpshufb $bswap,$Ii,$Ii
vpxor $Xhi,$Zhi,$Zhi
vpclmulqdq \$0x11,$Hkey,$Ij,$Xo
vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
vpunpckhqdq $Ii,$Ii,$T2
vpxor $Xmi,$Zmi,$Zmi
vpclmulqdq \$0x00,$HK,$T1,$Tred
vmovdqu 0x20-0x40($Htbl),$HK
vpxor $Ii,$T2,$T2
vmovdqu 0x60($inp),$Ij # I[6]
vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
vpxor $Zlo,$Xi,$Xi # collect result
vpshufb $bswap,$Ij,$Ij
vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
vxorps $Zhi,$Xo,$Xo
vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
vpunpckhqdq $Ij,$Ij,$T1
vpclmulqdq \$0x00,$HK, $T2,$Xmi
vpxor $Zmi,$Tred,$Tred
vxorps $Ij,$T1,$T1
vmovdqu 0x50($inp),$Ii # I[5]
vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing
vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
vpxor $Xo,$Tred,$Tred
vpslldq \$8,$Tred,$T2
vpxor $Xlo,$Zlo,$Zlo
vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
vpsrldq \$8,$Tred,$Tred
vpxor $T2, $Xi, $Xi
vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
vpshufb $bswap,$Ii,$Ii
vxorps $Tred,$Xo, $Xo
vpxor $Xhi,$Zhi,$Zhi
vpunpckhqdq $Ii,$Ii,$T2
vpclmulqdq \$0x10,$HK, $T1,$Zmi
vmovdqu 0x50-0x40($Htbl),$HK
vpxor $Ii,$T2,$T2
vpxor $Xmi,$Zmi,$Zmi
vmovdqu 0x40($inp),$Ij # I[4]
vpalignr \$8,$Xi,$Xi,$Tred # 1st phase
vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
vpshufb $bswap,$Ij,$Ij
vpxor $Zlo,$Xlo,$Xlo
vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
vpunpckhqdq $Ij,$Ij,$T1
vpxor $Zhi,$Xhi,$Xhi
vpclmulqdq \$0x00,$HK, $T2,$Xmi
vxorps $Ij,$T1,$T1
vpxor $Zmi,$Xmi,$Xmi
vmovdqu 0x30($inp),$Ii # I[3]
vpclmulqdq \$0x10,(%r10),$Xi,$Xi
vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
vpshufb $bswap,$Ii,$Ii
vpxor $Xlo,$Zlo,$Zlo
vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
vpunpckhqdq $Ii,$Ii,$T2
vpxor $Xhi,$Zhi,$Zhi
vpclmulqdq \$0x10,$HK, $T1,$Zmi
vmovdqu 0x80-0x40($Htbl),$HK
vpxor $Ii,$T2,$T2
vpxor $Xmi,$Zmi,$Zmi
vmovdqu 0x20($inp),$Ij # I[2]
vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
vpshufb $bswap,$Ij,$Ij
vpxor $Zlo,$Xlo,$Xlo
vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
vpunpckhqdq $Ij,$Ij,$T1
vpxor $Zhi,$Xhi,$Xhi
vpclmulqdq \$0x00,$HK, $T2,$Xmi
vpxor $Ij,$T1,$T1
vpxor $Zmi,$Xmi,$Xmi
vxorps $Tred,$Xi,$Xi
vmovdqu 0x10($inp),$Ii # I[1]
vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase
vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
vpshufb $bswap,$Ii,$Ii
vpxor $Xlo,$Zlo,$Zlo
vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
vpclmulqdq \$0x10,(%r10),$Xi,$Xi
vxorps $Xo,$Tred,$Tred
vpunpckhqdq $Ii,$Ii,$T2
vpxor $Xhi,$Zhi,$Zhi
vpclmulqdq \$0x10,$HK, $T1,$Zmi
vmovdqu 0xb0-0x40($Htbl),$HK
vpxor $Ii,$T2,$T2
vpxor $Xmi,$Zmi,$Zmi
vmovdqu ($inp),$Ij # I[0]
vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
vpshufb $bswap,$Ij,$Ij
vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
vpxor $Tred,$Ij,$Ij
vpclmulqdq \$0x10,$HK, $T2,$Xmi
vpxor $Xi,$Ij,$Ij # accumulate $Xi
lea 0x80($inp),$inp
sub \$0x80,$len
jnc .Loop8x_avx
add \$0x80,$len
jmp .Ltail_no_xor_avx
.align 32
.Lshort_avx:
vmovdqu -0x10($inp,$len),$Ii # very last word
lea ($inp,$len),$inp
vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
vmovdqu 0x20-0x40($Htbl),$HK
vpshufb $bswap,$Ii,$Ij
vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo,
vmovdqa $Xhi,$Zhi # $Zhi and
vmovdqa $Xmi,$Zmi # $Zmi
sub \$0x10,$len
jz .Ltail_avx
vpunpckhqdq $Ij,$Ij,$T1
vpxor $Xlo,$Zlo,$Zlo
vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
vpxor $Ij,$T1,$T1
vmovdqu -0x20($inp),$Ii
vpxor $Xhi,$Zhi,$Zhi
vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
vpshufb $bswap,$Ii,$Ij
vpxor $Xmi,$Zmi,$Zmi
vpclmulqdq \$0x00,$HK,$T1,$Xmi
vpsrldq \$8,$HK,$HK
sub \$0x10,$len
jz .Ltail_avx
vpunpckhqdq $Ij,$Ij,$T1
vpxor $Xlo,$Zlo,$Zlo
vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
vpxor $Ij,$T1,$T1
vmovdqu -0x30($inp),$Ii
vpxor $Xhi,$Zhi,$Zhi
vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
vpshufb $bswap,$Ii,$Ij
vpxor $Xmi,$Zmi,$Zmi
vpclmulqdq \$0x00,$HK,$T1,$Xmi
vmovdqu 0x50-0x40($Htbl),$HK
sub \$0x10,$len
jz .Ltail_avx
vpunpckhqdq $Ij,$Ij,$T1
vpxor $Xlo,$Zlo,$Zlo
vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
vpxor $Ij,$T1,$T1
vmovdqu -0x40($inp),$Ii
vpxor $Xhi,$Zhi,$Zhi
vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
vpshufb $bswap,$Ii,$Ij
vpxor $Xmi,$Zmi,$Zmi
vpclmulqdq \$0x00,$HK,$T1,$Xmi
vpsrldq \$8,$HK,$HK
sub \$0x10,$len
jz .Ltail_avx
vpunpckhqdq $Ij,$Ij,$T1
vpxor $Xlo,$Zlo,$Zlo
vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
vpxor $Ij,$T1,$T1
vmovdqu -0x50($inp),$Ii
vpxor $Xhi,$Zhi,$Zhi
vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
vpshufb $bswap,$Ii,$Ij
vpxor $Xmi,$Zmi,$Zmi
vpclmulqdq \$0x00,$HK,$T1,$Xmi
vmovdqu 0x80-0x40($Htbl),$HK
sub \$0x10,$len
jz .Ltail_avx
vpunpckhqdq $Ij,$Ij,$T1
vpxor $Xlo,$Zlo,$Zlo
vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
vpxor $Ij,$T1,$T1
vmovdqu -0x60($inp),$Ii
vpxor $Xhi,$Zhi,$Zhi
vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
vpshufb $bswap,$Ii,$Ij
vpxor $Xmi,$Zmi,$Zmi
vpclmulqdq \$0x00,$HK,$T1,$Xmi
vpsrldq \$8,$HK,$HK
sub \$0x10,$len
jz .Ltail_avx
vpunpckhqdq $Ij,$Ij,$T1
vpxor $Xlo,$Zlo,$Zlo
vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
vpxor $Ij,$T1,$T1
vmovdqu -0x70($inp),$Ii
vpxor $Xhi,$Zhi,$Zhi
vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
vpshufb $bswap,$Ii,$Ij
vpxor $Xmi,$Zmi,$Zmi
vpclmulqdq \$0x00,$HK,$T1,$Xmi
vmovq 0xb8-0x40($Htbl),$HK
sub \$0x10,$len
jmp .Ltail_avx
.align 32
.Ltail_avx:
vpxor $Xi,$Ij,$Ij # accumulate $Xi
.Ltail_no_xor_avx:
vpunpckhqdq $Ij,$Ij,$T1
vpxor $Xlo,$Zlo,$Zlo
vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
vpxor $Ij,$T1,$T1
vpxor $Xhi,$Zhi,$Zhi
vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
vpxor $Xmi,$Zmi,$Zmi
vpclmulqdq \$0x00,$HK,$T1,$Xmi
vmovdqu (%r10),$Tred
vpxor $Xlo,$Zlo,$Xi
vpxor $Xhi,$Zhi,$Xo
vpxor $Xmi,$Zmi,$Zmi
vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing
vpxor $Xo, $Zmi,$Zmi
vpslldq \$8, $Zmi,$T2
vpsrldq \$8, $Zmi,$Zmi
vpxor $T2, $Xi, $Xi
vpxor $Zmi,$Xo, $Xo
vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase
vpalignr \$8,$Xi,$Xi,$Xi
vpxor $T2,$Xi,$Xi
vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase
vpalignr \$8,$Xi,$Xi,$Xi
vpxor $Xo,$Xi,$Xi
vpxor $T2,$Xi,$Xi
cmp \$0,$len
jne .Lshort_avx
vpshufb $bswap,$Xi,$Xi
vmovdqu $Xi,($Xip)
vzeroupper
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	lea	0xa8(%rsp),%rsp
.LSEH_end_gcm_ghash_avx:
___
$code.=<<___;
	ret
.size	gcm_ghash_avx,.-gcm_ghash_avx
___
} else {
$code.=<<___;
	jmp	.L_ghash_clmul
.size	gcm_ghash_avx,.-gcm_ghash_avx
___
}
$code.=<<___;
.align	64
.Lbswap_mask:

...
@@ -1058,10 +1681,24 @@ se_handler:
.rva .LSEH_end_gcm_ghash_4bit
.rva .LSEH_info_gcm_ghash_4bit
.rva .LSEH_begin_gcm_init_clmul
.rva .LSEH_end_gcm_init_clmul
.rva .LSEH_info_gcm_init_clmul
.rva .LSEH_begin_gcm_ghash_clmul
.rva .LSEH_end_gcm_ghash_clmul
.rva .LSEH_info_gcm_ghash_clmul
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_gcm_init_avx
	.rva	.LSEH_end_gcm_init_avx
	.rva	.LSEH_info_gcm_init_clmul
	.rva	.LSEH_begin_gcm_ghash_avx
	.rva	.LSEH_end_gcm_ghash_avx
	.rva	.LSEH_info_gcm_ghash_clmul
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_gcm_gmult_4bit:

...
@@ -1072,6 +1709,10 @@ se_handler:
.byte 9,0,0,0
.rva se_handler
.rva .Lghash_prologue,.Lghash_epilogue # HandlerData
.LSEH_info_gcm_init_clmul:
.byte 0x01,0x08,0x03,0x00
.byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
.byte 0x04,0x22,0x00,0x00 #sub rsp,0x18
.LSEH_info_gcm_ghash_clmul:
.byte 0x01,0x33,0x16,0x00
.byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15
...
@@ -1084,7 +1725,7 @@ se_handler:
.byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
.byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
.byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
.byte	0x04,0x01,0x15,0x00	#sub	0xa8,rsp
.byte	0x04,0x01,0x15,0x00	#sub	rsp,0xa8
___
}
...
crypto/modes/gcm128.c
@@ -658,6 +658,16 @@ void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

#if defined(__i386) || defined(__i386__)
# define gcm_init_avx	gcm_init_clmul
# define gcm_gmult_avx	gcm_gmult_clmul
# define gcm_ghash_avx	gcm_ghash_clmul
#else
void gcm_init_avx(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_avx(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_avx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#endif

# if defined(__i386) || defined(__i386__) || defined(_M_IX86)
#  define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);

...
@@ -726,9 +736,15 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
# if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
	if (OPENSSL_ia32cap_P[0]&(1<<24) &&	/* check FXSR bit */
	    OPENSSL_ia32cap_P[1]&(1<<1) ) {	/* check PCLMULQDQ bit */
		if (((OPENSSL_ia32cap_P[1]>>22)&0x41)==0x41) {	/* AVX+MOVBE */
			gcm_init_avx(ctx->Htable,ctx->H.u);
			ctx->gmult = gcm_gmult_avx;
			ctx->ghash = gcm_ghash_avx;
		} else {
			gcm_init_clmul(ctx->Htable,ctx->H.u);
			ctx->gmult = gcm_gmult_clmul;
			ctx->ghash = gcm_ghash_clmul;
		}
		return;
	}
# endif
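The new branch keys off two words of OPENSSL_ia32cap_P: word 0 bit 24 is the FXSR flag from CPUID.1:EDX, and word 1 mirrors CPUID.1:ECX, where bit 1 is PCLMULQDQ, bit 22 is MOVBE and bit 28 is AVX. Shifting word 1 right by 22 and masking with 0x41 therefore tests MOVBE and AVX together. A tiny standalone sketch of that mask (has_avx_movbe is a made-up helper, not an OpenSSL symbol):

#include <stdio.h>

/* Mirrors the test above: ((cap[1]>>22)&0x41)==0x41  <=>  MOVBE && AVX. */
static int has_avx_movbe(const unsigned int cap[2])
{
    return ((cap[1] >> 22) & 0x41) == 0x41;
}

int main(void)
{
    /* word 0: FXSR (bit 24); word 1: PCLMULQDQ (bit 1), MOVBE (bit 22), AVX (bit 28) */
    unsigned int cap[2] = { 1u << 24, (1u << 1) | (1u << 22) | (1u << 28) };
    printf("AVX+MOVBE: %d\n", has_avx_movbe(cap));   /* prints 1 */
    return 0;
}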
...
@@ -1718,6 +1734,31 @@ static const u8 A19[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0
		0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
	T19[]=  {0x5f,0xea,0x79,0x3a,0x2d,0x6f,0x97,0x4d,0x37,0xe6,0x8e,0x0c,0xb8,0xff,0x94,0x92};

/* Test Case 20 */
#define K20 K1
#define A20 A1
static const u8 IV20[64]={0xff,0xff,0xff,0xff},	/* this results in 0xff in counter LSB */
	P20[288],
	C20[]=  {0x56,0xb3,0x37,0x3c,0xa9,0xef,0x6e,0x4a,0x2b,0x64,0xfe,0x1e,0x9a,0x17,0xb6,0x14,
		 0x25,0xf1,0x0d,0x47,0xa7,0x5a,0x5f,0xce,0x13,0xef,0xc6,0xbc,0x78,0x4a,0xf2,0x4f,
		 0x41,0x41,0xbd,0xd4,0x8c,0xf7,0xc7,0x70,0x88,0x7a,0xfd,0x57,0x3c,0xca,0x54,0x18,
		 0xa9,0xae,0xff,0xcd,0x7c,0x5c,0xed,0xdf,0xc6,0xa7,0x83,0x97,0xb9,0xa8,0x5b,0x49,
		 0x9d,0xa5,0x58,0x25,0x72,0x67,0xca,0xab,0x2a,0xd0,0xb2,0x3c,0xa4,0x76,0xa5,0x3c,
		 0xb1,0x7f,0xb4,0x1c,0x4b,0x8b,0x47,0x5c,0xb4,0xf3,0xf7,0x16,0x50,0x94,0xc2,0x29,
		 0xc9,0xe8,0xc4,0xdc,0x0a,0x2a,0x5f,0xf1,0x90,0x3e,0x50,0x15,0x11,0x22,0x13,0x76,
		 0xa1,0xcd,0xb8,0x36,0x4c,0x50,0x61,0xa2,0x0c,0xae,0x74,0xbc,0x4a,0xcd,0x76,0xce,
		 0xb0,0xab,0xc9,0xfd,0x32,0x17,0xef,0x9f,0x8c,0x90,0xbe,0x40,0x2d,0xdf,0x6d,0x86,
		 0x97,0xf4,0xf8,0x80,0xdf,0xf1,0x5b,0xfb,0x7a,0x6b,0x28,0x24,0x1e,0xc8,0xfe,0x18,
		 0x3c,0x2d,0x59,0xe3,0xf9,0xdf,0xff,0x65,0x3c,0x71,0x26,0xf0,0xac,0xb9,0xe6,0x42,
		 0x11,0xf4,0x2b,0xae,0x12,0xaf,0x46,0x2b,0x10,0x70,0xbe,0xf1,0xab,0x5e,0x36,0x06,
		 0x87,0x2c,0xa1,0x0d,0xee,0x15,0xb3,0x24,0x9b,0x1a,0x1b,0x95,0x8f,0x23,0x13,0x4c,
		 0x4b,0xcc,0xb7,0xd0,0x32,0x00,0xbc,0xe4,0x20,0xa2,0xf8,0xeb,0x66,0xdc,0xf3,0x64,
		 0x4d,0x14,0x23,0xc1,0xb5,0x69,0x90,0x03,0xc1,0x3e,0xce,0xf4,0xbf,0x38,0xa3,0xb6,
		 0x0e,0xed,0xc3,0x40,0x33,0xba,0xc1,0x90,0x27,0x83,0xdc,0x6d,0x89,0xe2,0xe7,0x74,
		 0x18,0x8a,0x43,0x9c,0x7e,0xbc,0xc0,0x67,0x2d,0xbd,0xa4,0xdd,0xcf,0xb2,0x79,0x46,
		 0x13,0xb0,0xbe,0x41,0x31,0x5e,0xf7,0x78,0x70,0x8a,0x70,0xee,0x7d,0x75,0x16,0x5c},
	T20[]=  {0x8b,0x30,0x7f,0x6b,0x33,0x28,0x6d,0x0a,0xb0,0x26,0xa9,0xed,0x3f,0xe1,0xe8,0x5f};
#define TEST_CASE(n) do { \
u8 out[sizeof(P##n)]; \
AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key); \
...
@@ -1763,6 +1804,7 @@ int main()
	TEST_CASE(17);
	TEST_CASE(18);
	TEST_CASE(19);
	TEST_CASE(20);

#ifdef OPENSSL_CPUID_OBJ
	{

...