Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
OpenHarmony
Third Party Openssl
提交
619b9466
T
Third Party Openssl
项目概览
OpenHarmony
/
Third Party Openssl
1 年多 前同步成功
通知
10
Star
18
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
T
Third Party Openssl
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
619b9466
编写于
6月 11, 2014
作者:
A
Andy Polyakov
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add support for Intel SHA extension.
上级
fd2309aa
变更
8
展开全部
隐藏空白更改
内联
并排
Showing
8 changed file
with
2387 addition
and
52 deletion
+2387
-52
crypto/aes/asm/aesni-sha1-x86_64.pl
crypto/aes/asm/aesni-sha1-x86_64.pl
+228
-22
crypto/aes/asm/aesni-sha256-x86_64.pl
crypto/aes/asm/aesni-sha256-x86_64.pl
+322
-2
crypto/sha/asm/sha1-586.pl
crypto/sha/asm/sha1-586.pl
+120
-0
crypto/sha/asm/sha1-mb-x86_64.pl
crypto/sha/asm/sha1-mb-x86_64.pl
+509
-2
crypto/sha/asm/sha1-x86_64.pl
crypto/sha/asm/sha1-x86_64.pl
+199
-3
crypto/sha/asm/sha256-586.pl
crypto/sha/asm/sha256-586.pl
+150
-1
crypto/sha/asm/sha256-mb-x86_64.pl
crypto/sha/asm/sha256-mb-x86_64.pl
+624
-17
crypto/sha/asm/sha512-x86_64.pl
crypto/sha/asm/sha512-x86_64.pl
+235
-5
未找到文件。
crypto/aes/asm/aesni-sha1-x86_64.pl
浏览文件 @
619b9466
...
...
@@ -118,7 +118,9 @@ $code.=<<___;
aesni_cbc_sha1_enc:
# caller should check for SSSE3 and AES-NI bits
mov OPENSSL_ia32cap_P+0(%rip),%r10d
mov OPENSSL_ia32cap_P+4(%rip),%r11d
mov OPENSSL_ia32cap_P+4(%rip),%r11
bt \$61,%r11 # check SHA bit
jc aesni_cbc_sha1_enc_shaext
___
$code
.=<<
___
if
(
$avx
);
and
\
$
`
1<<28
`,
%r11d
# mask AVX bit
...
...
@@ -200,7 +202,7 @@ $code.=<<___;
mov $in0,%r12 # reassign arguments
mov $out,%r13
mov $len,%r14
mov $key,%r15
lea 112($key),%r15 # size optimization
movdqu ($ivp),$iv # load IV
mov $ivp,88(%rsp) # save $ivp
___
...
...
@@ -209,7 +211,7 @@ my $rounds="${ivp}d";
$code
.=
<<___;
shl \$6,$len
sub $in0,$out
mov 240($key),$rounds
mov 240
-112
($key),$rounds
add $inp,$len # end of input
lea K_XX_XX(%rip),$K_XX_XX
...
...
@@ -243,8 +245,8 @@ $code.=<<___;
psubd @Tx[1],@X[-3&7]
movdqa @X[-2&7],32(%rsp)
psubd @Tx[1],@X[-2&7]
movups
($key),$rndkey0
# $key[0]
movups 16($key),$rndkey[0] # forward reference
movups
-112($key),$rndkey0
# $key[0]
movups 16
-112
($key),$rndkey[0] # forward reference
jmp .Loop_ssse3
___
...
...
@@ -261,31 +263,31 @@ ___
___
$code
.=
<<___;
xorps $in,$iv
movups `32+16*$k-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
movups `32+16*$k`($key),$rndkey[1]
___
}
elsif
(
$k
==
9
)
{
$sn
++
;
$code
.=
<<___;
cmp \$11,$rounds
jb .Laesenclast$sn
movups `32+16*($k+0)`($key),$rndkey[1]
movups `32+16*($k+0)
-112
`($key),$rndkey[1]
aesenc $rndkey[0],$iv
movups `32+16*($k+1)`($key),$rndkey[0]
movups `32+16*($k+1)
-112
`($key),$rndkey[0]
aesenc $rndkey[1],$iv
je .Laesenclast$sn
movups `32+16*($k+2)`($key),$rndkey[1]
movups `32+16*($k+2)
-112
`($key),$rndkey[1]
aesenc $rndkey[0],$iv
movups `32+16*($k+3)`($key),$rndkey[0]
movups `32+16*($k+3)
-112
`($key),$rndkey[0]
aesenc $rndkey[1],$iv
.Laesenclast$sn:
aesenclast $rndkey[0],$iv
movups 16($key),$rndkey[1] # forward reference
movups 16
-112
($key),$rndkey[1] # forward reference
___
}
else
{
$code
.=
<<___;
movups `32+16*$k-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
movups `32+16*$k`($key),$rndkey[1]
___
}
$r
++
;
unshift
(
@rndkey
,
pop
(
@rndkey
));
...
...
@@ -1041,7 +1043,7 @@ $code.=<<___;
mov $in0,%r12 # reassign arguments
mov $out,%r13
mov $len,%r14
mov $key,%r15
lea 112($key),%r15 # size optimization
vmovdqu ($ivp),$iv # load IV
mov $ivp,88(%rsp) # save $ivp
___
...
...
@@ -1050,8 +1052,7 @@ my $rounds="${ivp}d";
$code
.=
<<___;
shl \$6,$len
sub $in0,$out
mov 240($key),$rounds
add \$112,$key # size optimization
mov 240-112($key),$rounds
add $inp,$len # end of input
lea K_XX_XX(%rip),$K_XX_XX
...
...
@@ -1651,11 +1652,180 @@ K_XX_XX:
.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
.asciz "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 64
___
{{{
(
$in0
,
$out
,
$len
,
$key
,
$ivp
,
$ctx
,
$inp
)
=
("
%rdi
","
%rsi
","
%rdx
","
%rcx
","
%r8
","
%r9
","
%r10
");
$rounds
=
"
%r11d
";
(
$iv
,
$in
,
$rndkey0
)
=
map
("
%xmm
$_
",(
2
,
14
,
15
));
@rndkey
=
("
%xmm0
","
%xmm1
");
$r
=
0
;
my
(
$BSWAP
,
$ABCD
,
$E
,
$E_
,
$ABCD_SAVE
,
$E_SAVE
)
=
map
("
%xmm
$_
",(
7
..
12
));
my
@MSG
=
map
("
%xmm
$_
",(
3
..
6
));
$code
.=
<<___;
.type aesni_cbc_sha1_enc_shaext,\@function,6
.align 32
aesni_cbc_sha1_enc_shaext:
mov `($win64?56:8)`(%rsp),$inp # load 7th argument
___
$code
.=<<
___
if
(
$win64
);
lea
`
-8-4*16
`(
%rsp
),
%rsp
movaps
%xmm6
,
-
8
-
10
*
16
(
%rax
)
movaps
%xmm7
,
-
8
-
9
*
16
(
%rax
)
movaps
%xmm8
,
-
8
-
8
*
16
(
%rax
)
movaps
%xmm9
,
-
8
-
7
*
16
(
%rax
)
movaps
%xmm10
,
-
8
-
6
*
16
(
%rax
)
movaps
%xmm11
,
-
8
-
5
*
16
(
%rax
)
movaps
%xmm12
,
-
8
-
4
*
16
(
%rax
)
movaps
%xmm13
,
-
8
-
3
*
16
(
%rax
)
movaps
%xmm14
,
-
8
-
2
*
16
(
%rax
)
movaps
%xmm15
,
-
8
-
1
*
16
(
%rax
)
.
Lprologue_shaext:
___
$code
.=
<<___;
movdqu ($ctx),$ABCD
movd 16($ctx),$E
movdqa K_XX_XX+0x50(%rip),$BSWAP # byte-n-word swap
mov 240($key),$rounds
sub $in0,$out
movups ($key),$rndkey0 # $key[0]
movups 16($key),$rndkey[0] # forward reference
lea 112($key),$key # size optimization
pshufd \$0b00011011,$ABCD,$ABCD # flip word order
pshufd \$0b00011011,$E,$E # flip word order
jmp .Loop_shaext
.align 16
.Loop_shaext:
___
&$aesenc
();
$code
.=
<<___;
movdqu ($inp),@MSG[0]
movdqa $E,$E_SAVE # offload $E
pshufb $BSWAP,@MSG[0]
movdqu 0x10($inp),@MSG[1]
movdqa $ABCD,$ABCD_SAVE # offload $ABCD
___
&$aesenc
();
$code
.=
<<___;
pshufb $BSWAP,@MSG[1]
paddd @MSG[0],$E
movdqu 0x20($inp),@MSG[2]
lea 0x40($inp),$inp
pxor $E_SAVE,@MSG[0] # black magic
___
&$aesenc
();
$code
.=
<<___;
pxor $E_SAVE,@MSG[0] # black magic
movdqa $ABCD,$E_
pshufb $BSWAP,@MSG[2]
sha1rnds4 \$0,$E,$ABCD # 0-3
sha1nexte @MSG[1],$E_
___
&$aesenc
();
$code
.=
<<___;
sha1msg1 @MSG[1],@MSG[0]
movdqu -0x10($inp),@MSG[3]
movdqa $ABCD,$E
pshufb $BSWAP,@MSG[3]
___
&$aesenc
();
$code
.=
<<___;
sha1rnds4 \$0,$E_,$ABCD # 4-7
sha1nexte @MSG[2],$E
pxor @MSG[2],@MSG[0]
sha1msg1 @MSG[2],@MSG[1]
___
&$aesenc
();
for
(
$i
=
2
;
$i
<
20
-
4
;
$i
++
)
{
$code
.=
<<___;
movdqa $ABCD,$E_
sha1rnds4 \$`int($i/5)`,$E,$ABCD # 8-11
sha1nexte @MSG[3],$E_
___
&$aesenc
();
$code
.=
<<___;
sha1msg2 @MSG[3],@MSG[0]
pxor @MSG[3],@MSG[1]
sha1msg1 @MSG[3],@MSG[2]
___
(
$E
,
$E_
)
=
(
$E_
,
$E
);
push
(
@MSG
,
shift
(
@MSG
));
&$aesenc
();
}
$code
.=
<<___;
movdqa $ABCD,$E_
sha1rnds4 \$3,$E,$ABCD # 64-67
sha1nexte @MSG[3],$E_
sha1msg2 @MSG[3],@MSG[0]
pxor @MSG[3],@MSG[1]
___
&$aesenc
();
$code
.=
<<___;
movdqa $ABCD,$E
sha1rnds4 \$3,$E_,$ABCD # 68-71
sha1nexte @MSG[0],$E
sha1msg2 @MSG[0],@MSG[1]
___
&$aesenc
();
$code
.=
<<___;
movdqa $E_SAVE,@MSG[0]
movdqa $ABCD,$E_
sha1rnds4 \$3,$E,$ABCD # 72-75
sha1nexte @MSG[1],$E_
___
&$aesenc
();
$code
.=
<<___;
movdqa $ABCD,$E
sha1rnds4 \$3,$E_,$ABCD # 76-79
sha1nexte $MSG[0],$E
___
while
(
$r
<
40
)
{
&$aesenc
();
}
# remaining aesenc's
$code
.=
<<___;
dec $len
paddd $ABCD_SAVE,$ABCD
movups $iv,48($out,$in0) # write output
lea 64($in0),$in0
jnz .Loop_shaext
pshufd \$0b00011011,$ABCD,$ABCD
pshufd \$0b00011011,$E,$E
movups $iv,($ivp) # write IV
movdqu $ABCD,($ctx)
movd $E,16($ctx)
___
$code
.=<<
___
if
(
$win64
);
movaps
-
8
-
10
*
16
(
%rax
),
%xmm6
movaps
-
8
-
9
*
16
(
%rax
),
%xmm7
movaps
-
8
-
8
*
16
(
%rax
),
%xmm8
movaps
-
8
-
7
*
16
(
%rax
),
%xmm9
movaps
-
8
-
6
*
16
(
%rax
),
%xmm10
movaps
-
8
-
5
*
16
(
%rax
),
%xmm11
movaps
-
8
-
4
*
16
(
%rax
),
%xmm12
movaps
-
8
-
3
*
16
(
%rax
),
%xmm13
movaps
-
8
-
2
*
16
(
%rax
),
%xmm14
movaps
-
8
-
1
*
16
(
%rax
),
%xmm15
mov
%rax
,
%rsp
.
Lepilogue_shaext:
___
$code
.=
<<___;
ret
.size aesni_cbc_sha1_enc_shaext,.-aesni_cbc_sha1_enc_shaext
___
}}}
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if
(
$win64
)
{
...
...
@@ -1793,12 +1963,43 @@ sub rex {
$rex
|=
0x04
if
(
$dst
>=
8
);
$rex
|=
0x01
if
(
$src
>=
8
);
push
@opcode
,
$rex
|
0x40
if
(
$rex
);
unshift
@opcode
,
$rex
|
0x40
if
(
$rex
);
}
# Translate one "sha1rnds4 $imm,%xmmS,%xmmD" operand string into a raw
# ".byte" sequence.
#
# Argument: the operand text that followed the mnemonic.
# Returns:  ".byte\t<comma-separated opcode bytes>" when the operands have
#           the expected "$imm,%xmmN,%xmmM" shape, otherwise the original
#           instruction text ("sha1rnds4\t<operands>") unchanged.
sub sha1rnds4 {
    my $operands = $_[0];	# scalar element, not the one-element slice @_[0]

    # Copy the captures into lexicals right away so that nothing called
    # below can disturb them.
    if (my ($imm, $src, $dst) =
	    $operands =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my @opcode = (0x0f, 0x3a, 0xcc);

	rex(\@opcode, $dst, $src);	# may prepend a REX prefix
	push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);	# ModR/M
	# Immediates written with a leading 0 (0x.., 0b.., 0..) go through
	# oct(); anything else is used as-is.
	push @opcode, $imm =~ /^0/ ? oct($imm) : $imm;
	return ".byte\t" . join(',', @opcode);
    } else {
	return "sha1rnds4\t" . $operands;
    }
}
# Translate a two-operand SHA1 helper instruction (sha1nexte, sha1msg1,
# sha1msg2) into a raw ".byte" sequence.
#
# Arguments: the mnemonic, then the operand text "%xmmS,%xmmD".
# Returns:   ".byte\t<comma-separated opcode bytes>" for a known mnemonic
#            with two %xmm operands, otherwise "<mnemonic>\t<operands>"
#            unchanged.
sub sha1op38 {
    my $instr    = shift;
    my $operands = $_[0];	# scalar element, not the one-element slice @_[0]
    my %opcodelet = (
		"sha1nexte" => 0xc8,
		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);

    # Unknown mnemonics are passed through without looking at the operands.
    return $instr . "\t" . $operands if (!defined($opcodelet{$instr}));

    # Copy the captures into lexicals right away so that nothing called
    # below can disturb them.
    if (my ($src, $dst) = $operands =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my @opcode = (0x0f, 0x38);

	rex(\@opcode, $dst, $src);	# may prepend a REX prefix
	push @opcode, $opcodelet{$instr};
	push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);	# ModR/M
	return ".byte\t" . join(',', @opcode);
    }

    return $instr . "\t" . $operands;
}
sub
aesni
{
my
$line
=
shift
;
my
@opcode
=
(
0x
66
);
my
@opcode
=
(
0x
0f
,
0x38
);
if
(
$line
=~
/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/
)
{
my
%opcodelet
=
(
...
...
@@ -1807,15 +2008,20 @@ sub aesni {
);
return
undef
if
(
!
defined
(
$opcodelet
{
$
1
}));
rex
(
\
@opcode
,
$
3
,
$
2
);
push
@opcode
,
0x0f
,
0x38
,
$opcodelet
{
$
1
};
push
@opcode
,
0xc0
|
(
$
2
&
7
)
|
((
$
3
&
7
)
<<
3
);
# ModR/M
push
@opcode
,
$opcodelet
{
$
1
},
0xc0
|
(
$
2
&
7
)
|
((
$
3
&
7
)
<<
3
);
# ModR/M
unshift
@opcode
,
0x66
;
return
"
.byte
\t
"
.
join
('
,
',
@opcode
);
}
return
$line
;
}
$code
=~
s/\`([^\`]*)\`/eval($1)/g
em
;
$code
=~
s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/g
em
;
foreach
(
split
("
\n
",
$code
))
{
s/\`([^\`]*)\`/eval $1/g
eo
;
print
$code
;
s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/g
eo
or
s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/g
eo
or
s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/g
eo
;
print
$_
,"
\n
";
}
# Buffered write errors only surface when the handle is closed; fail loudly
# instead of leaving a truncated output file behind.
close STDOUT or die "error closing STDOUT: $!";
crypto/aes/asm/aesni-sha256-x86_64.pl
浏览文件 @
619b9466
...
...
@@ -112,8 +112,13 @@ $code.=<<___ if ($avx);
cmp
\
$
0
,`
$win64
?"%rcx":"%rdi"
`
je
.
Lprobe
mov
0
(
%r11
),
%eax
mov
4
(
%r11
),
%r10d
mov
8
(
%r11
),
%r11d
mov
4
(
%r11
),
%r10
bt
\
$
61
,
%r10
# check for SHA
jc
$
{
func
}
_shaext
mov
%r10
,
%r11
shr
\
$
32
,
%r11
test
\
$
`
1<<11
`,
%r10d
# check for XOP
jnz
$
{
func
}
_xop
...
...
@@ -1196,6 +1201,288 @@ $code.=<<___;
.size ${func}_avx2,.-${func}_avx2
___
}}
}}
{{
my
(
$in0
,
$out
,
$len
,
$key
,
$ivp
,
$ctx
,
$inp
)
=
("
%rdi
","
%rsi
","
%rdx
","
%rcx
","
%r8
","
%r9
","
%r10
");
my
(
$rounds
,
$Tbl
)
=
("
%r11d
","
%rbx
");
my
(
$iv
,
$in
,
$rndkey0
)
=
map
("
%xmm
$_
",(
6
,
14
,
15
));
my
@rndkey
=
("
%xmm4
","
%xmm5
");
my
$r
=
0
;
my
$sn
=
0
;
my
(
$Wi
,
$ABEF
,
$CDGH
,
$TMP
,
$BSWAP
,
$ABEF_SAVE
,
$CDGH_SAVE
)
=
map
("
%xmm
$_
",(
0
..
3
,
7
..
9
));
my
@MSG
=
map
("
%xmm
$_
",(
10
..
13
));
my
$aesenc
=
sub
{
use
integer
;
my
(
$n
,
$k
)
=
(
$r
/
10
,
$r
%
10
);
if
(
$k
==
0
)
{
$code
.=
<<___;
movups `16*$n`($in0),$in # load input
xorps $rndkey0,$in
___
$code
.=<<
___
if
(
$n
);
movups
$iv
,`
16*(
$n
-1)
`(
$out
,
$in0
)
# write output
___
$code
.=
<<___;
xorps $in,$iv
movups `32+16*$k-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
___
}
elsif
(
$k
==
9
)
{
$sn
++
;
$code
.=
<<___;
cmp \$11,$rounds
jb .Laesenclast$sn
movups `32+16*($k+0)-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
movups `32+16*($k+1)-112`($key),$rndkey[0]
aesenc $rndkey[1],$iv
je .Laesenclast$sn
movups `32+16*($k+2)-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
movups `32+16*($k+3)-112`($key),$rndkey[0]
aesenc $rndkey[1],$iv
.Laesenclast$sn:
aesenclast $rndkey[0],$iv
movups 16-112($key),$rndkey[1] # forward reference
nop
___
}
else
{
$code
.=
<<___;
movups `32+16*$k-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
___
}
$r
++
;
unshift
(
@rndkey
,
pop
(
@rndkey
));
};
$code
.=
<<___;
.type ${func}_shaext,\@function,6
.align 32
${func}_shaext:
mov %rsp,%rax
mov `($win64?56:8)`(%rsp),$inp # load 7th argument
push %rbx
___
$code
.=<<
___
if
(
$win64
);
lea
`
-4*16
`(
%rsp
),
%rsp
movaps
%xmm6
,
-
8
-
10
*
16
(
%rax
)
movaps
%xmm7
,
-
8
-
9
*
16
(
%rax
)
movaps
%xmm8
,
-
8
-
8
*
16
(
%rax
)
movaps
%xmm9
,
-
8
-
7
*
16
(
%rax
)
movaps
%xmm10
,
-
8
-
6
*
16
(
%rax
)
movaps
%xmm11
,
-
8
-
5
*
16
(
%rax
)
movaps
%xmm12
,
-
8
-
4
*
16
(
%rax
)
movaps
%xmm13
,
-
8
-
3
*
16
(
%rax
)
movaps
%xmm14
,
-
8
-
2
*
16
(
%rax
)
movaps
%xmm15
,
-
8
-
1
*
16
(
%rax
)
.
Lprologue_shaext:
___
$code
.=
<<___;
lea K256+0x80(%rip),$Tbl
movdqu ($ctx),$ABEF # DCBA
movdqu 16($ctx),$CDGH # HGFE
movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
mov 240($key),$rounds
sub $in0,$out
movups ($key),$rndkey0 # $key[0]
movups 16($key),$rndkey[0] # forward reference
lea 112($key),$key # size optimization
pshufd \$0x1b,$ABEF,$Wi # ABCD
pshufd \$0xb1,$ABEF,$ABEF # CDAB
pshufd \$0x1b,$CDGH,$CDGH # EFGH
movdqa $TMP,$BSWAP # offload
palignr \$8,$CDGH,$ABEF # ABEF
punpcklqdq $Wi,$CDGH # CDGH
jmp .Loop_shaext
.align 16
.Loop_shaext:
movdqu ($inp),@MSG[0]
movdqu 0x10($inp),@MSG[1]
movdqu 0x20($inp),@MSG[2]
pshufb $TMP,@MSG[0]
movdqu 0x30($inp),@MSG[3]
movdqa 0*32-0x80($Tbl),$Wi
paddd @MSG[0],$Wi
pshufb $TMP,@MSG[1]
movdqa $CDGH,$CDGH_SAVE # offload
movdqa $ABEF,$ABEF_SAVE # offload
___
&$aesenc
();
$code
.=
<<___;
sha256rnds2 $ABEF,$CDGH # 0-3
pshufd \$0x0e,$Wi,$Wi
___
&$aesenc
();
$code
.=
<<___;
sha256rnds2 $CDGH,$ABEF
movdqa 1*32-0x80($Tbl),$Wi
paddd @MSG[1],$Wi
pshufb $TMP,@MSG[2]
lea 0x40($inp),$inp
___
&$aesenc
();
$code
.=
<<___;
sha256rnds2 $ABEF,$CDGH # 4-7
pshufd \$0x0e,$Wi,$Wi
___
&$aesenc
();
$code
.=
<<___;
sha256rnds2 $CDGH,$ABEF
movdqa 2*32-0x80($Tbl),$Wi
paddd @MSG[2],$Wi
pshufb $TMP,@MSG[3]
sha256msg1 @MSG[1],@MSG[0]
___
&$aesenc
();
$code
.=
<<___;
sha256rnds2 $ABEF,$CDGH # 8-11
pshufd \$0x0e,$Wi,$Wi
movdqa @MSG[3],$TMP
palignr \$4,@MSG[2],$TMP
paddd $TMP,@MSG[0]
___
&$aesenc
();
$code
.=
<<___;
sha256rnds2 $CDGH,$ABEF
movdqa 3*32-0x80($Tbl),$Wi
paddd @MSG[3],$Wi
sha256msg2 @MSG[3],@MSG[0]
sha256msg1 @MSG[2],@MSG[1]
___
&$aesenc
();
$code
.=
<<___;
sha256rnds2 $ABEF,$CDGH # 12-15
pshufd \$0x0e,$Wi,$Wi
___
&$aesenc
();
$code
.=
<<___;
movdqa @MSG[0],$TMP
palignr \$4,@MSG[3],$TMP
paddd $TMP,@MSG[1]
sha256rnds2 $CDGH,$ABEF
___
for
(
$i
=
4
;
$i
<
16
-
3
;
$i
++
)
{
&$aesenc
()
if
((
$r
%
10
)
==
0
);
$code
.=
<<___;
movdqa $i*32-0x80($Tbl),$Wi
paddd @MSG[0],$Wi
sha256msg2 @MSG[0],@MSG[1]
sha256msg1 @MSG[3],@MSG[2]
___
&$aesenc
();
$code
.=
<<___;
sha256rnds2 $ABEF,$CDGH # 16-19...
pshufd \$0x0e,$Wi,$Wi
movdqa @MSG[1],$TMP
palignr \$4,@MSG[0],$TMP
paddd $TMP,@MSG[2]
___
&$aesenc
();
&$aesenc
()
if
(
$r
==
19
);
$code
.=
<<___;
sha256rnds2 $CDGH,$ABEF
___
push
(
@MSG
,
shift
(
@MSG
));
}
$code
.=
<<___;
movdqa 13*32-0x80($Tbl),$Wi
paddd @MSG[0],$Wi
sha256msg2 @MSG[0],@MSG[1]
sha256msg1 @MSG[3],@MSG[2]
___
&$aesenc
();
$code
.=
<<___;
sha256rnds2 $ABEF,$CDGH # 52-55
pshufd \$0x0e,$Wi,$Wi
movdqa @MSG[1],$TMP
palignr \$4,@MSG[0],$TMP
paddd $TMP,@MSG[2]
___
&$aesenc
();
&$aesenc
();
$code
.=
<<___;
sha256rnds2 $CDGH,$ABEF
movdqa 14*32-0x80($Tbl),$Wi
paddd @MSG[1],$Wi
sha256msg2 @MSG[1],@MSG[2]
movdqa $BSWAP,$TMP
___
&$aesenc
();
$code
.=
<<___;
sha256rnds2 $ABEF,$CDGH # 56-59
pshufd \$0x0e,$Wi,$Wi
___
&$aesenc
();
$code
.=
<<___;
sha256rnds2 $CDGH,$ABEF
movdqa 15*32-0x80($Tbl),$Wi
paddd @MSG[2],$Wi
___
&$aesenc
();
&$aesenc
();
$code
.=
<<___;
sha256rnds2 $ABEF,$CDGH # 60-63
pshufd \$0x0e,$Wi,$Wi
___
&$aesenc
();
$code
.=
<<___;
sha256rnds2 $CDGH,$ABEF
#pxor $CDGH,$rndkey0 # black magic
___
while
(
$r
<
40
)
{
&$aesenc
();
}
# remaining aesenc's
$code
.=
<<___;
#xorps $CDGH,$rndkey0 # black magic
paddd $CDGH_SAVE,$CDGH
paddd $ABEF_SAVE,$ABEF
dec $len
movups $iv,48($out,$in0) # write output
lea 64($in0),$in0
jnz .Loop_shaext
pshufd \$0xb1,$CDGH,$CDGH # DCHG
pshufd \$0x1b,$ABEF,$TMP # FEBA
pshufd \$0xb1,$ABEF,$ABEF # BAFE
punpckhqdq $CDGH,$ABEF # DCBA
palignr \$8,$TMP,$CDGH # HGFE
movups $iv,($ivp) # write IV
movdqu $ABEF,($ctx)
movdqu $CDGH,16($ctx)
___
$code
.=<<
___
if
(
$win64
);
movaps
-
8
-
10
*
16
(
%rax
),
%xmm6
movaps
-
8
-
9
*
16
(
%rax
),
%xmm7
movaps
-
8
-
8
*
16
(
%rax
),
%xmm8
movaps
-
8
-
7
*
16
(
%rax
),
%xmm9
movaps
-
8
-
6
*
16
(
%rax
),
%xmm10
movaps
-
8
-
5
*
16
(
%rax
),
%xmm11
movaps
-
8
-
4
*
16
(
%rax
),
%xmm12
movaps
-
8
-
3
*
16
(
%rax
),
%xmm13
movaps
-
8
-
2
*
16
(
%rax
),
%xmm14
movaps
-
8
-
1
*
16
(
%rax
),
%xmm15
.
Lepilogue_shaext:
___
$code
.=
<<___;
mov -8(%rax),%rbx
mov %rax,%rsp
ret
.size ${func}_shaext,.-${func}_shaext
___
}}}}}
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
...
...
@@ -1347,6 +1634,39 @@ $code.=<<___ if ($avx>1);
___
}
####################################################################
# Prepend a REX prefix byte to an opcode byte list when either register
# number is 8 or above.
#
# Arguments: reference to the opcode byte array (modified in place),
#            destination register number, source register number.
# A destination >= 8 sets bit 0x04, a source >= 8 sets bit 0x01; if either
# bit is set, the value OR-ed with 0x40 is placed in front of the bytes.
sub rex {
    my ($bytes, $dst, $src) = @_;
    my $prefix = 0;

    $prefix |= 0x04 if ($dst >= 8);
    $prefix |= 0x01 if ($src >= 8);
    unshift @{$bytes}, $prefix | 0x40 if ($prefix);
}
{
  # Third opcode byte (after 0x0f,0x38) for each SHA-256 extension mnemonic.
  my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

  # Translate a two-operand SHA-256 extension instruction into a raw
  # ".byte" sequence.
  #
  # Arguments: the mnemonic, then the operand text "%xmmS,%xmmD".
  # Returns:   ".byte\t<comma-separated opcode bytes>" for a known mnemonic
  #            with two %xmm operands, otherwise "<mnemonic>\t<operands>"
  #            unchanged.
  sub sha256op38 {
    my $instr    = shift;
    my $operands = $_[0];	# scalar element, not the one-element slice @_[0]

    # Register numbers are matched as whole numbers.  The former
    # single-digit class [0-7] was unanchored, so "%xmm3,%xmm10" matched as
    # "%xmm3,%xmm1" and encoded the wrong destination; it also meant the
    # rex() call below could never see a register number of 8 or more.
    if (defined($opcodelet{$instr})
	&& (my ($src, $dst) = $operands =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/)) {
	my @opcode = (0x0f, 0x38);

	rex(\@opcode, $dst, $src);	# may prepend a REX prefix
	push @opcode, $opcodelet{$instr};
	push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);	# ModR/M
	return ".byte\t" . join(',', @opcode);
    } else {
	return $instr . "\t" . $operands;
    }
  }
}
# Evaluate every `...` expression embedded in the generated text.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
# Hand each sha256* instruction line to sha256op38(), which may replace it
# with a raw .byte encoding.
$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
print $code;
# Buffered write errors only surface when the handle is closed; fail loudly
# instead of leaving a truncated output file behind.
close STDOUT or die "error closing STDOUT: $!";
crypto/sha/asm/sha1-586.pl
浏览文件 @
619b9466
...
...
@@ -79,6 +79,10 @@
# strongly, it's probably more appropriate to discuss possibility of
# using vector rotate XOP on AMD...
# March 2014.
#
# Add support for Intel SHA Extensions.
######################################################################
# Current performance is summarized in following table. Numbers are
# CPU clock cycles spent to process single byte (less is better).
...
...
@@ -303,6 +307,7 @@ if ($alt) {
&function_begin
("
sha1_block_data_order
");
if
(
$xmm
)
{
&static_label
("
shaext_shortcut
");
&static_label
("
ssse3_shortcut
");
&static_label
("
avx_shortcut
")
if
(
$ymm
);
&static_label
("
K_XX_XX
");
...
...
@@ -317,8 +322,11 @@ if ($xmm) {
&mov
(
$D
,
&DWP
(
4
,
$T
));
&test
(
$D
,
1
<<
9
);
# check SSSE3 bit
&jz
(
&label
("
x86
"));
&mov
(
$C
,
&DWP
(
8
,
$T
));
&test
(
$A
,
1
<<
24
);
# check FXSR bit
&jz
(
&label
("
x86
"));
&test
(
$C
,
1
<<
29
);
# check SHA bit
&jnz
(
&label
("
shaext_shortcut
"));
if
(
$ymm
)
{
&and
(
$D
,
1
<<
28
);
# mask AVX bit
&and
(
$A
,
1
<<
30
);
# mask "Intel CPU" bit
...
...
@@ -397,6 +405,117 @@ if ($xmm) {
&function_end
("
sha1_block_data_order
");
if
(
$xmm
)
{
{
######################################################################
# Intel SHA Extensions implementation of SHA1 update function.
#
my
(
$ctx
,
$inp
,
$num
)
=
("
edi
","
esi
","
ecx
");
my
(
$ABCD
,
$E
,
$E_
,
$BSWAP
)
=
map
("
xmm
$_
",(
0
..
3
));
my
@MSG
=
map
("
xmm
$_
",(
4
..
7
));
# Emit sha1rnds4 as raw bytes through data_byte().
#
# Arguments: destination register name, source register name, immediate.
# Nothing is emitted unless both names look like xmm0..xmm7.
sub sha1rnds4 {
    my ($dst, $src, $imm) = @_;
    my $pair = join(':', $dst, $src);

    if ($pair =~ /xmm([0-7]):xmm([0-7])/) {
	# ModR/M: destination number in bits 5..3, source number in bits 2..0.
	&data_byte(0x0f, 0x3a, 0xcc, 0xc0 | ($1 << 3) | $2, $imm);
    }
}
# Emit a 0x0f,0x38-prefixed two-operand instruction as raw bytes through
# data_byte().
#
# Arguments: third opcode byte, destination register name, source register
# name.  Nothing is emitted unless both names look like xmm0..xmm7.
sub sha1op38 {
    my ($opcodelet, $dst, $src) = @_;
    my $pair = join(':', $dst, $src);

    if ($pair =~ /xmm([0-7]):xmm([0-7])/) {
	# ModR/M: destination number in bits 5..3, source number in bits 2..0.
	&data_byte(0x0f, 0x38, $opcodelet, 0xc0 | ($1 << 3) | $2);
    }
}
# Thin wrappers: each forwards its (dst, src) arguments to sha1op38() with
# the opcode byte for that instruction.
sub sha1nexte	{ sha1op38(0xc8, @_); }		# opcode byte 0xc8
sub sha1msg1	{ sha1op38(0xc9, @_); }		# opcode byte 0xc9
sub sha1msg2	{ sha1op38(0xca, @_); }		# opcode byte 0xca
&function_begin
("
_sha1_block_data_order_shaext
");
&call
(
&label
("
pic_point
"));
# make it PIC!
&set_label
("
pic_point
");
&blindpop
(
$tmp1
);
&lea
(
$tmp1
,
&DWP
(
&label
("
K_XX_XX
")
.
"
-
"
.
&label
("
pic_point
"),
$tmp1
));
&set_label
("
shaext_shortcut
");
&mov
(
$ctx
,
&wparam
(
0
));
&mov
("
ebx
","
esp
");
&mov
(
$inp
,
&wparam
(
1
));
&mov
(
$num
,
&wparam
(
2
));
&sub
("
esp
",
32
);
&movdqu
(
$ABCD
,
&QWP
(
0
,
$ctx
));
&movd
(
$E
,
&QWP
(
16
,
$ctx
));
&and
("
esp
",
-
32
);
&movdqa
(
$BSWAP
,
&QWP
(
0x50
,
$tmp1
));
# byte-n-word swap
&movdqu
(
@MSG
[
0
],
&QWP
(
0
,
$inp
));
&pshufd
(
$ABCD
,
$ABCD
,
0b00011011
);
# flip word order
&movdqu
(
@MSG
[
1
],
&QWP
(
0x10
,
$inp
));
&pshufd
(
$E
,
$E
,
0b00011011
);
# flip word order
&movdqu
(
@MSG
[
2
],
&QWP
(
0x20
,
$inp
));
&pshufb
(
@MSG
[
0
],
$BSWAP
);
&movdqu
(
@MSG
[
3
],
&QWP
(
0x30
,
$inp
));
&pshufb
(
@MSG
[
1
],
$BSWAP
);
&pshufb
(
@MSG
[
2
],
$BSWAP
);
&pshufb
(
@MSG
[
3
],
$BSWAP
);
&jmp
(
&label
("
loop_shaext
"));
&set_label
("
loop_shaext
",
16
);
&dec
(
$num
);
&lea
("
eax
",
&DWP
(
0x40
,
$inp
));
&movdqa
(
&QWP
(
0
,"
esp
"),
$E
);
# offload $E
&paddd
(
$E
,
@MSG
[
0
]);
&cmovne
(
$inp
,"
eax
");
&movdqa
(
&QWP
(
16
,"
esp
"),
$ABCD
);
# offload $ABCD
for
(
$i
=
0
;
$i
<
20
-
4
;
$i
+=
2
)
{
&sha1msg1
(
@MSG
[
0
],
@MSG
[
1
]);
&movdqa
(
$E_
,
$ABCD
);
&sha1rnds4
(
$ABCD
,
$E
,
int
(
$i
/
5
));
# 0-3...
&sha1nexte
(
$E_
,
@MSG
[
1
]);
&pxor
(
@MSG
[
0
],
@MSG
[
2
]);
&sha1msg1
(
@MSG
[
1
],
@MSG
[
2
]);
&sha1msg2
(
@MSG
[
0
],
@MSG
[
3
]);
&movdqa
(
$E
,
$ABCD
);
&sha1rnds4
(
$ABCD
,
$E_
,
int
((
$i
+
1
)
/
5
));
&sha1nexte
(
$E
,
@MSG
[
2
]);
&pxor
(
@MSG
[
1
],
@MSG
[
3
]);
&sha1msg2
(
@MSG
[
1
],
@MSG
[
0
]);
push
(
@MSG
,
shift
(
@MSG
));
push
(
@MSG
,
shift
(
@MSG
));
}
&movdqu
(
@MSG
[
0
],
&QWP
(
0
,
$inp
));
&movdqa
(
$E_
,
$ABCD
);
&sha1rnds4
(
$ABCD
,
$E
,
3
);
# 64-67
&sha1nexte
(
$E_
,
@MSG
[
1
]);
&movdqu
(
@MSG
[
1
],
&QWP
(
0x10
,
$inp
));
&pshufb
(
@MSG
[
0
],
$BSWAP
);
&movdqa
(
$E
,
$ABCD
);
&sha1rnds4
(
$ABCD
,
$E_
,
3
);
# 68-71
&sha1nexte
(
$E
,
@MSG
[
2
]);
&movdqu
(
@MSG
[
2
],
&QWP
(
0x20
,
$inp
));
&pshufb
(
@MSG
[
1
],
$BSWAP
);
&movdqa
(
$E_
,
$ABCD
);
&sha1rnds4
(
$ABCD
,
$E
,
3
);
# 72-75
&sha1nexte
(
$E_
,
@MSG
[
3
]);
&movdqu
(
@MSG
[
3
],
&QWP
(
0x30
,
$inp
));
&pshufb
(
@MSG
[
2
],
$BSWAP
);
&movdqa
(
$E
,
$ABCD
);
&sha1rnds4
(
$ABCD
,
$E_
,
3
);
# 76-79
&movdqa
(
$E_
,
&QWP
(
0
,"
esp
"));
&pshufb
(
@MSG
[
3
],
$BSWAP
);
&sha1nexte
(
$E
,
$E_
);
&paddd
(
$ABCD
,
&QWP
(
16
,"
esp
"));
&jnz
(
&label
("
loop_shaext
"));
&pshufd
(
$ABCD
,
$ABCD
,
0b00011011
);
&pshufd
(
$E
,
$E
,
0b00011011
);
# Write $ABCD to offset 0 and $E to offset 16 of the context.  Each call is
# its own statement: without the terminating ';' on the first one, Perl
# parses the leading '&' of the next line as a bitwise AND of the two calls.
&movdqu	(&QWP(0,$ctx),$ABCD);
&movd	(&DWP(16,$ctx),$E);
&mov
("
esp
","
ebx
");
&function_end
("
_sha1_block_data_order_shaext
");
}
######################################################################
# The SSSE3 implementation.
#
...
...
@@ -1340,6 +1459,7 @@ sub Xtail_avx()
&data_word
(
0x8f1bbcdc
,
0x8f1bbcdc
,
0x8f1bbcdc
,
0x8f1bbcdc
);
# K_40_59
&data_word
(
0xca62c1d6
,
0xca62c1d6
,
0xca62c1d6
,
0xca62c1d6
);
# K_60_79
&data_word
(
0x00010203
,
0x04050607
,
0x08090a0b
,
0x0c0d0e0f
);
# pbswap mask
&data_byte
(
0xf
,
0xe
,
0xd
,
0xc
,
0xb
,
0xa
,
0x9
,
0x8
,
0x7
,
0x6
,
0x5
,
0x4
,
0x3
,
0x2
,
0x1
,
0x0
);
}
&asciz
("
SHA1 block transform for x86, CRYPTOGAMS by <appro
\@
openssl.org>
");
...
...
crypto/sha/asm/sha1-mb-x86_64.pl
浏览文件 @
619b9466
...
...
@@ -15,7 +15,7 @@
# this +aesni(i) sha1 aesni-sha1 gain(iv)
# -------------------------------------------------------------------
# Westmere(ii) 10.7/n +1.28=3.96(n=4) 5.30 6.66 +68%
# Atom(ii) 18.
9?/n +3.93=8.66(n=4) 10.0 14.0 +62
%
# Atom(ii) 18.
1/n +3.93=8.46(n=4) 9.37 12.8 +51
%
# Sandy Bridge (8.16 +5.15=13.3)/n 4.99 5.98 +80%
# Ivy Bridge (8.08 +5.14=13.2)/n 4.60 5.54 +68%
# Haswell(iii) (8.96 +5.00=14.0)/n 3.57 4.55 +160%
...
...
@@ -338,9 +338,11 @@ $code.=<<___;
.type sha1_multi_block,\@function,3
.align 32
sha1_multi_block:
mov OPENSSL_ia32cap_P+4(%rip),%rcx
bt \$61,%rcx # check SHA bit
jc _shaext_shortcut
___
$code
.=<<
___
if
(
$avx
);
mov
OPENSSL_ia32cap_P
+
4
(
%rip
),
%rcx
test
\
$
`
1<<28
`,
%ecx
jnz
_avx_shortcut
___
...
...
@@ -366,6 +368,7 @@ $code.=<<___;
sub \$`$REG_SZ*18`,%rsp
and \$-256,%rsp
mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
.Lbody:
lea K_XX_XX(%rip),$Tbl
lea `$REG_SZ*16`(%rsp),%rbx
...
...
@@ -476,9 +479,265 @@ $code.=<<___;
mov -16(%rax),%rbp
mov -8(%rax),%rbx
lea (%rax),%rsp
.Lepilogue:
ret
.size sha1_multi_block,.-sha1_multi_block
___
{{{
my
(
$ABCD0
,
$E0
,
$E0_
,
$BSWAP
,
$ABCD1
,
$E1
,
$E1_
)
=
map
("
%xmm
$_
",(
0
..
3
,
8
..
10
));
my
@MSG0
=
map
("
%xmm
$_
",(
4
..
7
));
my
@MSG1
=
map
("
%xmm
$_
",(
11
..
14
));
$code
.=
<<___;
.type sha1_multi_block_shaext,\@function,3
.align 32
sha1_multi_block_shaext:
_shaext_shortcut:
mov %rsp,%rax
push %rbx
push %rbp
___
$code
.=<<
___
if
(
$win64
);
lea
-
0xa8
(
%rsp
),
%rsp
movaps
%xmm6
,(
%rsp
)
movaps
%xmm7
,
0x10
(
%rsp
)
movaps
%xmm8
,
0x20
(
%rsp
)
movaps
%xmm9
,
0x30
(
%rsp
)
movaps
%xmm10
,
-
0x78
(
%rax
)
movaps
%xmm11
,
-
0x68
(
%rax
)
movaps
%xmm12
,
-
0x58
(
%rax
)
movaps
%xmm13
,
-
0x48
(
%rax
)
movaps
%xmm14
,
-
0x38
(
%rax
)
movaps
%xmm15
,
-
0x28
(
%rax
)
___
$code
.=
<<___;
sub \$`$REG_SZ*18`,%rsp
shl \$1,$num # we process pair at a time
and \$-256,%rsp
lea 0x40($ctx),$ctx # size optimization
mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
.Lbody_shaext:
lea `$REG_SZ*16`(%rsp),%rbx
movdqa K_XX_XX+0x80(%rip),$BSWAP # byte-n-word swap
.Loop_grande_shaext:
mov $num,`$REG_SZ*17+8`(%rsp) # orignal $num
xor $num,$num
___
for
(
$i
=
0
;
$i
<
2
;
$i
++
)
{
$code
.=
<<___;
mov `16*$i+0`($inp),@ptr[$i] # input pointer
mov `16*$i+8`($inp),%ecx # number of blocks
cmp $num,%ecx
cmovg %ecx,$num # find maximum
test %ecx,%ecx
mov %ecx,`4*$i`(%rbx) # initialize counters
cmovle %rsp,@ptr[$i] # cancel input
___
}
$code
.=
<<___;
test $num,$num
jz .Ldone_shaext
movq 0x00-0x40($ctx),$ABCD0 # a1.a0
movq 0x20-0x40($ctx),@MSG0[0]# b1.b0
movq 0x40-0x40($ctx),@MSG0[1]# c1.c0
movq 0x60-0x40($ctx),@MSG0[2]# d1.d0
movq 0x80-0x40($ctx),@MSG0[3]# e1.e0
punpckldq @MSG0[0],$ABCD0 # b1.a1.b0.a0
punpckldq @MSG0[2],@MSG0[1] # d1.c1.d0.c0
movdqa $ABCD0,$ABCD1
punpcklqdq @MSG0[1],$ABCD0 # d0.c0.b0.a0
punpckhqdq @MSG0[1],$ABCD1 # d1.c1.b1.a1
pshufd \$0b00111111,@MSG0[3],$E0
pshufd \$0b01111111,@MSG0[3],$E1
pshufd \$0b00011011,$ABCD0,$ABCD0
pshufd \$0b00011011,$ABCD1,$ABCD1
jmp .Loop_shaext
.align 32
.Loop_shaext:
movdqu 0x00(@ptr[0]),@MSG0[0]
movdqu 0x00(@ptr[1]),@MSG1[0]
movdqu 0x10(@ptr[0]),@MSG0[1]
movdqu 0x10(@ptr[1]),@MSG1[1]
movdqu 0x20(@ptr[0]),@MSG0[2]
pshufb $BSWAP,@MSG0[0]
movdqu 0x20(@ptr[1]),@MSG1[2]
pshufb $BSWAP,@MSG1[0]
movdqu 0x30(@ptr[0]),@MSG0[3]
lea 0x40(@ptr[0]),@ptr[0]
pshufb $BSWAP,@MSG0[1]
movdqu 0x30(@ptr[1]),@MSG1[3]
lea 0x40(@ptr[1]),@ptr[1]
pshufb $BSWAP,@MSG1[1]
movdqa $E0,0x50(%rsp) # offload
paddd @MSG0[0],$E0
movdqa $E1,0x70(%rsp)
paddd @MSG1[0],$E1
movdqa $ABCD0,0x40(%rsp) # offload
movdqa $ABCD0,$E0_
movdqa $ABCD1,0x60(%rsp)
movdqa $ABCD1,$E1_
sha1rnds4 \$0,$E0,$ABCD0 # 0-3
sha1nexte @MSG0[1],$E0_
sha1rnds4 \$0,$E1,$ABCD1 # 0-3
sha1nexte @MSG1[1],$E1_
pshufb $BSWAP,@MSG0[2]
prefetcht0 127(@ptr[0])
sha1msg1 @MSG0[1],@MSG0[0]
pshufb $BSWAP,@MSG1[2]
prefetcht0 127(@ptr[1])
sha1msg1 @MSG1[1],@MSG1[0]
pshufb $BSWAP,@MSG0[3]
movdqa $ABCD0,$E0
pshufb $BSWAP,@MSG1[3]
movdqa $ABCD1,$E1
sha1rnds4 \$0,$E0_,$ABCD0 # 4-7
sha1nexte @MSG0[2],$E0
sha1rnds4 \$0,$E1_,$ABCD1 # 4-7
sha1nexte @MSG1[2],$E1
pxor @MSG0[2],@MSG0[0]
sha1msg1 @MSG0[2],@MSG0[1]
pxor @MSG1[2],@MSG1[0]
sha1msg1 @MSG1[2],@MSG1[1]
___
for
(
$i
=
2
;
$i
<
20
-
4
;
$i
++
)
{
$code
.=
<<___;
movdqa $ABCD0,$E0_
movdqa $ABCD1,$E1_
sha1rnds4 \$`int($i/5)`,$E0,$ABCD0 # 8-11
sha1nexte @MSG0[3],$E0_
sha1rnds4 \$`int($i/5)`,$E1,$ABCD1 # 8-11
sha1nexte @MSG1[3],$E1_
sha1msg2 @MSG0[3],@MSG0[0]
sha1msg2 @MSG1[3],@MSG1[0]
pxor @MSG0[3],@MSG0[1]
sha1msg1 @MSG0[3],@MSG0[2]
pxor @MSG1[3],@MSG1[1]
sha1msg1 @MSG1[3],@MSG1[2]
___
(
$E0
,
$E0_
)
=
(
$E0_
,
$E0
);
(
$E1
,
$E1_
)
=
(
$E1_
,
$E1
);
push
(
@MSG0
,
shift
(
@MSG0
));
push
(
@MSG1
,
shift
(
@MSG1
));
}
$code
.=
<<___;
movdqa $ABCD0,$E0_
movdqa $ABCD1,$E1_
sha1rnds4 \$3,$E0,$ABCD0 # 64-67
sha1nexte @MSG0[3],$E0_
sha1rnds4 \$3,$E1,$ABCD1 # 64-67
sha1nexte @MSG1[3],$E1_
sha1msg2 @MSG0[3],@MSG0[0]
sha1msg2 @MSG1[3],@MSG1[0]
pxor @MSG0[3],@MSG0[1]
pxor @MSG1[3],@MSG1[1]
mov \$1,%ecx
pxor @MSG0[2],@MSG0[2] # zero
cmp 4*0(%rbx),%ecx # examine counters
cmovge %rsp,@ptr[0] # cancel input
movdqa $ABCD0,$E0
movdqa $ABCD1,$E1
sha1rnds4 \$3,$E0_,$ABCD0 # 68-71
sha1nexte @MSG0[0],$E0
sha1rnds4 \$3,$E1_,$ABCD1 # 68-71
sha1nexte @MSG1[0],$E1
sha1msg2 @MSG0[0],@MSG0[1]
sha1msg2 @MSG1[0],@MSG1[1]
cmp 4*1(%rbx),%ecx
cmovge %rsp,@ptr[1]
movq (%rbx),@MSG0[0] # pull counters
movdqa $ABCD0,$E0_
movdqa $ABCD1,$E1_
sha1rnds4 \$3,$E0,$ABCD0 # 72-75
sha1nexte @MSG0[1],$E0_
sha1rnds4 \$3,$E1,$ABCD1 # 72-75
sha1nexte @MSG1[1],$E1_
pshufd \$0x00,@MSG0[0],@MSG1[2]
pshufd \$0x55,@MSG0[0],@MSG1[3]
movdqa @MSG0[0],@MSG0[1]
pcmpgtd @MSG0[2],@MSG1[2]
pcmpgtd @MSG0[2],@MSG1[3]
movdqa $ABCD0,$E0
movdqa $ABCD1,$E1
sha1rnds4 \$3,$E0_,$ABCD0 # 76-79
sha1nexte $MSG0[2],$E0
sha1rnds4 \$3,$E1_,$ABCD1 # 76-79
sha1nexte $MSG0[2],$E1
pcmpgtd @MSG0[2],@MSG0[1] # counter mask
pand @MSG1[2],$ABCD0
pand @MSG1[2],$E0
pand @MSG1[3],$ABCD1
pand @MSG1[3],$E1
paddd @MSG0[1],@MSG0[0] # counters--
paddd 0x40(%rsp),$ABCD0
paddd 0x50(%rsp),$E0
paddd 0x60(%rsp),$ABCD1
paddd 0x70(%rsp),$E1
movq @MSG0[0],(%rbx) # save counters
dec $num
jnz .Loop_shaext
mov `$REG_SZ*17+8`(%rsp),$num
pshufd \$0b00011011,$ABCD0,$ABCD0
pshufd \$0b00011011,$ABCD1,$ABCD1
movdqa $ABCD0,@MSG0[0]
punpckldq $ABCD1,$ABCD0 # b1.b0.a1.a0
punpckhdq $ABCD1,@MSG0[0] # d1.d0.c1.c0
punpckhdq $E1,$E0 # e1.e0.xx.xx
movq $ABCD0,0x00-0x40($ctx) # a1.a0
psrldq \$8,$ABCD0
movq @MSG0[0],0x40-0x40($ctx)# c1.c0
psrldq \$8,@MSG0[0]
movq $ABCD0,0x20-0x40($ctx) # b1.b0
psrldq \$8,$E0
movq @MSG0[0],0x60-0x40($ctx)# d1.d0
movq $E0,0x80-0x40($ctx) # e1.e0
lea `$REG_SZ/2`($ctx),$ctx
lea `16*2`($inp),$inp
dec $num
jnz .Loop_grande_shaext
.Ldone_shaext:
#mov `$REG_SZ*17`(%rsp),%rax # original %rsp
___
$code
.=<<
___
if
(
$win64
);
movaps
-
0xb8
(
%rax
),
%xmm6
movaps
-
0xa8
(
%rax
),
%xmm7
movaps
-
0x98
(
%rax
),
%xmm8
movaps
-
0x88
(
%rax
),
%xmm9
movaps
-
0x78
(
%rax
),
%xmm10
movaps
-
0x68
(
%rax
),
%xmm11
movaps
-
0x58
(
%rax
),
%xmm12
movaps
-
0x48
(
%rax
),
%xmm13
movaps
-
0x38
(
%rax
),
%xmm14
movaps
-
0x28
(
%rax
),
%xmm15
___
$code
.=
<<___;
mov -16(%rax),%rbp
mov -8(%rax),%rbx
lea (%rax),%rsp
.Lepilogue_shaext:
ret
.size sha1_multi_block_shaext,.-sha1_multi_block_shaext
___
}}}
if
(
$avx
)
{{{
sub
BODY_00_19_avx
{
...
...
@@ -752,6 +1011,7 @@ $code.=<<___;
sub \$`$REG_SZ*18`, %rsp
and \$-256,%rsp
mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
.Lbody_avx:
lea K_XX_XX(%rip),$Tbl
lea `$REG_SZ*16`(%rsp),%rbx
...
...
@@ -858,6 +1118,7 @@ $code.=<<___;
mov -16(%rax),%rbp
mov -8(%rax),%rbx
lea (%rax),%rsp
.Lepilogue_avx:
ret
.size sha1_multi_block_avx,.-sha1_multi_block_avx
___
...
...
@@ -904,6 +1165,7 @@ $code.=<<___;
sub \$`$REG_SZ*18`, %rsp
and \$-256,%rsp
mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
.Lbody_avx2:
lea K_XX_XX(%rip),$Tbl
shr \$1,$num
...
...
@@ -1015,6 +1277,7 @@ $code.=<<___;
mov -16(%rax),%rbp
mov -8(%rax),%rbx
lea (%rax),%rsp
.Lepilogue_avx2:
ret
.size sha1_multi_block_avx2,.-sha1_multi_block_avx2
___
...
...
@@ -1033,17 +1296,261 @@ K_XX_XX:
.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
.asciz "SHA1 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
if
(
$win64
)
{
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
$rec
=
"
%rcx
";
$frame
=
"
%rdx
";
$context
=
"
%r8
";
$disp
=
"
%r9
";
$code
.=
<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # end of prologue label
cmp %r10,%rbx # context->Rip<.Lbody
jb .Lin_prologue
mov 152($context),%rax # pull context->Rsp
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=.Lepilogue
jae .Lin_prologue
mov `16*17`(%rax),%rax # pull saved stack pointer
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
lea -24-10*16(%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
mov \$20,%ecx
.long 0xa548f3fc # cld; rep movsq
.Lin_prologue:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size se_handler,.-se_handler
___
# Win64 structured-exception handler for sha1_multi_block_avx2 (emitted only
# when AVX2 code is generated).  HandlerData[0]/[1] hold the RVAs of the body
# and epilogue labels.  Outside that range nothing needs to be recovered and
# control goes straight to the shared .Lin_prologue tail in se_handler.
# Inside it, the function's original %rsp is fetched from the frame, the
# callee-saved GPRs and 20 quadwords of XMM state (starting at context.Xmm6)
# are copied back into the CONTEXT record.
#
# The saved stack pointer lives in the function's own frame, i.e. relative to
# context->Rsp (held in %rax at that point), exactly as se_handler does with
# `16*17`(%rax).  It was previously read relative to the CONTEXT pointer,
# which lands inside the record's XMM save area (512 is &context.Xmm6) and
# yields a bogus frame address.
# NOTE(review): the 32*17 offset presumably mirrors the AVX2 prologue's
# `$REG_SZ*17`(%rsp) store with 32-byte registers -- confirm $REG_SZ there.
$code.=<<___ if ($avx>1);
.type	avx2_handler,\@abi-omnipotent
.align	16
avx2_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip
	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData
	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<body label
	jb	.Lin_prologue
	mov	152($context),%rax	# pull context->Rsp
	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue
	mov	`32*17`(%rax),%rax	# pull saved stack pointer
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore cotnext->R12
	mov	%r13,224($context)	# restore cotnext->R13
	mov	%r14,232($context)	# restore cotnext->R14
	mov	%r15,240($context)	# restore cotnext->R15
	lea	-56-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	jmp	.Lin_prologue
.size	avx2_handler,.-avx2_handler
___
$code
.=
<<___;
.section .pdata
.align 4
.rva .LSEH_begin_sha1_multi_block
.rva .LSEH_end_sha1_multi_block
.rva .LSEH_info_sha1_multi_block
.rva .LSEH_begin_sha1_multi_block_shaext
.rva .LSEH_end_sha1_multi_block_shaext
.rva .LSEH_info_sha1_multi_block_shaext
___
$code
.=<<
___
if
(
$avx
);
.
rva
.
LSEH_begin_sha1_multi_block_avx
.
rva
.
LSEH_end_sha1_multi_block_avx
.
rva
.
LSEH_info_sha1_multi_block_avx
___
$code
.=<<
___
if
(
$avx
>
1
);
.
rva
.
LSEH_begin_sha1_multi_block_avx2
.
rva
.
LSEH_end_sha1_multi_block_avx2
.
rva
.
LSEH_info_sha1_multi_block_avx2
___
$code
.=
<<___;
.section .xdata
.align 8
.LSEH_info_sha1_multi_block:
.byte 9,0,0,0
.rva se_handler
.rva .Lbody,.Lepilogue # HandlerData[]
.LSEH_info_sha1_multi_block_shaext:
.byte 9,0,0,0
.rva se_handler
.rva .Lbody_shaext,.Lepilogue_shaext # HandlerData[]
___
$code
.=<<
___
if
(
$avx
);
.
LSEH_info_sha1_multi_block_avx:
.
byte
9
,
0
,
0
,
0
.
rva
se_handler
.
rva
.
Lbody_avx
,
.
Lepilogue_avx
# HandlerData[]
___
$code
.=<<
___
if
(
$avx
>
1
);
.
LSEH_info_sha1_multi_block_avx2:
.
byte
9
,
0
,
0
,
0
.
rva
avx2_handler
.
rva
.
Lbody_avx2
,
.
Lepilogue_avx2
# HandlerData[]
___
}
####################################################################
# rex(\@opcode, $dst, $src)
#
# Prepends a REX prefix byte to the opcode byte list when either register
# number is 8 or above: bit 0x04 is set for $dst >= 8 and bit 0x01 for
# $src >= 8, on top of the fixed 0x40 pattern.  The list is modified in
# place through the reference; it is left untouched when both register
# numbers are below 8.
sub rex {
    my ($opcode, $dst, $src) = @_;

    my $prefix = ($dst >= 8 ? 0x04 : 0) | ($src >= 8 ? 0x01 : 0);

    unshift @$opcode, $prefix | 0x40 if ($prefix);
}
# sha1rnds4($operands)
#
# Hand-encodes "sha1rnds4 $imm,%xmmSRC,%xmmDST" as a .byte directive:
# 0x0f,0x3a,0xcc, a ModR/M byte built from the low three bits of both
# register numbers, then the immediate.  rex() adds a prefix byte for
# register numbers of 8 and above.  An immediate written with a leading 0
# goes through oct(), so hex/octal spellings become plain numbers.
# Any operand string that is not "immediate, xmm, xmm" is handed back as the
# plain mnemonic plus operands.
sub sha1rnds4 {
    my $operands = $_[0];

    unless ($operands =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        return "sha1rnds4\t".$operands;
    }

    # Copy the captures before any further pattern match can reset them.
    my ($imm, $src, $dst) = ($1, $2, $3);

    my @bytes = (0x0f, 0x3a, 0xcc);
    rex(\@bytes, $dst, $src);
    push @bytes, 0xc0 | ($src & 7) | (($dst & 7) << 3);	# ModR/M
    push @bytes, ($imm =~ /^0/ ? oct($imm) : $imm);

    return ".byte\t".join(',', @bytes);
}
# sha1op38($instr, $operands)
#
# Hand-encodes the two-operand SHA1 instructions from the 0x0f,0x38 opcode
# map (sha1nexte, sha1msg1, sha1msg2) as a .byte directive when both operands
# are %xmm registers; rex() supplies a prefix for register numbers of 8 and
# above.  Unknown mnemonics and other operand forms are returned unchanged as
# "mnemonic<TAB>operands".
sub sha1op38 {
    my $instr    = shift;
    my $operands = $_[0];

    my %opcodelet = (
        "sha1nexte" => 0xc8,
        "sha1msg1"  => 0xc9,
        "sha1msg2"  => 0xca,
    );

    unless (defined($opcodelet{$instr})
            && $operands =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        return $instr."\t".$operands;
    }

    my ($src, $dst) = ($1, $2);

    my @bytes = (0x0f, 0x38);
    rex(\@bytes, $dst, $src);
    push @bytes, $opcodelet{$instr};
    push @bytes, 0xc0 | ($src & 7) | (($dst & 7) << 3);	# ModR/M

    return ".byte\t".join(',', @bytes);
}
# Final pass over the generated text, one line at a time:
#   1. evaluate every `...` expression and splice in its value;
#   2. apply at most one rewrite per line: either byte-encode a SHA1
#      instruction, or narrow %ymm operands to %xmm for the listed AVX
#      instructions (the "%x%ymm" spelling marks an explicitly narrowed
#      vmovdqu operand);
#   3. print the line.
for my $line (split(/\n/, $code)) {
	$line =~ s/\`([^\`]*)\`/eval($1)/ge;

	$line =~ s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/ge
	    or
	$line =~ s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/ge
	    or
	$line =~ s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/g
	    or
	$line =~ s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/g
	    or
	$line =~ s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/g
	    or
	$line =~ s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/g
	    or
	$line =~ s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/g
	    or
	$line =~ s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/g;

	print $line,"\n";
}
...
...
crypto/sha/asm/sha1-x86_64.pl
浏览文件 @
619b9466
...
...
@@ -57,6 +57,10 @@
# hint regarding the number of Xupdate iterations to pre-compute in
# advance was provided by Ilya Albrekht of Intel Corp.
# March 2014.
#
# Add support for Intel SHA Extensions.
######################################################################
# Current performance is summarized in following table. Numbers are
# CPU clock cycles spent to process single byte (less is better).
...
...
@@ -71,7 +75,7 @@
# Haswell 5.45 4.15/+31% 3.57/+53%
# Bulldozer 9.11 5.95/+53%
# VIA Nano 9.32 7.15/+30%
# Atom
[10.5?] [9.23?]/+14
%
# Atom
10.3 9.17/+12
%
# Silvermont 13.1(*) 9.37/+40%
#
# (*) obviously suboptimal result, nothing was done about it,
...
...
@@ -241,6 +245,9 @@ sha1_block_data_order:
mov OPENSSL_ia32cap_P+8(%rip),%r10d
test \$`1<<9`,%r8d # check SSSE3 bit
jz .Lialu
test \$`1<<29`,%r10d # check SHA bit
jnz _shaext_shortcut
___
$code
.=<<
___
if
(
$avx
>
1
);
and
\
$
`
1<<3|1<<5|1<<8
`,
%r10d
# check AVX2+BMI1+BMI2
...
...
@@ -315,6 +322,120 @@ $code.=<<___;
.size sha1_block_data_order,.-sha1_block_data_order
___
{{{
######################################################################
# Intel SHA Extensions implementation of SHA1 update function.
#
# Register roles: $ctx/$inp/$num are the three arguments; $ABCD and $E hold
# the working state, $E_ is the alternate E register used on every other
# sha1rnds4, $BSWAP is the byte-swap mask loaded from K_XX_XX+0xa0, and
# $ABCD_SAVE/$E_SAVE (%xmm8/%xmm9) keep the state from the start of the
# block for the final additions.  @MSG is the four-register message window.
#
my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
my @MSG=map("%xmm$_",(4..7));

$code.=<<___;
.type sha1_block_data_order_shaext,\@function,3
.align 32
sha1_block_data_order_shaext:
_shaext_shortcut:
___
# Win64 only: %xmm6-%xmm9 are saved below the address held in %rax.
# NOTE(review): %rax is not loaded in this block; it is presumably set to the
# entry %rsp by the translator-generated function prologue -- confirm.
$code.=<<___ if ($win64);
	lea	`-8-4*16`(%rsp),%rsp
	movaps	%xmm6,-8-4*16(%rax)
	movaps	%xmm7,-8-3*16(%rax)
	movaps	%xmm8,-8-2*16(%rax)
	movaps	%xmm9,-8-1*16(%rax)
.Lprologue_shaext:
___
# The loop needs a scratch register for the "next input block" pointer.
# It must not be %rax: the Win64 epilogue below restores the XMM registers
# and %rsp through %rax, and shaext_handler treats context->Rax as the frame
# base, so %rax has to survive the whole function body.  %r8 is used instead.
# NOTE(review): %r8 is assumed free here (three-argument function) -- confirm.
$code.=<<___;
movdqu ($ctx),$ABCD
movd 16($ctx),$E
movdqa K_XX_XX+0xa0(%rip),$BSWAP # byte-n-word swap
movdqu ($inp),@MSG[0]
pshufd \$0b00011011,$ABCD,$ABCD # flip word order
movdqu 0x10($inp),@MSG[1]
pshufd \$0b00011011,$E,$E # flip word order
movdqu 0x20($inp),@MSG[2]
pshufb $BSWAP,@MSG[0]
movdqu 0x30($inp),@MSG[3]
pshufb $BSWAP,@MSG[1]
pshufb $BSWAP,@MSG[2]
movdqa $E,$E_SAVE # offload $E
pshufb $BSWAP,@MSG[3]
jmp .Loop_shaext
.align 16
.Loop_shaext:
dec $num
lea 0x40($inp),%r8 # next input block
paddd @MSG[0],$E
cmovne %r8,$inp
movdqa $ABCD,$ABCD_SAVE # offload $ABCD
___
# Rounds 0-63: eight iterations, each emitting two sha1rnds4 (8 rounds) and
# rotating the message window by two registers.  The round-function selector
# passed to sha1rnds4 is int($i/5) resp. int(($i+1)/5).
for($i=0;$i<20-4;$i+=2) {
$code.=<<___;
sha1msg1 @MSG[1],@MSG[0]
movdqa $ABCD,$E_
sha1rnds4 \$`int($i/5)`,$E,$ABCD # 0-3...
sha1nexte @MSG[1],$E_
pxor @MSG[2],@MSG[0]
sha1msg1 @MSG[2],@MSG[1]
sha1msg2 @MSG[3],@MSG[0]
movdqa $ABCD,$E
sha1rnds4 \$`int(($i+1)/5)`,$E_,$ABCD
sha1nexte @MSG[2],$E
pxor @MSG[3],@MSG[1]
sha1msg2 @MSG[0],@MSG[1]
___
	push(@MSG,shift(@MSG));	push(@MSG,shift(@MSG));
}
# Rounds 64-79, interleaved with loading and byte-swapping the next block.
# The flags tested by the closing jnz are still those of the "dec $num" at
# the top of the loop.
$code.=<<___;
movdqu ($inp),@MSG[0]
movdqa $ABCD,$E_
sha1rnds4 \$3,$E,$ABCD # 64-67
sha1nexte @MSG[1],$E_
movdqu 0x10($inp),@MSG[1]
pshufb $BSWAP,@MSG[0]
movdqa $ABCD,$E
sha1rnds4 \$3,$E_,$ABCD # 68-71
sha1nexte @MSG[2],$E
movdqu 0x20($inp),@MSG[2]
pshufb $BSWAP,@MSG[1]
movdqa $ABCD,$E_
sha1rnds4 \$3,$E,$ABCD # 72-75
sha1nexte @MSG[3],$E_
movdqu 0x30($inp),@MSG[3]
pshufb $BSWAP,@MSG[2]
movdqa $ABCD,$E
sha1rnds4 \$3,$E_,$ABCD # 76-79
sha1nexte $E_SAVE,$E
pshufb $BSWAP,@MSG[3]
paddd $ABCD_SAVE,$ABCD
movdqa $E,$E_SAVE # offload $E
jnz .Loop_shaext
pshufd \$0b00011011,$ABCD,$ABCD
pshufd \$0b00011011,$E,$E
movdqu $ABCD,($ctx)
movd $E,16($ctx)
___
# Win64 only: restore %xmm6-%xmm9 and the stack pointer through %rax.
$code.=<<___ if ($win64);
	movaps	-8-4*16(%rax),%xmm6
	movaps	-8-3*16(%rax),%xmm7
	movaps	-8-2*16(%rax),%xmm8
	movaps	-8-1*16(%rax),%xmm9
	mov	%rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
ret
.size sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
___
}}}
{{{
my
$Xi
=
4
;
my
@X
=
map
("
%xmm
$_
",(
4
..
7
,
0
..
3
));
my
@Tx
=
map
("
%xmm
$_
",(
8
..
10
));
...
...
@@ -1646,6 +1767,7 @@ K_XX_XX:
.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
___
}}}
$code
.=
<<___;
...
...
@@ -1706,6 +1828,39 @@ se_handler:
jmp .Lcommon_seh_tail
.size se_handler,.-se_handler
.type shaext_handler,\@abi-omnipotent
.align 16
shaext_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
lea .Lprologue_shaext(%rip),%r10
cmp %r10,%rbx # context->Rip<.Lprologue
jb .Lcommon_seh_tail
lea .Lepilogue_shaext(%rip),%r10
cmp %r10,%rbx # context->Rip>=.Lepilogue
jae .Lcommon_seh_tail
lea -8-4*16(%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
mov \$8,%ecx
.long 0xa548f3fc # cld; rep movsq
jmp .Lcommon_seh_tail
.size shaext_handler,.-shaext_handler
.type ssse3_handler,\@abi-omnipotent
.align 16
ssse3_handler:
...
...
@@ -1801,6 +1956,9 @@ ssse3_handler:
.rva .LSEH_begin_sha1_block_data_order
.rva .LSEH_end_sha1_block_data_order
.rva .LSEH_info_sha1_block_data_order
.rva .LSEH_begin_sha1_block_data_order_shaext
.rva .LSEH_end_sha1_block_data_order_shaext
.rva .LSEH_info_sha1_block_data_order_shaext
.rva .LSEH_begin_sha1_block_data_order_ssse3
.rva .LSEH_end_sha1_block_data_order_ssse3
.rva .LSEH_info_sha1_block_data_order_ssse3
...
...
@@ -1821,6 +1979,9 @@ $code.=<<___;
.LSEH_info_sha1_block_data_order:
.byte 9,0,0,0
.rva se_handler
.LSEH_info_sha1_block_data_order_shaext:
.byte 9,0,0,0
.rva shaext_handler
.LSEH_info_sha1_block_data_order_ssse3:
.byte 9,0,0,0
.rva ssse3_handler
...
...
@@ -1842,6 +2003,41 @@ ___
####################################################################
$code
=~
s/\`([^\`]*)\`/eval $1/g
em
;
print
$code
;
# sha1rnds4($operands)
#
# Hand-encodes "sha1rnds4 $imm,%xmmSRC,%xmmDST" as a .byte directive so the
# output assembles with tools that do not know the SHA extensions:
# [REX,] 0x0f,0x3a,0xcc, ModR/M, imm8.
#
# The register numbers are matched in full and a REX prefix is emitted for
# %xmm8-%xmm15 (0x04 for the destination, 0x01 for the source).  Matching
# only a single digit 0-7 would read a trailing "%xmm10" as "%xmm1" and emit
# the wrong register, and would leave every %xmm8/%xmm9 operand un-encoded.
# An immediate written with a leading 0 goes through oct() so hex/octal
# spellings become plain numbers.
#
# Returns the .byte directive, or the mnemonic and operands unchanged when
# the operands are not "immediate, xmm, xmm" with register numbers 0-15.
sub sha1rnds4 {
    my $operands = $_[0];

    if ($operands =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/
        && $2 < 16 && $3 < 16) {
	# Copy the captures before the next pattern match resets them.
	my ($imm,$src,$dst)=($1,$2,$3);
	my @opcode=(0x0f,0x3a,0xcc);
	my $rex=0;
	$rex|=0x04			if ($dst>=8);
	$rex|=0x01			if ($src>=8);
	unshift @opcode,$rex|0x40	if ($rex);
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	push @opcode,$imm=~/^0/?oct($imm):$imm;
	return ".byte\t".join(',',@opcode);
    } else {
	return "sha1rnds4\t".$operands;
    }
}
# sha1op38($instr, $operands)
#
# Hand-encodes the two-operand SHA1 instructions of the 0x0f,0x38 opcode map
# (sha1nexte, sha1msg1, sha1msg2) as a .byte directive:
# [REX,] 0x0f,0x38, opcode, ModR/M.
#
# Register numbers are matched in full and a REX prefix is emitted for
# %xmm8-%xmm15 (0x04 for the destination, 0x01 for the source).  This file
# allocates %xmm8/%xmm9 as save registers and uses one of them as a
# sha1nexte source; a single-digit 0-7 match left that instruction
# un-encoded and would mis-read a trailing "%xmm1x" as "%xmm1".
#
# Returns the .byte directive, or "mnemonic<TAB>operands" unchanged for an
# unknown mnemonic or operands that are not two xmm registers numbered 0-15.
sub sha1op38 {
    my $instr = shift;
    my $operands = $_[0];
    my %opcodelet = (
		"sha1nexte" => 0xc8,
		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);

    if (defined($opcodelet{$instr})
        && $operands =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/
        && $1 < 16 && $2 < 16) {
	my ($src,$dst)=($1,$2);
	my @opcode=(0x0f,0x38);
	my $rex=0;
	$rex|=0x04			if ($dst>=8);
	$rex|=0x01			if ($src>=8);
	unshift @opcode,$rex|0x40	if ($rex);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$operands;
    }
}
# Final pass over the generated text, one line at a time: evaluate every
# `...` expression, byte-encode SHA1 instructions (sha1rnds4 first, then any
# other sha1* mnemonic), and print the result.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/ge		or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/ge;

	print $_,"\n";
}

# Output is buffered, so a failed write (full disk, closed pipe) may only
# surface here.  Fail loudly instead of leaving a truncated assembly file.
close STDOUT or die "error closing STDOUT: $!";
crypto/sha/asm/sha256-586.pl
浏览文件 @
619b9466
...
...
@@ -34,6 +34,10 @@
# (Biggest improvement coefficient is on upcoming Atom Silvermont,
# not shown.) Add AVX+BMI code path.
#
# March 2014.
#
# Add support for Intel SHA Extensions.
#
# Performance in clock cycles per processed byte (less is better):
#
# gcc icc x86 asm(*) SIMD x86_64 asm(**)
...
...
@@ -196,8 +200,13 @@ sub BODY_00_15() {
&mov
("
ebx
",
&DWP
(
4
,"
edx
"));
&test
("
ecx
",
1
<<
20
);
# check for P4
&jnz
(
&label
("
loop
"));
&mov
("
edx
",
&DWP
(
8
,"
edx
"))
if
(
$xmm
);
&test
("
ecx
",
1
<<
24
);
# check for FXSR
&jz
(
$unroll_after
?
&label
("
no_xmm
"):
&label
("
loop
"));
&and
("
ecx
",
1
<<
30
);
# mask "Intel CPU" bit
&and
("
ebx
",
1
<<
28
|
1
<<
9
);
# mask AVX and SSSE3 bits
&test
("
edx
",
1
<<
29
)
if
(
$xmm
);
# check for SHA
&jnz
(
&label
("
shaext
"))
if
(
$xmm
);
&or
("
ecx
","
ebx
");
&and
("
ecx
",
1
<<
28
|
1
<<
30
);
&cmp
("
ecx
",
1
<<
28
|
1
<<
30
);
...
...
@@ -209,6 +218,7 @@ sub BODY_00_15() {
&je
(
&label
("
loop_shrd
"));
}
if
(
$unroll_after
)
{
&set_label
("
no_xmm
");
&sub
("
eax
","
edi
");
&cmp
("
eax
",
$unroll_after
);
&jae
(
&label
("
unrolled
"));
...
...
@@ -495,6 +505,146 @@ my @AH=($A,$K256);
&function_end_A
();
}
if
(
!
$i386
&&
$xmm
)
{{{
{
######################################################################
# Intel SHA Extensions implementation of SHA256 update function.
#
my
(
$ctx
,
$inp
,
$end
)
=
("
esi
","
edi
","
eax
");
my
(
$Wi
,
$ABEF
,
$CDGH
,
$TMP
)
=
map
("
xmm
$_
",(
0
..
2
,
7
));
my
@MSG
=
map
("
xmm
$_
",(
3
..
6
));
# sha256op38($opcodelet, $dst, $src)
#
# Emits a SHA256 instruction from the 0x0f,0x38 opcode map as raw bytes via
# data_byte(): 0x0f, 0x38, the opcode byte and a ModR/M byte with the
# destination register number in bits 3-5 and the source in bits 0-2.
# Only xmm0-xmm7 register pairs are recognised; for any other operand
# combination nothing is emitted.
sub sha256op38 {
    my ($opcodelet, $dst, $src) = @_;

    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) {
	my $modrm = 0xc0 | ($1 << 3) | $2;
	&data_byte(0x0f, 0x38, $opcodelet, $modrm);
    }
}
# Thin wrappers that pair each SHA256 mnemonic with its opcode byte and pass
# the (dst, src) operands on to sha256op38().
sub sha256rnds2 {
    return sha256op38(0xcb, @_);
}

sub sha256msg1 {
    return sha256op38(0xcc, @_);
}

sub sha256msg2 {
    return sha256op38(0xcd, @_);
}
&set_label
("
shaext
",
32
);
&sub
("
esp
",
32
);
&movdqu
(
$ABEF
,
&QWP
(
0
,
$ctx
));
# DCBA
&lea
(
$K256
,
&DWP
(
0x80
,
$K256
));
&movdqu
(
$CDGH
,
&QWP
(
16
,
$ctx
));
# HGFE
&movdqa
(
$TMP
,
&QWP
(
0x100
-
0x80
,
$K256
));
# byte swap mask
&pshufd
(
$Wi
,
$ABEF
,
0x1b
);
# ABCD
&pshufd
(
$ABEF
,
$ABEF
,
0xb1
);
# CDAB
&pshufd
(
$CDGH
,
$CDGH
,
0x1b
);
# EFGH
&palignr
(
$ABEF
,
$CDGH
,
8
);
# ABEF
&punpcklqdq
(
$CDGH
,
$Wi
);
# CDGH
&jmp
(
&label
("
loop_shaext
"));
&set_label
("
loop_shaext
",
16
);
&movdqu
(
@MSG
[
0
],
&QWP
(
0
,
$inp
));
&movdqu
(
@MSG
[
1
],
&QWP
(
0x10
,
$inp
));
&movdqu
(
@MSG
[
2
],
&QWP
(
0x20
,
$inp
));
&pshufb
(
@MSG
[
0
],
$TMP
);
&movdqu
(
@MSG
[
3
],
&QWP
(
0x30
,
$inp
));
&movdqa
(
&QWP
(
16
,"
esp
"),
$CDGH
);
# offload
&movdqa
(
$Wi
,
&QWP
(
0
*
16
-
0x80
,
$K256
));
&paddd
(
$Wi
,
@MSG
[
0
]);
&pshufb
(
@MSG
[
1
],
$TMP
);
&sha256rnds2
(
$CDGH
,
$ABEF
);
# 0-3
&pshufd
(
$Wi
,
$Wi
,
0x0e
);
&nop
();
&movdqa
(
&QWP
(
0
,"
esp
"),
$ABEF
);
# offload
&sha256rnds2
(
$ABEF
,
$CDGH
);
&movdqa
(
$Wi
,
&QWP
(
1
*
16
-
0x80
,
$K256
));
&paddd
(
$Wi
,
@MSG
[
1
]);
&pshufb
(
@MSG
[
2
],
$TMP
);
&sha256rnds2
(
$CDGH
,
$ABEF
);
# 4-7
&pshufd
(
$Wi
,
$Wi
,
0x0e
);
&lea
(
$inp
,
&DWP
(
0x40
,
$inp
));
&sha256msg1
(
@MSG
[
0
],
@MSG
[
1
]);
&sha256rnds2
(
$ABEF
,
$CDGH
);
&movdqa
(
$Wi
,
&QWP
(
2
*
16
-
0x80
,
$K256
));
&paddd
(
$Wi
,
@MSG
[
2
]);
&pshufb
(
@MSG
[
3
],
$TMP
);
&sha256rnds2
(
$CDGH
,
$ABEF
);
# 8-11
&pshufd
(
$Wi
,
$Wi
,
0x0e
);
&movdqa
(
$TMP
,
@MSG
[
3
]);
&palignr
(
$TMP
,
@MSG
[
2
],
4
);
&nop
();
&paddd
(
@MSG
[
0
],
$TMP
);
&sha256msg1
(
@MSG
[
1
],
@MSG
[
2
]);
&sha256rnds2
(
$ABEF
,
$CDGH
);
&movdqa
(
$Wi
,
&QWP
(
3
*
16
-
0x80
,
$K256
));
&paddd
(
$Wi
,
@MSG
[
3
]);
&sha256msg2
(
@MSG
[
0
],
@MSG
[
3
]);
&sha256rnds2
(
$CDGH
,
$ABEF
);
# 12-15
&pshufd
(
$Wi
,
$Wi
,
0x0e
);
&movdqa
(
$TMP
,
@MSG
[
0
]);
&palignr
(
$TMP
,
@MSG
[
3
],
4
);
&nop
();
&paddd
(
@MSG
[
1
],
$TMP
);
&sha256msg1
(
@MSG
[
2
],
@MSG
[
3
]);
&sha256rnds2
(
$ABEF
,
$CDGH
);
for
(
$i
=
4
;
$i
<
16
-
3
;
$i
++
)
{
&movdqa
(
$Wi
,
&QWP
(
$i
*
16
-
0x80
,
$K256
));
&paddd
(
$Wi
,
@MSG
[
0
]);
&sha256msg2
(
@MSG
[
1
],
@MSG
[
0
]);
&sha256rnds2
(
$CDGH
,
$ABEF
);
# 16-19...
&pshufd
(
$Wi
,
$Wi
,
0x0e
);
&movdqa
(
$TMP
,
@MSG
[
1
]);
&palignr
(
$TMP
,
@MSG
[
0
],
4
);
&nop
();
&paddd
(
@MSG
[
2
],
$TMP
);
&sha256msg1
(
@MSG
[
3
],
@MSG
[
0
]);
&sha256rnds2
(
$ABEF
,
$CDGH
);
push
(
@MSG
,
shift
(
@MSG
));
}
&movdqa
(
$Wi
,
&QWP
(
13
*
16
-
0x80
,
$K256
));
&paddd
(
$Wi
,
@MSG
[
0
]);
&sha256msg2
(
@MSG
[
1
],
@MSG
[
0
]);
&sha256rnds2
(
$CDGH
,
$ABEF
);
# 52-55
&pshufd
(
$Wi
,
$Wi
,
0x0e
);
# These are two separate instructions.  Without the terminating ';' after
# the first call Perl parses the pair as one expression,
# "&movdqa(...) & palignr(...)" -- a bitwise AND of both return values --
# which emits the intended instructions only as a side effect of operand
# evaluation order.
&movdqa		($TMP,@MSG[1]);
&palignr	($TMP,@MSG[0],4);
&sha256rnds2
(
$ABEF
,
$CDGH
);
&paddd
(
@MSG
[
2
],
$TMP
);
&movdqa
(
$Wi
,
&QWP
(
14
*
16
-
0x80
,
$K256
));
&paddd
(
$Wi
,
@MSG
[
1
]);
&sha256rnds2
(
$CDGH
,
$ABEF
);
# 56-59
&pshufd
(
$Wi
,
$Wi
,
0x0e
);
&sha256msg2
(
@MSG
[
2
],
@MSG
[
1
]);
&movdqa
(
$TMP
,
&QWP
(
0x100
-
0x80
,
$K256
));
# byte swap mask
&sha256rnds2
(
$ABEF
,
$CDGH
);
&movdqa
(
$Wi
,
&QWP
(
15
*
16
-
0x80
,
$K256
));
&paddd
(
$Wi
,
@MSG
[
2
]);
&nop
();
&sha256rnds2
(
$CDGH
,
$ABEF
);
# 60-63
&pshufd
(
$Wi
,
$Wi
,
0x0e
);
&cmp
(
$end
,
$inp
);
&nop
();
&sha256rnds2
(
$ABEF
,
$CDGH
);
&paddd
(
$CDGH
,
&QWP
(
16
,"
esp
"));
&paddd
(
$ABEF
,
&QWP
(
0
,"
esp
"));
&jnz
(
&label
("
loop_shaext
"));
&pshufd
(
$CDGH
,
$CDGH
,
0xb1
);
# DCHG
&pshufd
(
$TMP
,
$ABEF
,
0x1b
);
# FEBA
&pshufd
(
$ABEF
,
$ABEF
,
0xb1
);
# BAFE
&punpckhqdq
(
$ABEF
,
$CDGH
);
# DCBA
&palignr
(
$CDGH
,
$TMP
,
8
);
# HGFE
&mov
("
esp
",
&DWP
(
32
+
12
,"
esp
"));
&movdqu
(
&QWP
(
0
,
$ctx
),
$ABEF
);
&movdqu
(
&QWP
(
16
,
$ctx
),
$CDGH
);
&function_end_A
();
}
my
@X
=
map
("
xmm
$_
",(
0
..
3
));
my
(
$t0
,
$t1
,
$t2
,
$t3
)
=
map
("
xmm
$_
",(
4
..
7
));
my
@AH
=
(
$A
,
$T
);
...
...
@@ -811,7 +961,6 @@ sub body_00_15 () {
if
(
$avx
)
{
&set_label
("
AVX
",
32
);
if
(
$avx
>
1
)
{
&mov
("
edx
",
&DWP
(
8
,"
edx
"));
&and
("
edx
",
1
<<
8
|
1
<<
3
);
# check for BMI2+BMI1
&cmp
("
edx
",
1
<<
8
|
1
<<
3
);
&je
(
&label
("
AVX_BMI
"));
...
...
crypto/sha/asm/sha256-mb-x86_64.pl
浏览文件 @
619b9466
此差异已折叠。
点击以展开。
crypto/sha/asm/sha512-x86_64.pl
浏览文件 @
619b9466
...
...
@@ -67,7 +67,12 @@
# significant 128-bit halves and data from second to most significant.
# The data is then processed with same SIMD instruction sequence as
# for AVX, but with %ymm as operands. Side effect is increased stack
# frame, 448 additional bytes in SHA256 and 1152 in SHA512.
# frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB
# code size increase.
#
# March 2014.
#
# Add support for Intel SHA Extensions.
######################################################################
# Current performance in cycles per processed byte (less is better):
...
...
@@ -254,6 +259,10 @@ $code.=<<___ if ($SZ==4 || $avx);
mov
4
(
%r11
),
%r10d
mov
8
(
%r11
),
%r11d
___
$code
.=<<
___
if
(
$SZ
==
4
);
test
\
$
`
1<<29
`,
%r11d
# check for SHA
jnz
_shaext_shortcut
___
$code
.=<<
___
if
(
$avx
&&
$SZ
==
8
);
test
\
$
`
1<<11
`,
%r10d
# check for XOP
jnz
.
Lxop_shortcut
...
...
@@ -509,6 +518,166 @@ ___
######################################################################
# SIMD code paths
#
if ($SZ==4) {{{
######################################################################
# Intel SHA Extensions implementation of SHA256 update function.
#
# Emitted only for the SHA256 flavour ($SZ==4).  Register roles as assigned
# below: $ctx/$inp/$num are the three arguments and $Tbl (%rcx) points into
# the K256 table; $Wi is %xmm0, $ABEF/$CDGH (%xmm1/%xmm2) hold the working
# state, $TMP is %xmm7, $BSWAP (%xmm8) keeps a copy of the byte-swap mask,
# and $ABEF_SAVE/$CDGH_SAVE (%xmm9/%xmm10) keep the state from the start of
# the block for the final additions.  @MSG (%xmm3-%xmm6) is the message
# window.
#
my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
my @MSG=map("%xmm$_",(3..6));

$code.=<<___;
.type sha256_block_data_order_shaext,\@function,3
.align 64
sha256_block_data_order_shaext:
_shaext_shortcut:
___
# Win64 only: %xmm6-%xmm10 are saved below the address held in %rax.
# NOTE(review): %rax is not loaded in this block; it is presumably set to the
# entry %rsp by the translator-generated function prologue -- confirm.
$code.=<<___ if ($win64);
	lea	`-8-5*16`(%rsp),%rsp
	movaps	%xmm6,-8-5*16(%rax)
	movaps	%xmm7,-8-4*16(%rax)
	movaps	%xmm8,-8-3*16(%rax)
	movaps	%xmm9,-8-2*16(%rax)
	movaps	%xmm10,-8-1*16(%rax)
.Lprologue_shaext:
___
# State load and reshuffle into the ABEF/CDGH layout, followed by the first
# 16 rounds of the loop body.  Each group of four rounds adds one table row
# (32 bytes apart, addressed relative to K256+0x80) to a message register
# and issues two sha256rnds2.  $TMP doubles as the byte-swap mask while the
# block is being loaded and as scratch afterwards.
$code.=<<___;
lea K256+0x80(%rip),$Tbl
movdqu ($ctx),$ABEF # DCBA
movdqu 16($ctx),$CDGH # HGFE
movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
pshufd \$0x1b,$ABEF,$Wi # ABCD
pshufd \$0xb1,$ABEF,$ABEF # CDAB
pshufd \$0x1b,$CDGH,$CDGH # EFGH
movdqa $TMP,$BSWAP # offload
palignr \$8,$CDGH,$ABEF # ABEF
punpcklqdq $Wi,$CDGH # CDGH
jmp .Loop_shaext
.align 16
.Loop_shaext:
movdqu ($inp),@MSG[0]
movdqu 0x10($inp),@MSG[1]
movdqu 0x20($inp),@MSG[2]
pshufb $TMP,@MSG[0]
movdqu 0x30($inp),@MSG[3]
movdqa 0*32-0x80($Tbl),$Wi
paddd @MSG[0],$Wi
pshufb $TMP,@MSG[1]
movdqa $CDGH,$CDGH_SAVE # offload
sha256rnds2 $ABEF,$CDGH # 0-3
pshufd \$0x0e,$Wi,$Wi
nop
movdqa $ABEF,$ABEF_SAVE # offload
sha256rnds2 $CDGH,$ABEF
movdqa 1*32-0x80($Tbl),$Wi
paddd @MSG[1],$Wi
pshufb $TMP,@MSG[2]
sha256rnds2 $ABEF,$CDGH # 4-7
pshufd \$0x0e,$Wi,$Wi
lea 0x40($inp),$inp
sha256msg1 @MSG[1],@MSG[0]
sha256rnds2 $CDGH,$ABEF
movdqa 2*32-0x80($Tbl),$Wi
paddd @MSG[2],$Wi
pshufb $TMP,@MSG[3]
sha256rnds2 $ABEF,$CDGH # 8-11
pshufd \$0x0e,$Wi,$Wi
movdqa @MSG[3],$TMP
palignr \$4,@MSG[2],$TMP
nop
paddd $TMP,@MSG[0]
sha256msg1 @MSG[2],@MSG[1]
sha256rnds2 $CDGH,$ABEF
movdqa 3*32-0x80($Tbl),$Wi
paddd @MSG[3],$Wi
sha256msg2 @MSG[3],@MSG[0]
sha256rnds2 $ABEF,$CDGH # 12-15
pshufd \$0x0e,$Wi,$Wi
movdqa @MSG[0],$TMP
palignr \$4,@MSG[3],$TMP
nop
paddd $TMP,@MSG[1]
sha256msg1 @MSG[3],@MSG[2]
sha256rnds2 $CDGH,$ABEF
___
# Rounds 16-51: nine iterations ($i = 4..12), four rounds each, rotating the
# message window by one register per iteration.
for($i=4;$i<16-3;$i++) {
$code.=<<___;
movdqa $i*32-0x80($Tbl),$Wi
paddd @MSG[0],$Wi
sha256msg2 @MSG[0],@MSG[1]
sha256rnds2 $ABEF,$CDGH # 16-19...
pshufd \$0x0e,$Wi,$Wi
movdqa @MSG[1],$TMP
palignr \$4,@MSG[0],$TMP
nop
paddd $TMP,@MSG[2]
sha256msg1 @MSG[0],@MSG[3]
sha256rnds2 $CDGH,$ABEF
___
	push(@MSG,shift(@MSG));
}
# Rounds 52-63, reload of the byte-swap mask into $TMP for the next block,
# the final state additions, and the loop branch.  No instruction between
# "dec $num" and "jnz" is a flag-setting integer operation, so the branch
# tests the decrement.  After the loop the state is shuffled back to the
# DCBA/HGFE layout it was loaded in and stored to the context.
$code.=<<___;
movdqa 13*32-0x80($Tbl),$Wi
paddd @MSG[0],$Wi
sha256msg2 @MSG[0],@MSG[1]
sha256rnds2 $ABEF,$CDGH # 52-55
pshufd \$0x0e,$Wi,$Wi
movdqa @MSG[1],$TMP
palignr \$4,@MSG[0],$TMP
sha256rnds2 $CDGH,$ABEF
paddd $TMP,@MSG[2]
movdqa 14*32-0x80($Tbl),$Wi
paddd @MSG[1],$Wi
sha256rnds2 $ABEF,$CDGH # 56-59
pshufd \$0x0e,$Wi,$Wi
sha256msg2 @MSG[1],@MSG[2]
movdqa $BSWAP,$TMP
sha256rnds2 $CDGH,$ABEF
movdqa 15*32-0x80($Tbl),$Wi
paddd @MSG[2],$Wi
nop
sha256rnds2 $ABEF,$CDGH # 60-63
pshufd \$0x0e,$Wi,$Wi
dec $num
nop
sha256rnds2 $CDGH,$ABEF
paddd $CDGH_SAVE,$CDGH
paddd $ABEF_SAVE,$ABEF
jnz .Loop_shaext
pshufd \$0xb1,$CDGH,$CDGH # DCHG
pshufd \$0x1b,$ABEF,$TMP # FEBA
pshufd \$0xb1,$ABEF,$ABEF # BAFE
punpckhqdq $CDGH,$ABEF # DCBA
palignr \$8,$TMP,$CDGH # HGFE
movdqu $ABEF,($ctx)
movdqu $CDGH,16($ctx)
___
# Win64 only: restore %xmm6-%xmm10 and the stack pointer through %rax.
$code.=<<___ if ($win64);
	movaps	-8-5*16(%rax),%xmm6
	movaps	-8-4*16(%rax),%xmm7
	movaps	-8-3*16(%rax),%xmm8
	movaps	-8-2*16(%rax),%xmm9
	movaps	-8-1*16(%rax),%xmm10
	mov	%rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
ret
.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
___
}}}
{{{
my
$a4
=
$T1
;
...
...
@@ -620,13 +789,13 @@ $code.=<<___;
movdqu 0x00($inp),@X[0]
movdqu 0x10($inp),@X[1]
movdqu 0x20($inp),@X[2]
movdqu 0x30($inp),@X[3]
pshufb $t3,@X[0]
movdqu 0x30($inp),@X[3]
lea $TABLE(%rip),$Tbl
pshufb $t3,@X[1]
movdqa 0x00($Tbl),$t0
pshufb $t3,@X[2]
movdqa 0x20($Tbl),$t1
pshufb $t3,@X[2]
paddd @X[0],$t0
movdqa 0x40($Tbl),$t2
pshufb $t3,@X[3]
...
...
@@ -2087,6 +2256,39 @@ $code.=<<___;
ret
.size se_handler,.-se_handler
.type shaext_handler,\@abi-omnipotent
.align 16
shaext_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
lea .Lprologue_shaext(%rip),%r10
cmp %r10,%rbx # context->Rip<.Lprologue
jb .Lin_prologue
lea .Lepilogue_shaext(%rip),%r10
cmp %r10,%rbx # context->Rip>=.Lepilogue
jae .Lin_prologue
lea -8-5*16(%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
mov \$10,%ecx
.long 0xa548f3fc # cld; rep movsq
jmp .Lin_prologue
.size shaext_handler,.-shaext_handler
.section .pdata
.align 4
.rva .LSEH_begin_$func
...
...
@@ -2094,6 +2296,9 @@ $code.=<<___;
.rva .LSEH_info_$func
___
$code
.=<<
___
if
(
$SZ
==
4
);
.
rva
.
LSEH_begin_$
{
func
}
_shaext
.
rva
.
LSEH_end_$
{
func
}
_shaext
.
rva
.
LSEH_info_$
{
func
}
_shaext
.
rva
.
LSEH_begin_$
{
func
}
_ssse3
.
rva
.
LSEH_end_$
{
func
}
_ssse3
.
rva
.
LSEH_info_$
{
func
}
_ssse3
...
...
@@ -2122,6 +2327,9 @@ $code.=<<___;
.rva .Lprologue,.Lepilogue # HandlerData[]
___
$code
.=<<
___
if
(
$SZ
==
4
);
.
LSEH_info_$
{
func
}
_shaext:
.
byte
9
,
0
,
0
,
0
.
rva
shaext_handler
.
LSEH_info_$
{
func
}
_ssse3:
.
byte
9
,
0
,
0
,
0
.
rva
se_handler
...
...
@@ -2147,6 +2355,28 @@ $code.=<<___ if ($avx>1);
___
}
$code
=~
s/\`([^\`]*)\`/eval $1/g
em
;
print
$code
;
# sha256op38($instr, $operands)
#
# Hand-encodes the SHA256 instructions of the 0x0f,0x38 opcode map
# (sha256rnds2, sha256msg1, sha256msg2) as a .byte directive:
# [REX,] 0x0f,0x38, opcode, ModR/M.
#
# Register numbers are matched in full and a REX prefix is emitted for
# %xmm8-%xmm15 (0x04 for the destination, 0x01 for the source).  Matching a
# single digit 0-7 without anchoring would read a trailing "%xmm10" as
# "%xmm1" and silently encode the wrong register.
#
# Returns the .byte directive, or "mnemonic<TAB>operands" unchanged for an
# unknown mnemonic or operands that are not two xmm registers numbered 0-15.
sub sha256op38 {
    my $instr = shift;
    my $operands = $_[0];
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

    if (defined($opcodelet{$instr})
        && $operands =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/
        && $1 < 16 && $2 < 16) {
	my ($src,$dst)=($1,$2);
	my @opcode=(0x0f,0x38);
	my $rex=0;
	$rex|=0x04			if ($dst>=8);
	$rex|=0x01			if ($src>=8);
	unshift @opcode,$rex|0x40	if ($rex);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$operands;
    }
}
# Final pass over the generated text, one line at a time: evaluate every
# `...` expression, byte-encode SHA256 instructions, and print the result.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/ge;

	print $_,"\n";
}

# Output is buffered, so a failed write (full disk, closed pipe) may only
# surface here.  Fail loudly instead of leaving a truncated assembly file.
close STDOUT or die "error closing STDOUT: $!";
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录