Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
OpenHarmony
Third Party Openssl
提交
9250a306
T
Third Party Openssl
项目概览
OpenHarmony
/
Third Party Openssl
1 年多 前同步成功
通知
10
Star
18
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
T
Third Party Openssl
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
9250a306
编写于
5月 04, 2014
作者:
A
Andy Polyakov
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
sha/asm/sha1-armv4-large.pl: add NEON and ARMv8 code paths.
sha/asm/sha256-armv4.pl: add ARMv8 code path.
上级
4afa9f03
变更
2
显示空白变更内容
内联
并排
Showing
2 changed file
with
550 addition
and
13 deletion
+550
-13
crypto/sha/asm/sha1-armv4-large.pl
crypto/sha/asm/sha1-armv4-large.pl
+432
-8
crypto/sha/asm/sha256-armv4.pl
crypto/sha/asm/sha256-armv4.pl
+118
-5
未找到文件。
crypto/sha/asm/sha1-armv4-large.pl
浏览文件 @
9250a306
#!/usr/bin/env perl
#!/usr/bin/env perl
# ====================================================================
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# details see http://www.openssl.org/~appro/cryptogams/.
...
@@ -52,6 +52,20 @@
...
@@ -52,6 +52,20 @@
# Profiler-assisted and platform-specific optimization resulted in 10%
# Profiler-assisted and platform-specific optimization resulted in 10%
# improvement on Cortex A8 core and 12.2 cycles per byte.
# improvement on Cortex A8 core and 12.2 cycles per byte.
# September 2013.
#
# Add NEON implementation (see sha1-586.pl for background info). On
# Cortex A8 it was measured to process one byte in 6.7 cycles or >80%
# faster than integer-only code. Because [fully unrolled] NEON code
# is ~2.5x larger and there are some redundant instructions executed
# when processing last block, improvement is not as big for smallest
# blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4 cycles per
# byte, which is also >80% faster than integer-only code.
# May 2014.
#
# Add ARMv8 code path performing at 2.35 cpb on Apple A7.
while
((
$output
=
shift
)
&&
(
$output
!~
/^\w[\w\-]*\.\w+$/
))
{}
while
((
$output
=
shift
)
&&
(
$output
!~
/^\w[\w\-]*\.\w+$/
))
{}
open
STDOUT
,"
>
$output
";
open
STDOUT
,"
>
$output
";
...
@@ -153,12 +167,22 @@ $code=<<___;
...
@@ -153,12 +167,22 @@ $code=<<___;
#include "arm_arch.h"
#include "arm_arch.h"
.text
.text
.code 32
.global sha1_block_data_order
.global sha1_block_data_order
.type sha1_block_data_order,%function
.type sha1_block_data_order,%function
.align
2
.align
5
sha1_block_data_order:
sha1_block_data_order:
#if __ARM_ARCH__>=7
sub r3,pc,#8 @ sha1_block_data_order
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
tst r12,#8
bne .LARMv8
tst r12,#1
bne .LNEON
#endif
stmdb sp!,{r4-r12,lr}
stmdb sp!,{r4-r12,lr}
add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
ldmia $ctx,{$a,$b,$c,$d,$e}
ldmia $ctx,{$a,$b,$c,$d,$e}
...
@@ -233,16 +257,416 @@ $code.=<<___;
...
@@ -233,16 +257,416 @@ $code.=<<___;
moveq pc,lr @ be binary compatible with V4, yet
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
bx lr @ interoperable with Thumb ISA:-)
#endif
#endif
.align 2
.size sha1_block_data_order,.-sha1_block_data_order
.align 5
.LK_00_19: .word 0x5a827999
.LK_00_19: .word 0x5a827999
.LK_20_39: .word 0x6ed9eba1
.LK_20_39: .word 0x6ed9eba1
.LK_40_59: .word 0x8f1bbcdc
.LK_40_59: .word 0x8f1bbcdc
.LK_60_79: .word 0xca62c1d6
.LK_60_79: .word 0xca62c1d6
.size sha1_block_data_order,.-sha1_block_data_order
.LOPENSSL_armcap:
.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.word OPENSSL_armcap_P-sha1_block_data_order
.align 2
.asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
___
#####################################################################
# NEON stuff
#
{{{
# Names used by the NEON code path.  Every value is a plain string that is
# interpolated into the generated assembler text.
my @V=($a,$b,$c,$d,$e);		# rotated by the round bodies after each round
my ($K_XX_XX,$Ki,$t0,$t1,$Xfer,$saved_sp)=map("r$_",(8..12,14));	# r8..r12, r14
my $Xi=4;			# advanced by the Xupdate_*/Xloop subs
my @X=map("q$_",(8..11,0..3));	# q8..q11 followed by q0..q3
my @Tx=("q12","q13");
my ($K,$zero)=("q14","q15");
my $j=0;			# advanced by the round bodies, indexes [sp,#4*(j&15)]
# Thunk for [simplified] x86-style perlasm: any call to an undefined sub,
# e.g. &vadd_i32(...), lands here and is turned into one line of assembler
# text appended to $code.
#   mnemonic - name of the called sub with the package qualifier stripped
#              and its first "_" replaced by "." (vadd_i32 -> vadd.i32)
#   operands - the call's arguments joined with ","; the last argument gets
#              a "#" prefix when it is a plain number (an immediate)
sub AUTOLOAD()
{ my $opcode = $AUTOLOAD;
  $opcode =~ s/.*:://;		# drop the package prefix
  $opcode =~ s/_/\./;		# first underscore only, there is no /g
  my $arg = pop;
    # only a plain number survives the numeric round-trip unchanged
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
# Returns the scalar instructions of one round as a list of Perl snippets.
# The Xupdate_*/Xloop subs eval the snippets one at a time between NEON
# instructions.  The first snippet loads ($a..$e) from @V, the last one
# advances $j and rotates @V for the next round.
sub body_00_19 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.		# '$code.="@ $j\n";'.
	'&bic ($t0,$d,$b)',
	'&add ($e,$e,$Ki)',		# e+=X[i]+K
	'&and ($t1,$c,$b)',
	'&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
	'&add ($e,$e,$a,"ror#27")',	# e+=ROR(A,27)
	'&eor ($t1,$t1,$t0)',		# F_00_19
	'&mov ($b,$b,"ror#2")',		# b=ROR(b,2)
	'&add ($e,$e,$t1);'.		# e+=F_00_19
	'$j++; unshift(@V,pop(@V));'
	)
}
# Returns the scalar instructions of one round as a list of Perl snippets,
# to be eval'ed one at a time by the Xupdate_*/Xloop subs.  The load of the
# next stack word is skipped once $j has reached 79.
sub body_20_39 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.		# '$code.="@ $j\n";'.
	'&eor ($t0,$b,$d)',
	'&add ($e,$e,$Ki)',		# e+=X[i]+K
	'&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15)) if ($j<79)',
	'&eor ($t1,$t0,$c)',		# F_20_39
	'&add ($e,$e,$a,"ror#27")',	# e+=ROR(A,27)
	'&mov ($b,$b,"ror#2")',		# b=ROR(b,2)
	'&add ($e,$e,$t1);'.		# e+=F_20_39
	'$j++; unshift(@V,pop(@V));'
	)
}
# Returns the scalar instructions of one round as a list of Perl snippets,
# to be eval'ed one at a time by the Xupdate_*/Xloop subs.  The first
# snippet loads ($a..$e) from @V, the last one advances $j and rotates @V.
sub body_40_59 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.		# '$code.="@ $j\n";'.
	'&add ($e,$e,$Ki)',		# e+=X[i]+K
	'&and ($t0,$c,$d)',
	'&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
	'&add ($e,$e,$a,"ror#27")',	# e+=ROR(A,27)
	'&eor ($t1,$c,$d)',
	'&add ($e,$e,$t0)',
	'&and ($t1,$t1,$b)',
	'&mov ($b,$b,"ror#2")',		# b=ROR(b,2)
	'&add ($e,$e,$t1);'.		# e+=F_40_59
	'$j++; unshift(@V,pop(@V));'
	)
}
# Emits one step of NEON work on the @X registers, interleaved with the
# scalar instructions of four rounds.
#   $body - code ref (\&body_00_19 etc.) returning one round as a list of
#           Perl snippets; it is called four times up front.
# The snippets are string-eval'ed here on purpose: they refer to the lexical
# ($a..$e) declared below and to the file-level @V, $j, $t0, $t1 and $Ki, so
# the evals must stay inside this sub.  NEON mnemonics are emitted through
# AUTOLOAD.  On exit $Xi is incremented and @X is rotated by one register.
sub Xupdate_16_31 ()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e);

	&vext_8		(@X[0],@X[-4&7],@X[-3&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@Tx[1],@X[-1&7],$K);
	 eval(shift(@insns));
	  &vld1_32	("{$K\[]}","[$K_XX_XX,:32]!")	if ($Xi%5==0);
	 eval(shift(@insns));
	&vext_8		(@Tx[0],@X[-1&7],$zero,4);	# "X[-3]", 3 words
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		(@Tx[0],@Tx[0],@X[0]);		# "X[0]"^="X[-3]"^"X[-8]
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vst1_32	("{@Tx[1]}","[$Xfer,:128]!");	# X[]+K xfer
	  &sub		($Xfer,$Xfer,64)		if ($Xi%4==0);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		(@Tx[1],$zero,@Tx[0],4);	# "X[0]"<<96, extract one dword
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@Tx[0],@Tx[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsri_32	(@X[0],@Tx[0],31);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	(@Tx[0],@Tx[1],30);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshl_u32	(@Tx[1],@Tx[1],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		(@X[0],@X[0],@Tx[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		(@X[0],@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2

	foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}
# Emits one step of NEON work on the @X registers, interleaved with the
# scalar instructions of four rounds.
#   $body - code ref returning one round as a list of Perl snippets; it is
#           called four times up front.
# The snippets are string-eval'ed here on purpose: they refer to the lexical
# ($a..$e) declared below and to file-level variables, so the evals must stay
# inside this sub.  On exit $Xi is incremented and @X is rotated by one
# register.
sub Xupdate_32_79 ()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e);

	&vext_8		(@Tx[0],@X[-2&7],@X[-1&7],8);	# compose "X[-6]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@Tx[1],@X[-1&7],$K);
	 eval(shift(@insns));
	  &vld1_32	("{$K\[]}","[$K_XX_XX,:32]!")	if ($Xi%5==0);
	 eval(shift(@insns));
	&veor		(@Tx[0],@Tx[0],@X[0]);		# "X[-6]"^="X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	(@X[0],@Tx[0],30);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vst1_32	("{@Tx[1]}","[$Xfer,:128]!");	# X[]+K xfer
	  &sub		($Xfer,$Xfer,64)		if ($Xi%4==0);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	(@X[0],@Tx[0],2);		# "X[0]"="X[-6]"<<<2

	foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}
# Emits the last X[]+K transfer of a block, then starts loading the next
# input block, interleaved with the scalar instructions of four rounds.
#   $body - code ref returning one round as a list of Perl snippets; it is
#           called four times up front.
# The emitted teq compares $inp with $len; the conditional subeq that
# follows steps $inp back by 64 when they are equal.  The snippets are
# string-eval'ed inside this sub because they use the lexical ($a..$e)
# declared below.  On exit $Xi is reset to 0; @X is not rotated.
sub Xuplast_80 ()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e);

	&vadd_i32	(@Tx[1],@X[-1&7],$K);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vst1_32	("{@Tx[1]}","[$Xfer,:128]!");
	&sub		($Xfer,$Xfer,64);

	&teq		($inp,$len);
	&sub		($K_XX_XX,$K_XX_XX,16);	# rewind $K_XX_XX
	&subeq		($inp,$inp,64);		# reload last block to avoid SEGV
	&vld1_8		("{@X[-4&7]-@X[-3&7]}","[$inp]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_8		("{@X[-2&7]-@X[-1&7]}","[$inp]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$K\[]}","[$K_XX_XX,:32]!");	# load K_00_19
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[-4&7],@X[-4&7]);

	foreach (@insns) { eval; }		# remaining instructions

   $Xi=0;
}
# Emits the byte swap, +K addition and store for one @X register, selected
# by $Xi, interleaved with the scalar instructions of four rounds.
#   $body - code ref returning one round as a list of Perl snippets; it is
#           called four times up front.
# The snippets are string-eval'ed inside this sub because they use the
# lexical ($a..$e) declared below.  On exit $Xi is incremented; @X is not
# rotated.
sub Xloop()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e);

	&vrev32_8	(@X[($Xi-3)&7],@X[($Xi-3)&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[$Xi&7],@X[($Xi-4)&7],$K);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vst1_32	("{@X[$Xi&7]}","[$Xfer,:128]!");	# X[]+K xfer to IALU

	foreach (@insns) { eval; }

  $Xi++;
}
$code
.=
<<___;
#if __ARM_ARCH__>=7
.fpu neon
.type sha1_block_data_order_neon,%function
.align 4
sha1_block_data_order_neon:
.LNEON:
stmdb sp!,{r4-r12,lr}
add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
@ dmb @ errata #451034 on early Cortex A8
@ vstmdb sp!,{d8-d15} @ ABI specification says so
mov $saved_sp,sp
sub sp,sp,#64 @ alloca
adr $K_XX_XX,.LK_00_19
bic sp,sp,#15 @ align for 128-bit stores
ldmia $ctx,{$a,$b,$c,$d,$e} @ load context
mov $Xfer,sp
vld1.8 {@X[-4&7]-@X[-3&7]},[$inp]! @ handles unaligned
veor $zero,$zero,$zero
vld1.8 {@X[-2&7]-@X[-1&7]},[$inp]!
vld1.32 {${K}\[]},[$K_XX_XX,:32]! @ load K_00_19
vrev32.8 @X[-4&7],@X[-4&7] @ yes, even on
vrev32.8 @X[-3&7],@X[-3&7] @ big-endian...
vrev32.8 @X[-2&7],@X[-2&7]
vadd.i32 @X[0],@X[-4&7],$K
vrev32.8 @X[-1&7],@X[-1&7]
vadd.i32 @X[1],@X[-3&7],$K
vst1.32 {@X[0]},[$Xfer,:128]!
vadd.i32 @X[2],@X[-2&7],$K
vst1.32 {@X[1]},[$Xfer,:128]!
vst1.32 {@X[2]},[$Xfer,:128]!
ldr $Ki,[sp] @ big RAW stall
.Loop_neon:
___
&Xupdate_16_31
(
\
&body_00_19
);
&Xupdate_16_31
(
\
&body_00_19
);
&Xupdate_16_31
(
\
&body_00_19
);
&Xupdate_16_31
(
\
&body_00_19
);
&Xupdate_32_79
(
\
&body_00_19
);
&Xupdate_32_79
(
\
&body_20_39
);
&Xupdate_32_79
(
\
&body_20_39
);
&Xupdate_32_79
(
\
&body_20_39
);
&Xupdate_32_79
(
\
&body_20_39
);
&Xupdate_32_79
(
\
&body_20_39
);
&Xupdate_32_79
(
\
&body_40_59
);
&Xupdate_32_79
(
\
&body_40_59
);
&Xupdate_32_79
(
\
&body_40_59
);
&Xupdate_32_79
(
\
&body_40_59
);
&Xupdate_32_79
(
\
&body_40_59
);
&Xupdate_32_79
(
\
&body_20_39
);
&Xuplast_80
(
\
&body_20_39
);
&Xloop
(
\
&body_20_39
);
&Xloop
(
\
&body_20_39
);
&Xloop
(
\
&body_20_39
);
$code
.=
<<___;
ldmia $ctx,{$Ki,$t0,$t1,$Xfer} @ accumulate context
add $a,$a,$Ki
ldr $Ki,[$ctx,#16]
add $b,$b,$t0
add $c,$c,$t1
add $d,$d,$Xfer
moveq sp,$saved_sp
add $e,$e,$Ki
ldrne $Ki,[sp]
stmia $ctx,{$a,$b,$c,$d,$e}
addne $Xfer,sp,#3*16
bne .Loop_neon
@ vldmia sp!,{d8-d15}
ldmia sp!,{r4-r12,pc}
.size sha1_block_data_order_neon,.-sha1_block_data_order_neon
#endif
___
}}}
#####################################################################
# ARMv8 stuff
#
{{{
# Names used by the ARMv8 code path.  Every value is a plain string that is
# interpolated into the generated assembler text.
my ($ABCD,$E,$E0,$E1)=map("q$_",(0..3));
my @MSG=map("q$_",(4..7));		# rotated by the round loop
my @Kxx=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14));
$code
.=
<<___;
#if __ARM_ARCH__>=7
.type sha1_block_data_order_armv8,%function
.align 5
sha1_block_data_order_armv8:
.LARMv8:
vstmdb sp!,{d8-d15} @ ABI specification says so
veor $E,$E,$E
adr r3,.LK_00_19
vld1.32 {$ABCD},[$ctx]!
vld1.32 {$E\[0]},[$ctx]
sub $ctx,$ctx,#16
vld1.32 {@Kxx[0]\[]},[r3,:32]!
vld1.32 {@Kxx[1]\[]},[r3,:32]!
vld1.32 {@Kxx[2]\[]},[r3,:32]!
vld1.32 {@Kxx[3]\[]},[r3,:32]
.Loop_v8:
vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
vrev32.8 @MSG[0],@MSG[0]
vrev32.8 @MSG[1],@MSG[1]
vadd.i32 $W0,@Kxx[0],@MSG[0]
vrev32.8 @MSG[2],@MSG[2]
vmov $ABCD_SAVE,$ABCD @ offload
subs $len,$len,#1
vadd.i32 $W1,@Kxx[0],@MSG[1]
vrev32.8 @MSG[3],@MSG[3]
sha1h $E1,$ABCD @ 0
sha1c $ABCD,$E,$W0
vadd.i32 $W0,@Kxx[$j],@MSG[2]
sha1su0 @MSG[0],@MSG[1],@MSG[2]
___
# Appends the instruction groups numbered 1..16 to $code.
#   $f selects the sha1c/sha1p/sha1m/sha1p suffix; $i/5 is fractional here
#      and the list subscript truncates it.
#   $j selects the @Kxx register and advances when ($i+3)%5==0, i.e. after
#      $i = 2, 7 and 12.
# After each group $E0/$E1 and $W0/$W1 trade places and @MSG is rotated, so
# the same template text addresses different registers on the next pass.
for($j=0,$i=1;$i<20-3;$i++) {
my $f=("c","p","m","p")[$i/5];
$code.=<<___;
sha1h $E0,$ABCD @ $i
sha1$f $ABCD,$E1,$W1
vadd.i32 $W1,@Kxx[$j],@MSG[3]
sha1su1 @MSG[0],@MSG[3]
___
# emitted for every group except the last one ($i == 16)
$code.=<<___ if ($i<20-4);
sha1su0 @MSG[1],@MSG[2],@MSG[3]
___
	($E0,$E1)=($E1,$E0);	($W0,$W1)=($W1,$W0);
	push(@MSG,shift(@MSG));	$j++ if ((($i+3)%5)==0);
}
$code
.=
<<___;
sha1h $E0,$ABCD @ $i
sha1p $ABCD,$E1,$W1
vadd.i32 $W1,@Kxx[$j],@MSG[3]
sha1h $E1,$ABCD @ 18
sha1p $ABCD,$E0,$W0
sha1h $E0,$ABCD @ 19
sha1p $ABCD,$E1,$W1
vadd.i32 $E,$E,$E0
vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
bne .Loop_v8
vst1.32 {$ABCD},[$ctx]!
vst1.32 {$E\[0]},[$ctx]
vldmia sp!,{d8-d15}
bx lr
.size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
#endif
___
}}}
$code
.=
<<___;
.comm OPENSSL_armcap_P,4,4
___
{ my  %opcode = (
	"sha1c"		=> 0xf2000c40,	"sha1p"		=> 0xf2100c40,
	"sha1m"		=> 0xf2200c40,	"sha1su0"	=> 0xf2300c40,
	"sha1h"		=> 0xf3b902c0,	"sha1su1"	=> 0xf3ba0380	);

    # Turns a sha1* instruction into a ".long" directive.
    #   $mnemonic - key into %opcode
    #   $arg      - operand text "qA,qB,qC" or "qA,qC"; the middle operand
    #               is optional and contributes no bits when absent
    # Returns the directive with the original instruction appended as an
    # "@" comment, or a false value when $arg does not match.
    sub unsha1 {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o
	&&
	# each register number is split into its low three bits and bit 3,
	# which go into separate fields of the instruction word
	sprintf ".long\t0x%08x\t@ %s %s",
			$opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2),
			$mnemonic,$arg;
    }
}
# Post-process the generated text one line at a time and print it.
foreach (split($/,$code)) {
	# {qN[]} becomes {d2N[],d2N+1[]}; only when that did not match,
	# {qN[0]} becomes {d2N[0]}
	s/{q([0-9]+)\[\]}/sprintf "{d%d[],d%d[]}",2*$1,2*$1+1/eo	or
	s/{q([0-9]+)\[0\]}/sprintf "{d%d[0]}",2*$1/eo;

	# sha1* mnemonics are replaced by the .long words unsha1() builds
	s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/geo;

	s/\bbx\s+lr\b/.word\t0xe12fff1e/o;	# make it possible to compile with -march=armv4

	print $_,$/;
}
$code
=~
s/\bbx\s+lr\b/.word\t0xe12fff1e/gm
;
# make it possible to compile with -march=armv4
print
$code
;
close
STDOUT
;
# enforce flush
close
STDOUT
;
# enforce flush
crypto/sha/asm/sha256-armv4.pl
浏览文件 @
9250a306
...
@@ -31,6 +31,10 @@
...
@@ -31,6 +31,10 @@
# code (meaning that latter performs sub-optimally, nothing was done
# code (meaning that latter performs sub-optimally, nothing was done
# about it).
# about it).
# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
while
((
$output
=
shift
)
&&
(
$output
!~
/^\w[\w\-]*\.\w+$/
))
{}
while
((
$output
=
shift
)
&&
(
$output
!~
/^\w[\w\-]*\.\w+$/
))
{}
open
STDOUT
,"
>
$output
";
open
STDOUT
,"
>
$output
";
...
@@ -185,6 +189,8 @@ sha256_block_data_order:
...
@@ -185,6 +189,8 @@ sha256_block_data_order:
#if __ARM_ARCH__>=7
#if __ARM_ARCH__>=7
ldr r12,.LOPENSSL_armcap
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
ldr r12,[r3,r12] @ OPENSSL_armcap_P
tst r12,#8
bne .LARMv8
tst r12,#1
tst r12,#1
bne .LNEON
bne .LNEON
#endif
#endif
...
@@ -241,6 +247,7 @@ $code.=<<___;
...
@@ -241,6 +247,7 @@ $code.=<<___;
moveq pc,lr @ be binary compatible with V4, yet
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
bx lr @ interoperable with Thumb ISA:-)
#endif
#endif
.size sha256_block_data_order,.-sha256_block_data_order
___
___
######################################################################
######################################################################
# NEON stuff
# NEON stuff
...
@@ -418,7 +425,10 @@ sub body_00_15 () {
...
@@ -418,7 +425,10 @@ sub body_00_15 () {
$code
.=
<<___;
$code
.=
<<___;
#if __ARM_ARCH__>=7
#if __ARM_ARCH__>=7
.fpu neon
.fpu neon
.type sha256_block_data_order_neon,%function
.align 4
.align 4
sha256_block_data_order_neon:
.LNEON:
.LNEON:
stmdb sp!,{r4-r12,lr}
stmdb sp!,{r4-r12,lr}
...
@@ -521,17 +531,120 @@ $code.=<<___;
...
@@ -521,17 +531,120 @@ $code.=<<___;
bne .L_00_48
bne .L_00_48
ldmia sp!,{r4-r12,pc}
ldmia sp!,{r4-r12,pc}
.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
#endif
___
___
}}}
}}}
######################################################################
# ARMv8 stuff
#
{{{
# Names used by the ARMv8 code path.  Every value is a plain string that is
# interpolated into the generated assembler text.
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));		# rotated by the round loop
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";
$code
.=
<<___;
$code
.=
<<___;
.size sha256_block_data_order,.-sha256_block_data_order
#if __ARM_ARCH__>=7
.asciz "SHA256 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.type sha256_block_data_order_armv8,%function
.align 5
sha256_block_data_order_armv8:
.LARMv8:
vld1.32 {$ABCD,$EFGH},[$ctx]
sub $Ktbl,r3,#sha256_block_data_order-K256
.Loop_v8:
vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
vld1.32 {$W0},[$Ktbl]!
vrev32.8 @MSG[0],@MSG[0]
vrev32.8 @MSG[1],@MSG[1]
vrev32.8 @MSG[2],@MSG[2]
vrev32.8 @MSG[3],@MSG[3]
vmov $ABCD_SAVE,$ABCD @ offload
vmov $EFGH_SAVE,$EFGH
teq $inp,$len
___
for
(
$i
=
0
;
$i
<
12
;
$i
++
)
{
$code
.=
<<___;
vld1.32 {$W1},[$Ktbl]!
vadd.i32 $W0,$W0,@MSG[0]
sha256su0 @MSG[0],@MSG[1]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
sha256su1 @MSG[0],@MSG[2],@MSG[3]
___
(
$W0
,
$W1
)
=
(
$W1
,
$W0
);
push
(
@MSG
,
shift
(
@MSG
));
}
$code
.=
<<___;
vld1.32 {$W1},[$Ktbl]!
vadd.i32 $W0,$W0,@MSG[0]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
vld1.32 {$W0},[$Ktbl]!
vadd.i32 $W1,$W1,@MSG[1]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
vld1.32 {$W1},[$Ktbl]
vadd.i32 $W0,$W0,@MSG[2]
sub $Ktbl,$Ktbl,#256-16 @ rewind
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
vadd.i32 $W1,$W1,@MSG[3]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
bne .Loop_v8
vst1.32 {$ABCD,$EFGH},[$ctx]
bx lr
.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code
.=
<<___;
.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
.align 2
.comm OPENSSL_armcap_P,4,4
.comm OPENSSL_armcap_P,4,4
___
___
$code
=~
s/\`([^\`]*)\`/eval $1/g
em
;
{
my
%opcode
=
(
$code
=~
s/\bbx\s+lr\b/.word\t0xe12fff1e/gm
;
# make it possible to compile with -march=armv4
"
sha256h
"
=>
0xf3000c40
,
"
sha256h2
"
=>
0xf3100c40
,
print
$code
;
"
sha256su0
"
=>
0xf3ba03c0
,
"
sha256su1
"
=>
0xf3200c40
);
# Turns a sha256* instruction into a ".long" directive.
#   $mnemonic - key into %opcode
#   $arg      - operand text "qA,qB,qC" or "qA,qC"; the middle operand is
#               optional and contributes no bits when absent
# Returns the directive with the original instruction appended as an "@"
# comment, or a false value when $arg does not match.
sub unsha256 {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o
	&&
	# each register number is split into its low three bits and bit 3,
	# which go into separate fields of the instruction word
	sprintf ".long\t0x%08x\t@ %s %s",
			$opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2),
			$mnemonic,$arg;
}
}
# Post-process the generated text one line at a time and print it.
foreach (split($/,$code)) {

	# text between backticks is evaluated as Perl and replaced by the result
	s/\`([^\`]*)\`/eval $1/geo;

	# sha256* mnemonics are replaced by the .long words unsha256() builds
	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}
close
STDOUT
;
# enforce flush
close
STDOUT
;
# enforce flush
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录