Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
btwise
openssl
提交
a000759a
O
openssl
项目概览
btwise
/
openssl
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
openssl
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
a000759a
编写于
3月 04, 2011
作者:
A
Andy Polyakov
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
ia64-mont.pl: optimize short-key performance.
上级
bc5b136c
变更
1
显示空白变更内容
内联
并排
Showing
1 changed file
with
150 addition
and
63 deletion
+150
-63
crypto/bn/asm/ia64-mont.pl
crypto/bn/asm/ia64-mont.pl
+150
-63
未找到文件。
crypto/bn/asm/ia64-mont.pl
浏览文件 @
a000759a
...
...
@@ -38,15 +38,15 @@
# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
# this module is:
# sign verify sign/s verify/s
# rsa 512 bits 0.000
302s 0.000024s 3312.3 41332.2
# rsa 1024 bits 0.000
816s 0.000058s 1225.2
17172.0
# rsa 512 bits 0.000
290s 0.000024s 3452.8 42031.4
# rsa 1024 bits 0.000
793s 0.000058s 1261.7
17172.0
# rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0
# rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6
# dsa 512 bits 0.00025
4s 0.000206s 3944.6 4865.1
# dsa 512 bits 0.00025
3s 0.000198s 3949.9 5057.0
# dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4
# dsa 2048 bits 0.001453s 0.001703s 688.1 587.4
#
# ... and *without*:
# ... and *without*
(but still with ia64.S)
:
#
# rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5
# rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3
...
...
@@ -56,9 +56,9 @@
# dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2
# dsa 2048 bits 0.001894s 0.002179s 528.1 458.9
#
# As it can be seen, RSA sign performance improves by 1
2
0-30%,
# hereafter less for longer keys, while verify - by 7
2
-13%.
# DSA performance improves by 1
00
-30%.
# As it can be seen, RSA sign performance improves by 1
3
0-30%,
# hereafter less for longer keys, while verify - by 7
4
-13%.
# DSA performance improves by 1
15
-30%.
if
(
$^O
eq
"
hpux
")
{
$ADDP
=
"
addp4
";
...
...
@@ -473,7 +473,7 @@ bn_mul_mont_8:
mov ar.lc=r28 }
{ .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7]
(p15)fcvt.fxu bj[0]=f0
mov ar.ec=
2
}
mov ar.ec=
1
}
{ .mfi; (p12)ldf8 ni6=[nptr],16 // np[6]
(p13)fcvt.fxu ni6=f0
mov pr.rot=1<<16 }
...
...
@@ -482,12 +482,12 @@ bn_mul_mont_8:
brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16
};;
// The loop is scheduled for 32*
(n+1) ticks on Itanium 2. Actual
//
measurement with help of Interval Time Counter indicate
that the
// The loop is scheduled for 32*
n ticks on Itanium 2. Actual attempt
//
to measure with help of Interval Time Counter indicated
that the
// factor is a tad higher: 33 or 34, if not 35. Exact measurement and
// addressing the issue is problematic, because I don't have access
// to platform-specific instruction-level profiler. On Itanium it
// should run in 56*
(n+1)
ticks, because of higher xma latency...
// should run in 56*
n
ticks, because of higher xma latency...
.Louter_8_ctop:
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
...
...
@@ -652,62 +652,149 @@ bn_mul_mont_8:
br.ctop.sptk.many .Louter_8_ctop };;
.Louter_8_cend:
// move np[8] to GPR bank and subtract it from carrybit|tmp[8]
// above loop has to execute one more time, without (p16), which is
// replaced with merged move of np[8] to GPR bank
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mmi; (p0) getf.sig n1=ni0 // 0:
(p40) add a3=a3,n3 // (p17) a3+=n3
(p42) add a3=a3,n3,1 };;
{ .mii; (p17) getf.sig a7=alo[8] // 1:
(p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
(p50) add t[6]=t[6],a3,1 };;
{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
(p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
(p40) cmp.ltu p43,p41=a3,n3 }
{ .mfi; (p42) cmp.leu p43,p41=a3,n3
(p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
(p0) nop.i 0 };;
{ .mii; (p17) getf.sig n5=nlo[6] // 3:
(p48) cmp.ltu p51,p49=t[6],a3
(p50) cmp.leu p51,p49=t[6],a3 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mmi; (p0) getf.sig n2=ni1 // 4:
(p41) add a4=a4,n4 // (p17) a4+=n4
(p43) add a4=a4,n4,1 };;
{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
(p0) nop.f 0
(p51) add t[5]=t[5],a4,1 };;
{ .mfi; (p0) getf.sig n3=ni2 // 6:
(p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
(p41) cmp.ltu p42,p40=a4,n4 }
{ .mfi; (p43) cmp.leu p42,p40=a4,n4
(p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
(p0) nop.i 0 };;
{ .mii; (p17) getf.sig n6=nlo[7] // 7:
(p49) cmp.ltu p50,p48=t[5],a4
(p51) cmp.leu p50,p48=t[5],a4 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mii; (p0) getf.sig n4=ni3 // 8:
(p40) add a5=a5,n5 // (p17) a5+=n5
(p42) add a5=a5,n5,1 };;
{ .mii; (p0) nop.m 0 // 9:
(p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
(p50) add t[4]=t[4],a5,1 };;
{ .mii; (p0) nop.m 0 // 10:
(p40) cmp.ltu p43,p41=a5,n5
(p42) cmp.leu p43,p41=a5,n5 };;
{ .mii; (p17) getf.sig n7=nlo[8] // 11:
(p48) cmp.ltu p51,p49=t[4],a5
(p50) cmp.leu p51,p49=t[4],a5 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mii; (p17) getf.sig n8=nhi[8] // 12:
(p41) add a6=a6,n6 // (p17) a6+=n6
(p43) add a6=a6,n6,1 };;
{ .mii; (p0) getf.sig n5=ni4 // 13:
(p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
(p51) add t[3]=t[3],a6,1 };;
{ .mii; (p0) nop.m 0 // 14:
(p41) cmp.ltu p42,p40=a6,n6
(p43) cmp.leu p42,p40=a6,n6 };;
{ .mii; (p0) getf.sig n6=ni5 // 15:
(p49) cmp.ltu p50,p48=t[3],a6
(p51) cmp.leu p50,p48=t[3],a6 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mii; (p0) nop.m 0 // 16:
(p40) add a7=a7,n7 // (p17) a7+=n7
(p42) add a7=a7,n7,1 };;
{ .mii; (p0) nop.m 0 // 17:
(p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
(p50) add t[2]=t[2],a7,1 };;
{ .mii; (p0) nop.m 0 // 18:
(p40) cmp.ltu p43,p41=a7,n7
(p42) cmp.leu p43,p41=a7,n7 };;
{ .mii; (p0) getf.sig n7=ni6 // 19:
(p48) cmp.ltu p51,p49=t[2],a7
(p50) cmp.leu p51,p49=t[2],a7 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mii; (p0) nop.m 0 // 20:
(p41) add a8=a8,n8 // (p17) a8+=n8
(p43) add a8=a8,n8,1 };;
{ .mmi; (p0) nop.m 0 // 21:
(p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
(p51) add t[1]=t[1],a8,1 }
{ .mmi; (p17) mov t[0]=r0
(p41) cmp.ltu p42,p40=a8,n8
(p43) cmp.leu p42,p40=a8,n8 };;
{ .mmi; (p0) getf.sig n8=ni7 // 22:
(p49) cmp.ltu p50,p48=t[1],a8
(p51) cmp.leu p50,p48=t[1],a8 }
{ .mmi; (p42) add t[0]=t[0],r0,1
(p0) add r16=-7*16,prevsp
(p0) add r17=-6*16,prevsp };;
// subtract np[8] from carrybit|tmp[8]
// carrybit|tmp[8] layout upon exit from above loop is:
// t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t[0]|t0 (least significant)
{ .mmi; getf.sig n1=ni0
getf.sig n2=ni1
add r16=-7*16,prevsp}
{ .mmi; getf.sig n3=ni2
getf.sig n4=ni3
add r17=-6*16,prevsp};;
{ .mmi; getf.sig n5=ni4
getf.sig n6=ni5
add r18=-5*16,prevsp}
{ .mmi; getf.sig n7=ni6
getf.sig n8=ni7
// t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
{ .mmi; (p50)add t[0]=t[0],r0,1
add r18=-5*16,prevsp
sub n1=t0,n1 };;
{ .mmi; cmp.gtu p34,p32=n1,t0;;
.pred.rel "mutex",p32,p34
(p32)sub n2=t[
0
],n2
(p34)sub n2=t[
0
],n2,1 };;
{ .mii; (p32)cmp.gtu p35,p33=n2,t[
0
]
(p34)cmp.geu p35,p33=n2,t[
0
];;
(p32)sub n2=t[
7
],n2
(p34)sub n2=t[
7
],n2,1 };;
{ .mii; (p32)cmp.gtu p35,p33=n2,t[
7
]
(p34)cmp.geu p35,p33=n2,t[
7
];;
.pred.rel "mutex",p33,p35
(p33)sub n3=t[
7
],n3 }
{ .mmi; (p35)sub n3=t[
7
],n3,1;;
(p33)cmp.gtu p34,p32=n3,t[
7
]
(p35)cmp.geu p34,p32=n3,t[
7
] };;
(p33)sub n3=t[
6
],n3 }
{ .mmi; (p35)sub n3=t[
6
],n3,1;;
(p33)cmp.gtu p34,p32=n3,t[
6
]
(p35)cmp.geu p34,p32=n3,t[
6
] };;
.pred.rel "mutex",p32,p34
{ .mii; (p32)sub n4=t[
6
],n4
(p34)sub n4=t[
6
],n4,1;;
(p32)cmp.gtu p35,p33=n4,t[
6
] }
{ .mmi; (p34)cmp.geu p35,p33=n4,t[
6
];;
{ .mii; (p32)sub n4=t[
5
],n4
(p34)sub n4=t[
5
],n4,1;;
(p32)cmp.gtu p35,p33=n4,t[
5
] }
{ .mmi; (p34)cmp.geu p35,p33=n4,t[
5
];;
.pred.rel "mutex",p33,p35
(p33)sub n5=t[
5
],n5
(p35)sub n5=t[
5
],n5,1 };;
{ .mii; (p33)cmp.gtu p34,p32=n5,t[
5
]
(p35)cmp.geu p34,p32=n5,t[
5
];;
(p33)sub n5=t[
4
],n5
(p35)sub n5=t[
4
],n5,1 };;
{ .mii; (p33)cmp.gtu p34,p32=n5,t[
4
]
(p35)cmp.geu p34,p32=n5,t[
4
];;
.pred.rel "mutex",p32,p34
(p32)sub n6=t[
4
],n6 }
{ .mmi; (p34)sub n6=t[
4
],n6,1;;
(p32)cmp.gtu p35,p33=n6,t[
4
]
(p34)cmp.geu p35,p33=n6,t[
4
] };;
(p32)sub n6=t[
3
],n6 }
{ .mmi; (p34)sub n6=t[
3
],n6,1;;
(p32)cmp.gtu p35,p33=n6,t[
3
]
(p34)cmp.geu p35,p33=n6,t[
3
] };;
.pred.rel "mutex",p33,p35
{ .mii; (p33)sub n7=t[
3
],n7
(p35)sub n7=t[
3
],n7,1;;
(p33)cmp.gtu p34,p32=n7,t[
3
] }
{ .mmi; (p35)cmp.geu p34,p32=n7,t[
3
];;
{ .mii; (p33)sub n7=t[
2
],n7
(p35)sub n7=t[
2
],n7,1;;
(p33)cmp.gtu p34,p32=n7,t[
2
] }
{ .mmi; (p35)cmp.geu p34,p32=n7,t[
2
];;
.pred.rel "mutex",p32,p34
(p32)sub n8=t[
2
],n8
(p34)sub n8=t[
2
],n8,1 };;
{ .mii; (p32)cmp.gtu p35,p33=n8,t[
2
]
(p34)cmp.geu p35,p33=n8,t[
2
];;
(p32)sub n8=t[
1
],n8
(p34)sub n8=t[
1
],n8,1 };;
{ .mii; (p32)cmp.gtu p35,p33=n8,t[
1
]
(p34)cmp.geu p35,p33=n8,t[
1
];;
.pred.rel "mutex",p33,p35
(p33)sub a8=t[
1
],r0 }
{ .mmi; (p35)sub a8=t[
1
],r0,1;;
(p33)cmp.gtu p34,p32=a8,t[
1
]
(p35)cmp.geu p34,p32=a8,t[
1
] };;
(p33)sub a8=t[
0
],r0 }
{ .mmi; (p35)sub a8=t[
0
],r0,1;;
(p33)cmp.gtu p34,p32=a8,t[
0
]
(p35)cmp.geu p34,p32=a8,t[
0
] };;
// save the result, either tmp[num] or tmp[num]-np[num]
.pred.rel "mutex",p32,p34
...
...
@@ -715,25 +802,25 @@ bn_mul_mont_8:
(p34)st8 [rptr]=t0,8
add r19=-4*16,prevsp};;
{ .mmb; (p32)st8 [rptr]=n2,8
(p34)st8 [rptr]=t[
0
],8
(p34)st8 [rptr]=t[
7
],8
(p5)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n3,8
(p34)st8 [rptr]=t[
7
],8
(p34)st8 [rptr]=t[
6
],8
(p7)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n4,8
(p34)st8 [rptr]=t[
6
],8
(p34)st8 [rptr]=t[
5
],8
(p9)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n5,8
(p34)st8 [rptr]=t[
5
],8
(p34)st8 [rptr]=t[
4
],8
(p11)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n6,8
(p34)st8 [rptr]=t[
4
],8
(p34)st8 [rptr]=t[
3
],8
(p13)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n7,8
(p34)st8 [rptr]=t[
3
],8
(p34)st8 [rptr]=t[
2
],8
(p15)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n8,8
(p34)st8 [rptr]=t[
2
],8
(p34)st8 [rptr]=t[
1
],8
nop.b 0 };;
.Ldone: // epilogue
{ .mmi; ldf.fill f16=[r16],64
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录