提交 9d0e4dc6 编写于 作者: Andy Polyakov

bn/asm/s390x.S: improve performance on z196 and z13 by up to 26%. [even z10 is...

bn/asm/s390x.S: improve performance on z196 and z13 by up to 26%. [even z10 is a couple percent faster]. Triggered by RT#4128, but solves the problem by real modulo-scheduling.
Reviewed-by: Rich Salz <rsalz@openssl.org>
上级 a5fd24d1
...@@ -18,71 +18,106 @@ ...@@ -18,71 +18,106 @@
.align 4 .align 4
bn_mul_add_words: bn_mul_add_words:
lghi zero,0 // zero = 0 lghi zero,0 // zero = 0
la %r1,0(%r2) // put rp aside la %r1,0(%r2) // put rp aside [to give way to]
lghi %r2,0 // i=0; lghi %r2,0 // return value
ltgfr %r4,%r4 ltgfr %r4,%r4
bler %r14 // if (len<=0) return 0; bler %r14 // if (len<=0) return 0;
stmg %r6,%r10,48(%r15) stmg %r6,%r13,48(%r15)
lghi %r10,3 lghi %r2,3
lghi %r8,0 // carry = 0 lghi %r12,0 // carry = 0
nr %r10,%r4 // len%4 slgr %r1,%r3 // rp-=ap
nr %r2,%r4 // len%4
sra %r4,2 // cnt=len/4 sra %r4,2 // cnt=len/4
jz .Loop1_madd // carry is incidentally cleared if branch taken jz .Loop1_madd // carry is incidentally cleared if branch taken
algr zero,zero // clear carry algr zero,zero // clear carry
.Loop4_madd: lg %r7,0(%r3) // ap[0]
lg %r7,0(%r2,%r3) // ap[i] lg %r9,8(%r3) // ap[1]
mlgr %r6,%r5 // *=w mlgr %r6,%r5 // *=w
alcgr %r7,%r8 // +=carry brct %r4,.Loop4_madd
alcgr %r6,zero j .Loop4_madd_tail
alg %r7,0(%r2,%r1) // +=rp[i]
stg %r7,0(%r2,%r1) // rp[i]=
lg %r9,8(%r2,%r3) .Loop4_madd:
mlgr %r8,%r5 mlgr %r8,%r5
lg %r11,16(%r3) // ap[i+2]
alcgr %r7,%r12 // +=carry
alcgr %r6,zero
alg %r7,0(%r3,%r1) // +=rp[i]
stg %r7,0(%r3,%r1) // rp[i]=
mlgr %r10,%r5
lg %r13,24(%r3)
alcgr %r9,%r6 alcgr %r9,%r6
alcgr %r8,zero alcgr %r8,zero
alg %r9,8(%r2,%r1) alg %r9,8(%r3,%r1)
stg %r9,8(%r2,%r1) stg %r9,8(%r3,%r1)
mlgr %r12,%r5
lg %r7,32(%r3)
alcgr %r11,%r8
alcgr %r10,zero
alg %r11,16(%r3,%r1)
stg %r11,16(%r3,%r1)
lg %r7,16(%r2,%r3)
mlgr %r6,%r5 mlgr %r6,%r5
alcgr %r7,%r8 lg %r9,40(%r3)
alcgr %r6,zero alcgr %r13,%r10
alg %r7,16(%r2,%r1) alcgr %r12,zero
stg %r7,16(%r2,%r1) alg %r13,24(%r3,%r1)
stg %r13,24(%r3,%r1)
lg %r9,24(%r2,%r3) la %r3,32(%r3) // i+=4
brct %r4,.Loop4_madd
.Loop4_madd_tail:
mlgr %r8,%r5 mlgr %r8,%r5
lg %r11,16(%r3)
alcgr %r7,%r12 // +=carry
alcgr %r6,zero
alg %r7,0(%r3,%r1) // +=rp[i]
stg %r7,0(%r3,%r1) // rp[i]=
mlgr %r10,%r5
lg %r13,24(%r3)
alcgr %r9,%r6 alcgr %r9,%r6
alcgr %r8,zero alcgr %r8,zero
alg %r9,24(%r2,%r1) alg %r9,8(%r3,%r1)
stg %r9,24(%r2,%r1) stg %r9,8(%r3,%r1)
la %r2,32(%r2) // i+=4 mlgr %r12,%r5
brct %r4,.Loop4_madd alcgr %r11,%r8
alcgr %r10,zero
alg %r11,16(%r3,%r1)
stg %r11,16(%r3,%r1)
la %r10,1(%r10) // see if len%4 is zero ... alcgr %r13,%r10
brct %r10,.Loop1_madd // without touching condition code:-) alcgr %r12,zero
alg %r13,24(%r3,%r1)
stg %r13,24(%r3,%r1)
la %r3,32(%r3) // i+=4
la %r2,1(%r2) // see if len%4 is zero ...
brct %r2,.Loop1_madd // without touching condition code:-)
.Lend_madd: .Lend_madd:
alcgr %r8,zero // collect carry bit lgr %r2,zero // return value
lgr %r2,%r8 alcgr %r2,%r12 // collect even carry bit
lmg %r6,%r10,48(%r15) lmg %r6,%r13,48(%r15)
br %r14 br %r14
.Loop1_madd: .Loop1_madd:
lg %r7,0(%r2,%r3) // ap[i] lg %r7,0(%r3) // ap[i]
mlgr %r6,%r5 // *=w mlgr %r6,%r5 // *=w
alcgr %r7,%r8 // +=carry alcgr %r7,%r12 // +=carry
alcgr %r6,zero alcgr %r6,zero
alg %r7,0(%r2,%r1) // +=rp[i] alg %r7,0(%r3,%r1) // +=rp[i]
stg %r7,0(%r2,%r1) // rp[i]= stg %r7,0(%r3,%r1) // rp[i]=
lgr %r8,%r6 lgr %r12,%r6
la %r2,8(%r2) // i++ la %r3,8(%r3) // i++
brct %r10,.Loop1_madd brct %r2,.Loop1_madd
j .Lend_madd j .Lend_madd
.size bn_mul_add_words,.-bn_mul_add_words .size bn_mul_add_words,.-bn_mul_add_words
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册