提交 d0c2ebf4 编写于 作者: R Richard Levitte

A patch from HP for better performance.

Submitted by Kevin Steves <ks@hp.se> 3 months ago...
上级 e17b7128
...@@ -212,12 +212,21 @@ my %table=( ...@@ -212,12 +212,21 @@ my %table=(
# crypto/sha/sha_lcl.h. # crypto/sha/sha_lcl.h.
# <appro@fy.chalmers.se> # <appro@fy.chalmers.se>
# #
"hpux-parisc-cc","cc:-Ae +O3 +ESlit -z -DB_ENDIAN -DBN_DIV2W -DMD32_XARRAY:::-ldld:BN_LLONG DES_PTR DES_UNROLL DES_RISC1::::::::::dl", #!#"hpux-parisc-cc","cc:-Ae +O3 +ESlit -z -DB_ENDIAN -DBN_DIV2W -DMD32_XARRAY:::-ldld:BN_LLONG DES_PTR DES_UNROLL DES_RISC1::::::::::dl",
# Since there is mention of this in shlib/hpux10-cc.sh # Since there is mention of this in shlib/hpux10-cc.sh
"hpux-parisc-cc-o4","cc:-Ae +O4 +ESlit -z -DB_ENDIAN -DBN_DIV2W -DMD32_XARRAY:::-ldld:BN_LLONG DES_PTR DES_UNROLL DES_RISC1::::::::::dl", "hpux-parisc-cc-o4","cc:-Ae +O4 +ESlit -z -DB_ENDIAN -DBN_DIV2W -DMD32_XARRAY:::-ldld:BN_LLONG DES_PTR DES_UNROLL DES_RISC1::::::::::dl",
"hpux-parisc-gcc","gcc:-O3 -DB_ENDIAN -DBN_DIV2W:::-ldld:BN_LLONG DES_PTR DES_UNROLL DES_RISC1::::::::::dl", "hpux-parisc-gcc","gcc:-O3 -DB_ENDIAN -DBN_DIV2W:::-ldld:BN_LLONG DES_PTR DES_UNROLL DES_RISC1::::::::::dl",
"hpux64-parisc-cc","cc:-Ae +DD64 +O3 +ESlit -z -DB_ENDIAN -DMD32_XARRAY::-D_REENTRANT:-ldld:SIXTY_FOUR_BIT_LONG MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC1 DES_INT::::::::::dl", "hpux64-parisc-cc","cc:-Ae +DD64 +O3 +ESlit -z -DB_ENDIAN -DMD32_XARRAY::-D_REENTRANT:-ldld:SIXTY_FOUR_BIT_LONG MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC1 DES_INT::::::::::dl",
# More attempts at unified 10.X and 11.X targets for HP C compiler.
#
# Chris Ruemmler <ruemmler@cup.hp.com>
# Kevin Steves <ks@hp.se>
"hpux-parisc-cc","cc:+O3 +Optrs_strongly_typed +Olibcalls -Ae +ESlit -DB_ENDIAN -DBN_DIV2W -DMD32_XARRAY::-D_REENTRANT:-ldl:MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC1 DES_INT::::::::::dl",
"hpux-parisc2-cc","cc:+DA2.0 +DS2.0 +O3 +Optrs_strongly_typed +Olibcalls -Ae +ESlit -DB_ENDIAN -DMD32_XARRAY::-D_REENTRANT:-ldl:SIXTY_FOUR_BIT MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC1 DES_INT:asm/pa-risc2.o:::::::::dl",
"hpux64-parisc2-cc","cc:+DD64 +O3 +Optrs_strongly_typed +Olibcalls -Ae +ESlit -DB_ENDIAN -DMD32_XARRAY::-D_REENTRANT:-ldl:SIXTY_FOUR_BIT_LONG MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC1 DES_INT:asm/pa-risc2W.o:::::::::dl",
"hpux-parisc1_1-cc","cc:+DA1.1 +DS1.1 +O3 +Optrs_strongly_typed +Olibcalls -Ae +ESlit -DB_ENDIAN -DMD32_XARRAY::-D_REENTRANT:-ldl:MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC1 DES_INT::::::::::dl",
# HPUX 9.X config. # HPUX 9.X config.
# Don't use the bundled cc. It is broken. Use HP ANSI C if possible, or # Don't use the bundled cc. It is broken. Use HP ANSI C if possible, or
# egcs. gcc 2.8.1 is also broken. # egcs. gcc 2.8.1 is also broken.
......
...@@ -15,9 +15,9 @@ On the 2 alpha C compilers I had access to, it was not possible to do ...@@ -15,9 +15,9 @@ On the 2 alpha C compilers I had access to, it was not possible to do
were 64 bits). So the hand assembler gives access to the 128 bit result and were 64 bits). So the hand assembler gives access to the 128 bit result and
a 2 times speedup :-). a 2 times speedup :-).
There are 2 versions of assembler for the HP PA-RISC. There are 3 versions of assembler for the HP PA-RISC.
pa-risc.s is the original one which works fine.
pa-risc2.s is a new version that often generates warnings but if the pa-risc.s is the original one which works fine and generated using gcc :-)
tests pass, it gives performance that is over 2 times faster than
pa-risc.s. pa-risc2W.s and pa-risc2.s are 64 and 32-bit PA-RISC 2.0 implementations
Both were generated using gcc :-) by Chris Ruemmler from HP (with some help from the HP C compiler).
.SPACE $PRIVATE$ ;
.SUBSPA $DATA$,QUAD=1,ALIGN=8,ACCESS=31 ; PA-RISC 2.0 implementation of bn_asm code, based on the
.SUBSPA $BSS$,QUAD=1,ALIGN=8,ACCESS=31,ZERO,SORT=82 ; 64-bit version of the code. This code is effectively the
.SPACE $TEXT$ ; same as the 64-bit version except the register model is
.SUBSPA $LIT$,QUAD=0,ALIGN=8,ACCESS=44 ; slightly different given all values must be 32-bit between
.SUBSPA $CODE$,QUAD=0,ALIGN=8,ACCESS=44,CODE_ONLY ; function calls. Thus the 64-bit return values are returned
.IMPORT $global$,DATA ; in %ret0 and %ret1 vs just %ret0 as is done in 64-bit
.IMPORT $$dyncall,MILLICODE ;
; gcc_compiled.: ;
.SPACE $TEXT$ ; This code is approximately 2x faster than the C version
.SUBSPA $CODE$ ; for RSA/DSA.
;
.align 4 ; See http://devresource.hp.com/ for more details on the PA-RISC
.EXPORT bn_mul_add_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR ; architecture. Also see the book "PA-RISC 2.0 Architecture"
; by Gerry Kane for information on the instruction set architecture.
;
; Code written by Chris Ruemmler (with some help from the HP C
; compiler).
;
; The code compiles with HP's assembler
;
.level 2.0N
.space $TEXT$
.subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY
;
; Global Register definitions used for the routines.
;
; Some information about HP's runtime architecture for 32-bits.
;
; "Caller save" means the calling function must save the register
; if it wants the register to be preserved.
; "Callee save" means if a function uses the register, it must save
; the value before using it.
;
; For the floating point registers
;
; "caller save" registers: fr4-fr11, fr22-fr31
; "callee save" registers: fr12-fr21
; "special" registers: fr0-fr3 (status and exception registers)
;
; For the integer registers
; value zero : r0
; "caller save" registers: r1,r19-r26
; "callee save" registers: r3-r18
; return register : r2 (rp)
; return values ; r28,r29 (ret0,ret1)
; Stack pointer ; r30 (sp)
; millicode return ptr ; r31 (also a caller save register)
;
; Arguments to the routines
;
; Symbolic names for the incoming argument registers. Which alias applies
; depends on the routine; note that b_ptr and num name the same register.
r_ptr .reg %r26 ; arg0: result pointer
a_ptr .reg %r25 ; arg1: source pointer
b_ptr .reg %r24 ; arg2 when it is the second source pointer (bn_add_words/bn_sub_words)
num .reg %r24 ; arg2 when it is the word count (mul/sqr routines)
n .reg %r23 ; arg3: word count for bn_add_words/bn_sub_words
;
; Note that the "w" argument for bn_mul_add_words and bn_mul_words
; is passed on the stack at a delta of -56 from the top of stack
; as the routine is entered.
;
;
; Globals used in some routines
;
top_overflow .reg %r23 ; set to 1 << 32 by the mul routines (same register as n)
high_mask .reg %r22 ; value 0xffffffff80000000L (built in bn_sqr_words)
;------------------------------------------------------------------------------
;
; bn_mul_add_words
;
;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr,
; int num, BN_ULONG w)
;
; arg0 = r_ptr
; arg1 = a_ptr
; arg2 = num
; -56(sp) = w
;
; Local register definitions
;
; Floating-point registers that receive the 64-bit XMPYU partial products.
; The _1 variants belong to the second word of the 2x unrolled loops.
fm1 .reg %fr22
fm .reg %fr23
ht_temp .reg %fr24
ht_temp_1 .reg %fr25
lt_temp .reg %fr26
lt_temp_1 .reg %fr27
fm1_1 .reg %fr28
fm_1 .reg %fr29
; The multiplier w: fw is the whole register, fw_h/fw_l its upper/lower
; 32-bit halves (L = upper half, R = lower half).
fw_h .reg %fr7L
fw_l .reg %fr7R
fw .reg %fr7
; Source words a[0] and a[1]: t_float_N is the whole 64-bit register,
; fht_N/flt_N its upper/lower 32-bit halves.
fht_0 .reg %fr8L
flt_0 .reg %fr8R
t_float_0 .reg %fr8
fht_1 .reg %fr9L
flt_1 .reg %fr9R
t_float_1 .reg %fr9
; General registers used for the integer side of the arithmetic.
; r1, r19-r21 and r31 are caller-save; r3-r9 are callee-save and are
; stored to the frame by the routines that use them.
tmp_0 .reg %r31
tmp_1 .reg %r21
m_0 .reg %r20
m_1 .reg %r19
ht_0 .reg %r1
ht_1 .reg %r3
lt_0 .reg %r4
lt_1 .reg %r5
m1_0 .reg %r6
m1_1 .reg %r7
rp_val .reg %r8
rp_val_1 .reg %r9
bn_mul_add_words bn_mul_add_words
.PROC .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN
.CALLINFO FRAME=64,CALLS,SAVE_RP,ENTRY_GR=4 .proc
.ENTRY .callinfo frame=128
stw %r2,-20(0,%r30) .entry
stwm %r4,64(0,%r30) .align 64
copy %r24,%r31
stw %r3,-60(0,%r30) STD %r3,0(%sp) ; save r3
ldi 0,%r20 STD %r4,8(%sp) ; save r4
ldo 12(%r26),%r2 NOP ; Needed to make the loop 16-byte aligned
stw %r23,-16(0,%r30) NOP ; needed to make the loop 16-byte aligned
copy %r25,%r3
ldo 12(%r3),%r1 STD %r5,16(%sp) ; save r5
fldws -16(0,%r30),%fr8L NOP
L$0010 STD %r6,24(%sp) ; save r6
copy %r20,%r25 STD %r7,32(%sp) ; save r7
ldi 0,%r24
fldws 0(0,%r3),%fr9L STD %r8,40(%sp) ; save r8
ldw 0(0,%r26),%r19 STD %r9,48(%sp) ; save r9
xmpyu %fr8L,%fr9L,%fr9 COPY %r0,%ret1 ; return 0 by default
fstds %fr9,-16(0,%r30) DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
copy %r19,%r23
ldw -16(0,%r30),%r28 CMPIB,>= 0,num,bn_mul_add_words_exit ; if (num <= 0) then exit
ldw -12(0,%r30),%r29 LDO 128(%sp),%sp ; bump stack
ldi 0,%r22
add %r23,%r29,%r29 ;
addc %r22,%r28,%r28 ; The loop is unrolled twice, so if there is only 1 number
add %r25,%r29,%r29 ; then go straight to the cleanup code.
addc %r24,%r28,%r28 ;
copy %r28,%r21 CMPIB,= 1,num,bn_mul_add_words_single_top
ldi 0,%r20 FLDD -184(%sp),fw ; (-56-128) load up w into fw (fw_h/fw_l)
copy %r21,%r20
addib,= -1,%r31,L$0011 ;
stw %r29,0(0,%r26) ; This loop is unrolled 2 times (64-byte aligned as well)
copy %r20,%r25 ;
ldi 0,%r24 ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
fldws -8(0,%r1),%fr9L ; two 32-bit mutiplies can be issued per cycle.
ldw -8(0,%r2),%r19 ;
xmpyu %fr8L,%fr9L,%fr9 bn_mul_add_words_unroll2
fstds %fr9,-16(0,%r30)
copy %r19,%r23 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
ldw -16(0,%r30),%r28 FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R)
ldw -12(0,%r30),%r29 LDD 0(r_ptr),rp_val ; rp[0]
ldi 0,%r22 LDD 8(r_ptr),rp_val_1 ; rp[1]
add %r23,%r29,%r29
addc %r22,%r28,%r28 XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
add %r25,%r29,%r29 XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l
addc %r24,%r28,%r28 FSTD fm1,-16(%sp) ; -16(sp) = m1[0]
copy %r28,%r21 FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1]
ldi 0,%r20
copy %r21,%r20 XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h
addib,= -1,%r31,L$0011 XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h
stw %r29,-8(0,%r2) FSTD fm,-8(%sp) ; -8(sp) = m[0]
copy %r20,%r25 FSTD fm_1,-40(%sp) ; -40(sp) = m[1]
ldi 0,%r24
fldws -4(0,%r1),%fr9L XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
ldw -4(0,%r2),%r19 XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h
xmpyu %fr8L,%fr9L,%fr9 FSTD ht_temp,-24(%sp) ; -24(sp) = ht_temp
fstds %fr9,-16(0,%r30) FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht_temp_1
copy %r19,%r23
ldw -16(0,%r30),%r28 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
ldw -12(0,%r30),%r29 XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l
ldi 0,%r22 FSTD lt_temp,-32(%sp) ; -32(sp) = lt_temp
add %r23,%r29,%r29 FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt_temp_1
addc %r22,%r28,%r28
add %r25,%r29,%r29 LDD -8(%sp),m_0 ; m[0]
addc %r24,%r28,%r28 LDD -40(%sp),m_1 ; m[1]
copy %r28,%r21 LDD -16(%sp),m1_0 ; m1[0]
ldi 0,%r20 LDD -48(%sp),m1_1 ; m1[1]
copy %r21,%r20
addib,= -1,%r31,L$0011 LDD -24(%sp),ht_0 ; ht[0]
stw %r29,-4(0,%r2) LDD -56(%sp),ht_1 ; ht[1]
copy %r20,%r25 ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0];
ldi 0,%r24 ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1];
fldws 0(0,%r1),%fr9L
ldw 0(0,%r2),%r19 LDD -32(%sp),lt_0
xmpyu %fr8L,%fr9L,%fr9 LDD -64(%sp),lt_1
fstds %fr9,-16(0,%r30) CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m[0] < m1[0])
copy %r19,%r23 ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32)
ldw -16(0,%r30),%r28
ldw -12(0,%r30),%r29 CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m[1] < m1[1])
ldi 0,%r22 ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32)
add %r23,%r29,%r29 EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32
addc %r22,%r28,%r28 DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32
add %r25,%r29,%r29
addc %r24,%r28,%r28 EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32
copy %r28,%r21 DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32
ldi 0,%r20 ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32)
copy %r21,%r20 ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32)
addib,= -1,%r31,L$0011
stw %r29,0(0,%r2) ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0];
ldo 16(%r1),%r1 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
ldo 16(%r3),%r3 ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1];
ldo 16(%r2),%r2 ADD,DC ht_1,%r0,ht_1 ; ht[1]++
bl L$0010,0
ldo 16(%r26),%r26 ADD %ret1,lt_0,lt_0 ; lt[0] = lt[0] + c;
L$0011 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
copy %r20,%r28 ADD lt_0,rp_val,lt_0 ; lt[0] = lt[0]+rp[0]
ldw -84(0,%r30),%r2 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
ldw -60(0,%r30),%r3
bv 0(%r2) LDO -2(num),num ; num = num - 2;
ldwm -64(0,%r30),%r4 ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + ht_0 (c);
.EXIT ADD,DC ht_1,%r0,ht_1 ; ht[1]++
.PROCEND STD lt_0,0(r_ptr) ; rp[0] = lt[0]
.align 4
.EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR ADD lt_1,rp_val_1,lt_1 ; lt[1] = lt[1]+rp[1]
ADD,DC ht_1,%r0,%ret1 ; ht[1]++
LDO 16(a_ptr),a_ptr ; a_ptr += 2
STD lt_1,8(r_ptr) ; rp[1] = lt[1]
CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do
LDO 16(r_ptr),r_ptr ; r_ptr += 2
CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one
;
; Top of loop aligned on 64-byte boundary
;
; Single-word tail of bn_mul_add_words, followed by the common epilogue.
; Handles the one remaining word (or the only word when num == 1):
;   rp[0] = low64(a[0]*w + rp[0] + carry),  carry = high64(...)
; The running carry lives in %ret1 on entry and on exit. fw (w) was loaded
; from the stack earlier in the routine (FLDD -184(%sp),fw).
; The 64x64 multiply is built from four 32x32 XMPYU products that are
; bounced through stack scratch slots to reach the general registers.
bn_mul_add_words_single_top
FLDD 0(a_ptr),t_float_0 ; load a[0]: fht_0 = upper 32 bits, flt_0 = lower 32 bits
LDD 0(r_ptr),rp_val ; rp[0]
LDO 8(a_ptr),a_ptr ; a_ptr++
XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
FSTD fm1,-16(%sp) ; -16(sp) = m1
XMPYU flt_0,fw_h,fm ; m = lt*fw_h
FSTD fm,-8(%sp) ; -8(sp) = m
XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
FSTD ht_temp,-24(%sp) ; -24(sp) = ht
XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
FSTD lt_temp,-32(%sp) ; -32(sp) = lt
LDD -8(%sp),m_0 ; m = lt*fw_h (reloaded into a general register)
LDD -16(%sp),m1_0 ; m1 = temp1
ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
LDD -24(%sp),ht_0 ; ht = ht*fw_h
LDD -32(%sp),lt_0 ; lt = lt*fw_l
CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1): the 64-bit sum wrapped; skip next insn otherwise
ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
EXTRD,U tmp_0,31,32,m_0 ; m>>32
DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
ADD lt_0,m1_0,tmp_0 ; tmp_0 = lt+m1;
ADD,DC ht_0,%r0,ht_0 ; ht += carry out of the add above
ADD %ret1,tmp_0,lt_0 ; lt = lt + c;
ADD,DC ht_0,%r0,ht_0 ; ht += carry out of the add above
ADD lt_0,rp_val,lt_0 ; lt = lt+rp[0]
ADD,DC ht_0,%r0,%ret1 ; new carry (ret1) = ht + carry out of the add above
STD lt_0,0(r_ptr) ; rp[0] = lt
; Common exit: split the 64-bit carry for the 32-bit caller and restore the
; callee-save registers stored at function entry. The offsets are the entry
; slots 0..48(sp) seen after the 128-byte stack bump.
bn_mul_add_words_exit
.EXIT
EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1 (ret0 = upper 32 bits of ret1)
LDD -80(%sp),%r9 ; restore r9
LDD -88(%sp),%r8 ; restore r8
LDD -96(%sp),%r7 ; restore r7
LDD -104(%sp),%r6 ; restore r6
LDD -112(%sp),%r5 ; restore r5
LDD -120(%sp),%r4 ; restore r4
BVE (%rp) ; return; the next instruction runs in the delay slot
LDD,MB -128(%sp),%r3 ; restore r3 and pop the 128-byte frame (sp modified before the load)
.PROCEND ;in=23,24,25,26,29;out=28;
;----------------------------------------------------------------------------
;
;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
;
; arg0 = rp
; arg1 = ap
; arg2 = num
; w on stack at -56(sp)
bn_mul_words bn_mul_words
.PROC .proc
.CALLINFO FRAME=64,CALLS,SAVE_RP,ENTRY_GR=3 .callinfo frame=128
.ENTRY .entry
stw %r2,-20(0,%r30) .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
copy %r25,%r2 .align 64
stwm %r4,64(0,%r30)
copy %r24,%r19 STD %r3,0(%sp) ; save r3
ldi 0,%r28 STD %r4,8(%sp) ; save r4
stw %r23,-16(0,%r30) NOP
ldo 12(%r26),%r31 STD %r5,16(%sp) ; save r5
ldo 12(%r2),%r29
fldws -16(0,%r30),%fr8L STD %r6,24(%sp) ; save r6
L$0026 STD %r7,32(%sp) ; save r7
fldws 0(0,%r2),%fr9L COPY %r0,%ret1 ; return 0 by default
xmpyu %fr8L,%fr9L,%fr9 DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
fstds %fr9,-16(0,%r30)
copy %r28,%r21 CMPIB,>= 0,num,bn_mul_words_exit
ldi 0,%r20 LDO 128(%sp),%sp ; bump stack
ldw -16(0,%r30),%r24
ldw -12(0,%r30),%r25 ;
add %r21,%r25,%r25 ; See if only 1 word to do, thus just do cleanup
addc %r20,%r24,%r24 ;
copy %r24,%r23 CMPIB,= 1,num,bn_mul_words_single_top
ldi 0,%r22 FLDD -184(%sp),fw ; (-56-128) load up w into fw (fw_h/fw_l)
copy %r23,%r28
addib,= -1,%r19,L$0027 ;
stw %r25,0(0,%r26) ; This loop is unrolled 2 times (64-byte aligned as well)
fldws -8(0,%r29),%fr9L ;
xmpyu %fr8L,%fr9L,%fr9 ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
fstds %fr9,-16(0,%r30) ; two 32-bit mutiplies can be issued per cycle.
copy %r28,%r21 ;
ldi 0,%r20 bn_mul_words_unroll2
ldw -16(0,%r30),%r24
ldw -12(0,%r30),%r25 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
add %r21,%r25,%r25 FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R)
addc %r20,%r24,%r24 XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
copy %r24,%r23 XMPYU fht_1,fw_l,fm1_1 ; m1[1] = ht*fw_l
ldi 0,%r22
copy %r23,%r28 FSTD fm1,-16(%sp) ; -16(sp) = m1
addib,= -1,%r19,L$0027 FSTD fm1_1,-48(%sp) ; -48(sp) = m1
stw %r25,-8(0,%r31) XMPYU flt_0,fw_h,fm ; m = lt*fw_h
fldws -4(0,%r29),%fr9L XMPYU flt_1,fw_h,fm_1 ; m = lt*fw_h
xmpyu %fr8L,%fr9L,%fr9
fstds %fr9,-16(0,%r30) FSTD fm,-8(%sp) ; -8(sp) = m
copy %r28,%r21 FSTD fm_1,-40(%sp) ; -40(sp) = m
ldi 0,%r20 XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
ldw -16(0,%r30),%r24 XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp = ht*fw_h
ldw -12(0,%r30),%r25
add %r21,%r25,%r25 FSTD ht_temp,-24(%sp) ; -24(sp) = ht
addc %r20,%r24,%r24 FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht
copy %r24,%r23 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
ldi 0,%r22 XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l
copy %r23,%r28
addib,= -1,%r19,L$0027 FSTD lt_temp,-32(%sp) ; -32(sp) = lt
stw %r25,-4(0,%r31) FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt
fldws 0(0,%r29),%fr9L LDD -8(%sp),m_0
xmpyu %fr8L,%fr9L,%fr9 LDD -40(%sp),m_1
fstds %fr9,-16(0,%r30)
copy %r28,%r21 LDD -16(%sp),m1_0
ldi 0,%r20 LDD -48(%sp),m1_1
ldw -16(0,%r30),%r24 LDD -24(%sp),ht_0
ldw -12(0,%r30),%r25 LDD -56(%sp),ht_1
add %r21,%r25,%r25
addc %r20,%r24,%r24 ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m + m1;
copy %r24,%r23 ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m + m1;
ldi 0,%r22 LDD -32(%sp),lt_0
copy %r23,%r28 LDD -64(%sp),lt_1
addib,= -1,%r19,L$0027
stw %r25,0(0,%r31) CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m < m1)
ldo 16(%r29),%r29 ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
ldo 16(%r2),%r2 CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m < m1)
ldo 16(%r31),%r31 ADD,L ht_1,top_overflow,ht_1 ; ht += (1<<32)
bl L$0026,0
ldo 16(%r26),%r26 EXTRD,U tmp_0,31,32,m_0 ; m>>32
L$0027 DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
ldw -84(0,%r30),%r2 EXTRD,U tmp_1,31,32,m_1 ; m>>32
bv 0(%r2) DEPD,Z tmp_1,31,32,m1_1 ; m1 = m<<32
ldwm -64(0,%r30),%r4
.EXIT ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
.PROCEND ADD,L ht_1,m_1,ht_1 ; ht+= (m>>32)
.align 4 ADD lt_0,m1_0,lt_0 ; lt = lt+m1;
.EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR ADD,DC ht_0,%r0,ht_0 ; ht++
ADD lt_1,m1_1,lt_1 ; lt = lt+m1;
ADD,DC ht_1,%r0,ht_1 ; ht++
ADD %ret1,lt_0,lt_0 ; lt = lt + c (ret1);
ADD,DC ht_0,%r0,ht_0 ; ht++
ADD ht_0,lt_1,lt_1 ; lt = lt + c (ht_0)
ADD,DC ht_1,%r0,ht_1 ; ht++
STD lt_0,0(r_ptr) ; rp[0] = lt
STD lt_1,8(r_ptr) ; rp[1] = lt
COPY ht_1,%ret1 ; carry = ht
LDO -2(num),num ; num = num - 2;
LDO 16(a_ptr),a_ptr ; ap += 2
CMPIB,<= 2,num,bn_mul_words_unroll2
LDO 16(r_ptr),r_ptr ; rp++
CMPIB,=,N 0,num,bn_mul_words_exit ; are we done?
;
; Top of loop aligned on 64-byte boundary
;
; Single-word tail of bn_mul_words, followed by the common epilogue.
; Handles the one remaining word (or the only word when num == 1):
;   rp[0] = low64(a[0]*w + carry),  carry = high64(...)
; The running carry lives in %ret1 on entry and on exit. fw (w) was loaded
; from the stack earlier in the routine (FLDD -184(%sp),fw).
bn_mul_words_single_top
FLDD 0(a_ptr),t_float_0 ; load a[0]: fht_0 = upper 32 bits, flt_0 = lower 32 bits
XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
FSTD fm1,-16(%sp) ; -16(sp) = m1
XMPYU flt_0,fw_h,fm ; m = lt*fw_h
FSTD fm,-8(%sp) ; -8(sp) = m
XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
FSTD ht_temp,-24(%sp) ; -24(sp) = ht
XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
FSTD lt_temp,-32(%sp) ; -32(sp) = lt
LDD -8(%sp),m_0 ; m = lt*fw_h (reloaded into a general register)
LDD -16(%sp),m1_0 ; m1 = ht*fw_l
ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
LDD -24(%sp),ht_0 ; ht = ht*fw_h
LDD -32(%sp),lt_0 ; lt = lt*fw_l
CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1): the 64-bit sum wrapped; skip next insn otherwise
ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
EXTRD,U tmp_0,31,32,m_0 ; m>>32
DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
ADD lt_0,m1_0,lt_0 ; lt= lt+m1;
ADD,DC ht_0,%r0,ht_0 ; ht += carry out of the add above
ADD %ret1,lt_0,lt_0 ; lt = lt + c;
ADD,DC ht_0,%r0,ht_0 ; ht += carry out of the add above
COPY ht_0,%ret1 ; copy carry
STD lt_0,0(r_ptr) ; rp[0] = lt
; Common exit: split the 64-bit carry for the 32-bit caller and restore the
; callee-save registers r3-r7 stored at function entry (entry slots
; 0..32(sp), seen after the 128-byte stack bump).
bn_mul_words_exit
.EXIT
EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1 (ret0 = upper 32 bits of ret1)
LDD -96(%sp),%r7 ; restore r7
LDD -104(%sp),%r6 ; restore r6
LDD -112(%sp),%r5 ; restore r5
LDD -120(%sp),%r4 ; restore r4
BVE (%rp) ; return; the next instruction runs in the delay slot
LDD,MB -128(%sp),%r3 ; restore r3 and pop the 128-byte frame (sp modified before the load)
.PROCEND
;----------------------------------------------------------------------------
;
;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
;
; arg0 = rp
; arg1 = ap
; arg2 = num
;
; bn_sqr_words: for each of the num 64-bit words a[i], store the 128-bit
; square a[i]*a[i] as two words: rp[2i] = low half, rp[2i+1] = high half.
;
; Each word is split into 32-bit halves ht:lt (left/right halves of an FP
; register) and squared with three XMPYU multiplies:
;   a*a = (ht*ht << 64) + (2*ht*lt << 32) + lt*lt
; The products are bounced through stack scratch slots to reach the general
; registers. The cross term m = ht*lt is then folded in as (m << 33) into
; the low word and (m >> 31) into the high word.
;
; r3-r5 (ht_1, lt_0, lt_1) are saved on entry and restored on exit; the
; routine uses a 128-byte frame and returns nothing.
; NOTE(review): the label on the next line appears twice - presumably an
; artifact of how this listing was extracted; confirm against the real file.
bn_sqr_words bn_sqr_words
.proc
.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
.EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
.entry
.align 64
STD %r3,0(%sp) ; save r3
STD %r4,8(%sp) ; save r4
NOP ; presumably alignment padding, as in bn_mul_add_words - TODO confirm
STD %r5,16(%sp) ; save r5
CMPIB,>= 0,num,bn_sqr_words_exit ; if (num <= 0) then exit
LDO 128(%sp),%sp ; bump stack (delay slot: runs even when the branch is taken)
;
; If only 1, then go straight to cleanup
;
CMPIB,= 1,num,bn_sqr_words_single_top
DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L (delay slot: runs on both paths)
;
; This loop is unrolled 2 times (64-byte aligned as well)
;
bn_sqr_words_unroll2
FLDD 0(a_ptr),t_float_0 ; a[0]
FLDD 8(a_ptr),t_float_1 ; a[1]
XMPYU fht_0,flt_0,fm ; m[0] = ht[0]*lt[0] (cross term)
XMPYU fht_1,flt_1,fm_1 ; m[1] = ht[1]*lt[1] (cross term)
FSTD fm,-24(%sp) ; store m[0]
FSTD fm_1,-56(%sp) ; store m[1]
XMPYU flt_0,flt_0,lt_temp ; lt[0] = lt[0]*lt[0]
XMPYU flt_1,flt_1,lt_temp_1 ; lt[1] = lt[1]*lt[1]
FSTD lt_temp,-16(%sp) ; store lt[0]
FSTD lt_temp_1,-48(%sp) ; store lt[1]
XMPYU fht_0,fht_0,ht_temp ; ht[0] = ht[0]*ht[0]
XMPYU fht_1,fht_1,ht_temp_1 ; ht[1] = ht[1]*ht[1]
FSTD ht_temp,-8(%sp) ; store ht[0]
FSTD ht_temp_1,-40(%sp) ; store ht[1]
LDD -24(%sp),m_0 ; reload m[0] into a general register
LDD -56(%sp),m_1 ; reload m[1] into a general register
AND m_0,high_mask,tmp_0 ; m[0] & Mask
AND m_1,high_mask,tmp_1 ; m[1] & Mask
DEPD,Z m_0,30,31,m_0 ; m[0] << 32+1
DEPD,Z m_1,30,31,m_1 ; m[1] << 32+1
LDD -16(%sp),lt_0 ; reload lt[0]
LDD -48(%sp),lt_1 ; reload lt[1]
EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m[0]&Mask >> 32-1
EXTRD,U tmp_1,32,33,tmp_1 ; tmp_1 = m[1]&Mask >> 32-1
LDD -8(%sp),ht_0 ; reload ht[0]
LDD -40(%sp),ht_1 ; reload ht[1]
ADD,L ht_0,tmp_0,ht_0 ; ht[0] += tmp_0
ADD,L ht_1,tmp_1,ht_1 ; ht[1] += tmp_1
ADD lt_0,m_0,lt_0 ; lt[0] = lt[0]+m[0]
ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry out of the add above
STD lt_0,0(r_ptr) ; rp[0] = lt[0]
STD ht_0,8(r_ptr) ; rp[1] = ht[0]
ADD lt_1,m_1,lt_1 ; lt[1] = lt[1]+m[1]
ADD,DC ht_1,%r0,ht_1 ; ht[1] += carry out of the add above
STD lt_1,16(r_ptr) ; rp[2] = lt[1]
STD ht_1,24(r_ptr) ; rp[3] = ht[1]
LDO -2(num),num ; num = num - 2;
LDO 16(a_ptr),a_ptr ; ap += 2
CMPIB,<= 2,num,bn_sqr_words_unroll2 ; loop while at least 2 words remain
LDO 32(r_ptr),r_ptr ; rp += 4
CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done?
;
; Top of loop aligned on 64-byte boundary
;
; Single-word tail: same computation as one half of the unrolled loop.
bn_sqr_words_single_top
FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
XMPYU fht_0,flt_0,fm ; m
FSTD fm,-24(%sp) ; store m
XMPYU flt_0,flt_0,lt_temp ; lt
FSTD lt_temp,-16(%sp) ; store lt
XMPYU fht_0,fht_0,ht_temp ; ht
FSTD ht_temp,-8(%sp) ; store ht
LDD -24(%sp),m_0 ; load m
AND m_0,high_mask,tmp_0 ; m & Mask
DEPD,Z m_0,30,31,m_0 ; m << 32+1
LDD -16(%sp),lt_0 ; lt
LDD -8(%sp),ht_0 ; ht
EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m&Mask >> 32-1
ADD m_0,lt_0,lt_0 ; lt = lt+m
ADD,L ht_0,tmp_0,ht_0 ; ht += tmp_0 (ADD,L leaves the carry from the previous ADD intact)
ADD,DC ht_0,%r0,ht_0 ; ht += carry out of lt = lt+m
STD lt_0,0(r_ptr) ; rp[0] = lt
STD ht_0,8(r_ptr) ; rp[1] = ht
; Exit: restore the callee-save registers stored at entry and pop the frame.
bn_sqr_words_exit
.EXIT
LDD -112(%sp),%r5 ; restore r5
LDD -120(%sp),%r4 ; restore r4
BVE (%rp) ; return; the next instruction runs in the delay slot
LDD,MB -128(%sp),%r3 ; restore r3 and pop the 128-byte frame
.PROCEND ;in=23,24,25,26,29;out=28;
;----------------------------------------------------------------------------
;
;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
;
; arg0 = rp
; arg1 = ap
; arg2 = bp
; arg3 = n
; Scratch registers for bn_add_words (all caller-save).
t .reg %r22 ; current word of a (then a + carry-in)
b .reg %r21 ; current word of b
l .reg %r20 ; sum word written to r
; bn_add_words: r[i] = a[i] + b[i] + carry for i in 0..n-1, on 64-bit words.
; Returns the final carry, accumulated in %ret1 and split into ret0/ret1
; on exit for the 32-bit caller. Returns 0 when n <= 0.
; Leaf routine: no frame and no callee-save registers are used.
bn_add_words
.proc
.entry
.callinfo
.EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
.align 64
CMPIB,>= 0,n,bn_add_words_exit ; if (n <= 0) then exit
COPY %r0,%ret1 ; return 0 by default (delay slot: runs on both paths)
;
; If 2 or more numbers do the loop
;
CMPIB,= 1,n,bn_add_words_single_top
NOP ; nothing useful to do in the delay slot
;
; This loop is unrolled 2 times (64-byte aligned as well)
;
bn_add_words_unroll2
LDD 0(a_ptr),t
LDD 0(b_ptr),b
ADD t,%ret1,t ; t = t+c;
ADD,DC %r0,%r0,%ret1 ; set c to carry
ADD t,b,l ; l = t + b[0]
ADD,DC %ret1,%r0,%ret1 ; c+= carry
STD l,0(r_ptr) ; r[0] = l
LDD 8(a_ptr),t
LDD 8(b_ptr),b
ADD t,%ret1,t ; t = t+c;
ADD,DC %r0,%r0,%ret1 ; set c to carry
ADD t,b,l ; l = t + b[1]
ADD,DC %ret1,%r0,%ret1 ; c+= carry
STD l,8(r_ptr) ; r[1] = l
LDO -2(n),n ; n = n - 2
LDO 16(a_ptr),a_ptr ; a += 2
LDO 16(b_ptr),b_ptr ; b += 2
CMPIB,<= 2,n,bn_add_words_unroll2 ; loop while at least 2 words remain
LDO 16(r_ptr),r_ptr ; r += 2 (delay slot)
CMPIB,=,N 0,n,bn_add_words_exit ; are we done?
; Single-word tail: same computation as one half of the unrolled loop.
bn_add_words_single_top
LDD 0(a_ptr),t
LDD 0(b_ptr),b
ADD t,%ret1,t ; t = t+c;
ADD,DC %r0,%r0,%ret1 ; set c to carry (could use CMPCLR??)
ADD t,b,l ; l = t + b[0]
ADD,DC %ret1,%r0,%ret1 ; c+= carry
STD l,0(r_ptr) ; r[0] = l
bn_add_words_exit
.EXIT
BVE (%rp) ; return; the next instruction runs in the delay slot
EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1
.PROCEND ;in=23,24,25,26,29;out=28;
;----------------------------------------------------------------------------
;
;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
;
; arg0 = rp
; arg1 = ap
; arg2 = bp
; arg3 = n
; Scratch registers for bn_sub_words (all caller-save).
t1 .reg %r22 ; current word of a
t2 .reg %r21 ; current word of b
sub_tmp1 .reg %r20 ; difference word written to r
sub_tmp2 .reg %r19 ; candidate borrow: 0 if t1 > t2, else 1
; bn_sub_words: r[i] = a[i] - b[i] - borrow for i in 0..n-1, on 64-bit words.
; Returns the final borrow, accumulated in %ret1 and split into ret0/ret1
; on exit for the 32-bit caller. Returns 0 when n <= 0.
;
; Borrow rule applied after each word:
;   t1 >  t2 : borrow = 0
;   t1 <  t2 : borrow = 1
;   t1 == t2 : borrow is left unchanged
; Leaf routine: no frame and no callee-save registers are used.
bn_sub_words
.proc
.callinfo
.EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
.entry
.align 64
CMPIB,>= 0,n,bn_sub_words_exit ; if (n <= 0) then exit
COPY %r0,%ret1 ; return 0 by default (delay slot: runs on both paths)
;
; If 2 or more numbers do the loop
;
CMPIB,= 1,n,bn_sub_words_single_top
NOP ; nothing useful to do in the delay slot
;
; This loop is unrolled 2 times (64-byte aligned as well)
;
bn_sub_words_unroll2
LDD 0(a_ptr),t1
LDD 0(b_ptr),t2
SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c;
CMPCLR,*>> t1,t2,sub_tmp2 ; sub_tmp2 = 0; skip the next insn if t1 > t2
LDO 1(%r0),sub_tmp2 ; sub_tmp2 = 1 (t1 <= t2)
CMPCLR,*= t1,t2,%r0 ; if t1 == t2 skip the COPY and keep the old borrow
COPY sub_tmp2,%ret1 ; c = sub_tmp2
STD sub_tmp1,0(r_ptr) ; r[0] = t3
LDD 8(a_ptr),t1
LDD 8(b_ptr),t2
SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c;
CMPCLR,*>> t1,t2,sub_tmp2 ; sub_tmp2 = 0; skip the next insn if t1 > t2
LDO 1(%r0),sub_tmp2 ; sub_tmp2 = 1 (t1 <= t2)
CMPCLR,*= t1,t2,%r0 ; if t1 == t2 skip the COPY and keep the old borrow
COPY sub_tmp2,%ret1 ; c = sub_tmp2
STD sub_tmp1,8(r_ptr) ; r[1] = t3
LDO -2(n),n ; n = n - 2
LDO 16(a_ptr),a_ptr ; a += 2
LDO 16(b_ptr),b_ptr ; b += 2
CMPIB,<= 2,n,bn_sub_words_unroll2 ; loop while at least 2 words remain
LDO 16(r_ptr),r_ptr ; r += 2 (delay slot)
CMPIB,=,N 0,n,bn_sub_words_exit ; are we done?
; Single-word tail: same computation as one half of the unrolled loop.
bn_sub_words_single_top
LDD 0(a_ptr),t1
LDD 0(b_ptr),t2
SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c;
CMPCLR,*>> t1,t2,sub_tmp2 ; sub_tmp2 = 0; skip the next insn if t1 > t2
LDO 1(%r0),sub_tmp2 ; sub_tmp2 = 1 (t1 <= t2)
CMPCLR,*= t1,t2,%r0 ; if t1 == t2 skip the COPY and keep the old borrow
COPY sub_tmp2,%ret1 ; c = sub_tmp2
STD sub_tmp1,0(r_ptr) ; r[0] = t3
bn_sub_words_exit
.EXIT
BVE (%rp) ; return; the next instruction runs in the delay slot
EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1
.PROCEND ;in=23,24,25,26,29;out=28;
;------------------------------------------------------------------------------
;
; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d)
;
; arg0 = h
; arg1 = l
; arg2 = d
;
; This is mainly just output from the HP C compiler.
;
;------------------------------------------------------------------------------
bn_div_words
.PROC .PROC
.CALLINFO FRAME=0,NO_CALLS .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR,LONG_RETURN
.ENTRY .IMPORT BN_num_bits_word,CODE
ldo 28(%r26),%r19 .IMPORT __iob,DATA
ldo 12(%r25),%r28 .IMPORT fprintf,CODE
L$0042 .IMPORT abort,CODE
fldws 0(0,%r25),%fr8L .IMPORT $$div2U,MILLICODE
fldws 0(0,%r25),%fr8R .CALLINFO CALLER,FRAME=144,ENTRY_GR=%r9,SAVE_RP,ARGS_SAVED,ORDERING_AWARE
xmpyu %fr8L,%fr8R,%fr8 .ENTRY
fstds %fr8,-16(0,%r30) STW %r2,-20(%r30) ;offset 0x8ec
ldw -16(0,%r30),%r22 STW,MA %r3,192(%r30) ;offset 0x8f0
ldw -12(0,%r30),%r23 STW %r4,-188(%r30) ;offset 0x8f4
stw %r23,0(0,%r26) DEPD %r5,31,32,%r6 ;offset 0x8f8
copy %r22,%r21 STD %r6,-184(%r30) ;offset 0x8fc
ldi 0,%r20 DEPD %r7,31,32,%r8 ;offset 0x900
addib,= -1,%r24,L$0049 STD %r8,-176(%r30) ;offset 0x904
stw %r21,-24(0,%r19) STW %r9,-168(%r30) ;offset 0x908
fldws -8(0,%r28),%fr8L LDD -248(%r30),%r3 ;offset 0x90c
fldws -8(0,%r28),%fr8R COPY %r26,%r4 ;offset 0x910
xmpyu %fr8L,%fr8R,%fr8 COPY %r24,%r5 ;offset 0x914
fstds %fr8,-16(0,%r30) DEPD %r25,31,32,%r4 ;offset 0x918
ldw -16(0,%r30),%r22 CMPB,*<> %r3,%r0,$0006000C ;offset 0x91c
ldw -12(0,%r30),%r23 DEPD %r23,31,32,%r5 ;offset 0x920
stw %r23,-20(0,%r19) MOVIB,TR -1,%r29,$00060002 ;offset 0x924
copy %r22,%r21 EXTRD,U %r29,31,32,%r28 ;offset 0x928
ldi 0,%r20 $0006002A
addib,= -1,%r24,L$0049 LDO -1(%r29),%r29 ;offset 0x92c
stw %r21,-16(0,%r19) SUB %r23,%r7,%r23 ;offset 0x930
fldws -4(0,%r28),%fr8L $00060024
fldws -4(0,%r28),%fr8R SUB %r4,%r31,%r25 ;offset 0x934
xmpyu %fr8L,%fr8R,%fr8 AND %r25,%r19,%r26 ;offset 0x938
fstds %fr8,-16(0,%r30) CMPB,*<>,N %r0,%r26,$00060046 ;offset 0x93c
ldw -16(0,%r30),%r22 DEPD,Z %r25,31,32,%r20 ;offset 0x940
ldw -12(0,%r30),%r23 OR %r20,%r24,%r21 ;offset 0x944
stw %r23,-12(0,%r19) CMPB,*<<,N %r21,%r23,$0006002A ;offset 0x948
copy %r22,%r21 SUB %r31,%r2,%r31 ;offset 0x94c
ldi 0,%r20 $00060046
addib,= -1,%r24,L$0049 $0006002E
stw %r21,-8(0,%r19) DEPD,Z %r23,31,32,%r25 ;offset 0x950
fldws 0(0,%r28),%fr8L EXTRD,U %r23,31,32,%r26 ;offset 0x954
fldws 0(0,%r28),%fr8R AND %r25,%r19,%r24 ;offset 0x958
xmpyu %fr8L,%fr8R,%fr8 ADD,L %r31,%r26,%r31 ;offset 0x95c
fstds %fr8,-16(0,%r30) CMPCLR,*>>= %r5,%r24,%r0 ;offset 0x960
ldw -16(0,%r30),%r22 LDO 1(%r31),%r31 ;offset 0x964
ldw -12(0,%r30),%r23 $00060032
stw %r23,-4(0,%r19) CMPB,*<<=,N %r31,%r4,$00060036 ;offset 0x968
copy %r22,%r21 LDO -1(%r29),%r29 ;offset 0x96c
ldi 0,%r20 ADD,L %r4,%r3,%r4 ;offset 0x970
addib,= -1,%r24,L$0049 $00060036
stw %r21,0(0,%r19) ADDIB,=,N -1,%r8,$D0 ;offset 0x974
ldo 16(%r28),%r28 SUB %r5,%r24,%r28 ;offset 0x978
ldo 16(%r25),%r25 $0006003A
ldo 32(%r19),%r19 SUB %r4,%r31,%r24 ;offset 0x97c
bl L$0042,0 SHRPD %r24,%r28,32,%r4 ;offset 0x980
ldo 32(%r26),%r26 DEPD,Z %r29,31,32,%r9 ;offset 0x984
L$0049 DEPD,Z %r28,31,32,%r5 ;offset 0x988
bv,n 0(%r2) $0006001C
.EXIT EXTRD,U %r4,31,32,%r31 ;offset 0x98c
.PROCEND CMPB,*<>,N %r31,%r2,$00060020 ;offset 0x990
.IMPORT BN_num_bits_word,CODE MOVB,TR %r6,%r29,$D1 ;offset 0x994
.IMPORT fprintf,CODE STD %r29,-152(%r30) ;offset 0x998
.IMPORT __iob,DATA $0006000C
.SPACE $TEXT$ EXTRD,U %r3,31,32,%r25 ;offset 0x99c
.SUBSPA $LIT$ COPY %r3,%r26 ;offset 0x9a0
EXTRD,U %r3,31,32,%r9 ;offset 0x9a4
.align 4 EXTRD,U %r4,31,32,%r8 ;offset 0x9a8
L$C0000 .CALL ARGW0=GR,ARGW1=GR,RTNVAL=GR ;in=25,26;out=28;
.STRING "Division would overflow (%d)\x0a\x00" B,L BN_num_bits_word,%r2 ;offset 0x9ac
.IMPORT abort,CODE EXTRD,U %r5,31,32,%r7 ;offset 0x9b0
.SPACE $TEXT$ LDI 64,%r20 ;offset 0x9b4
.SUBSPA $CODE$ DEPD %r7,31,32,%r5 ;offset 0x9b8
DEPD %r8,31,32,%r4 ;offset 0x9bc
.align 4 DEPD %r9,31,32,%r3 ;offset 0x9c0
.EXPORT bn_div64,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR CMPB,= %r28,%r20,$00060012 ;offset 0x9c4
bn_div64 COPY %r28,%r24 ;offset 0x9c8
MTSARCM %r24 ;offset 0x9cc
DEPDI,Z -1,%sar,1,%r19 ;offset 0x9d0
CMPB,*>>,N %r4,%r19,$D2 ;offset 0x9d4
$00060012
SUBI 64,%r24,%r31 ;offset 0x9d8
CMPCLR,*<< %r4,%r3,%r0 ;offset 0x9dc
SUB %r4,%r3,%r4 ;offset 0x9e0
$00060016
CMPB,= %r31,%r0,$0006001A ;offset 0x9e4
COPY %r0,%r9 ;offset 0x9e8
MTSARCM %r31 ;offset 0x9ec
DEPD,Z %r3,%sar,64,%r3 ;offset 0x9f0
SUBI 64,%r31,%r26 ;offset 0x9f4
MTSAR %r26 ;offset 0x9f8
SHRPD %r4,%r5,%sar,%r4 ;offset 0x9fc
MTSARCM %r31 ;offset 0xa00
DEPD,Z %r5,%sar,64,%r5 ;offset 0xa04
$0006001A
DEPDI,Z -1,31,32,%r19 ;offset 0xa08
AND %r3,%r19,%r29 ;offset 0xa0c
EXTRD,U %r29,31,32,%r2 ;offset 0xa10
DEPDI,Z -1,63,32,%r6 ;offset 0xa14
MOVIB,TR 2,%r8,$0006001C ;offset 0xa18
EXTRD,U %r3,63,32,%r7 ;offset 0xa1c
$D2
ADDIL LR'__iob-$global$,%r27,%r1 ;offset 0xa20
LDIL LR'C$7,%r21 ;offset 0xa24
LDO RR'__iob-$global$+32(%r1),%r26 ;offset 0xa28
.CALL ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR ;in=24,25,26;out=28;
B,L fprintf,%r2 ;offset 0xa2c
LDO RR'C$7(%r21),%r25 ;offset 0xa30
.CALL ;
B,L abort,%r2 ;offset 0xa34
NOP ;offset 0xa38
B $D3 ;offset 0xa3c
LDW -212(%r30),%r2 ;offset 0xa40
$00060020
COPY %r4,%r26 ;offset 0xa44
EXTRD,U %r4,31,32,%r25 ;offset 0xa48
COPY %r2,%r24 ;offset 0xa4c
.CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL)
B,L $$div2U,%r31 ;offset 0xa50
EXTRD,U %r2,31,32,%r23 ;offset 0xa54
DEPD %r28,31,32,%r29 ;offset 0xa58
$00060022
STD %r29,-152(%r30) ;offset 0xa5c
$D1
AND %r5,%r19,%r24 ;offset 0xa60
EXTRD,U %r24,31,32,%r24 ;offset 0xa64
STW %r2,-160(%r30) ;offset 0xa68
STW %r7,-128(%r30) ;offset 0xa6c
FLDD -152(%r30),%fr4 ;offset 0xa70
FLDD -152(%r30),%fr7 ;offset 0xa74
FLDW -160(%r30),%fr8L ;offset 0xa78
FLDW -128(%r30),%fr5L ;offset 0xa7c
XMPYU %fr8L,%fr7L,%fr10 ;offset 0xa80
FSTD %fr10,-136(%r30) ;offset 0xa84
XMPYU %fr8L,%fr7R,%fr22 ;offset 0xa88
FSTD %fr22,-144(%r30) ;offset 0xa8c
XMPYU %fr5L,%fr4L,%fr11 ;offset 0xa90
XMPYU %fr5L,%fr4R,%fr23 ;offset 0xa94
FSTD %fr11,-112(%r30) ;offset 0xa98
FSTD %fr23,-120(%r30) ;offset 0xa9c
LDD -136(%r30),%r28 ;offset 0xaa0
DEPD,Z %r28,31,32,%r31 ;offset 0xaa4
LDD -144(%r30),%r20 ;offset 0xaa8
ADD,L %r20,%r31,%r31 ;offset 0xaac
LDD -112(%r30),%r22 ;offset 0xab0
DEPD,Z %r22,31,32,%r22 ;offset 0xab4
LDD -120(%r30),%r21 ;offset 0xab8
B $00060024 ;offset 0xabc
ADD,L %r21,%r22,%r23 ;offset 0xac0
$D0
OR %r9,%r29,%r29 ;offset 0xac4
$00060040
EXTRD,U %r29,31,32,%r28 ;offset 0xac8
$00060002
$L2
LDW -212(%r30),%r2 ;offset 0xacc
$D3
LDW -168(%r30),%r9 ;offset 0xad0
LDD -176(%r30),%r8 ;offset 0xad4
EXTRD,U %r8,31,32,%r7 ;offset 0xad8
LDD -184(%r30),%r6 ;offset 0xadc
EXTRD,U %r6,31,32,%r5 ;offset 0xae0
LDW -188(%r30),%r4 ;offset 0xae4
BVE (%r2) ;offset 0xae8
.EXIT
LDW,MB -192(%r30),%r3 ;offset 0xaec
.PROCEND ;in=23,25;out=28,29;fpin=105,107;
;----------------------------------------------------------------------------
;
; Register aliases used by the comba routines and their helper macros.
;
; Registers to hold 64-bit values to manipulate. The "L" part
; of the register corresponds to the upper 32 bits, while the "R"
; part corresponds to the lower 32 bits.
;
; Note that when using b6 and b7, the code must save these before
; using them because they are callee-save registers (%fr12/%fr13).
;
;
; Floating point registers that hold the operand words being
; manipulated. These don't collide with ftemp1-ftemp4 and
; are all caller-save registers.
;
a0 .reg %fr22
a0L .reg %fr22L
a0R .reg %fr22R
a1 .reg %fr23
a1L .reg %fr23L
a1R .reg %fr23R
a2 .reg %fr24
a2L .reg %fr24L
a2R .reg %fr24R
a3 .reg %fr25
a3L .reg %fr25L
a3R .reg %fr25R
a4 .reg %fr26
a4L .reg %fr26L
a4R .reg %fr26R
a5 .reg %fr27
a5L .reg %fr27L
a5R .reg %fr27R
a6 .reg %fr28
a6L .reg %fr28L
a6R .reg %fr28R
a7 .reg %fr29
a7L .reg %fr29L
a7R .reg %fr29R
b0 .reg %fr30
b0L .reg %fr30L
b0R .reg %fr30R
b1 .reg %fr31
b1L .reg %fr31L
b1R .reg %fr31R
;
; Temporary floating point variables that receive the XMPYU partial
; products inside the macros; these are all caller-save registers.
;
ftemp1 .reg %fr4
ftemp2 .reg %fr5
ftemp3 .reg %fr6
ftemp4 .reg %fr7
;
; The B set of registers when used (b6/b7 are the callee-save ones).
;
b2 .reg %fr8
b2L .reg %fr8L
b2R .reg %fr8R
b3 .reg %fr9
b3L .reg %fr9L
b3R .reg %fr9R
b4 .reg %fr10
b4L .reg %fr10L
b4R .reg %fr10R
b5 .reg %fr11
b5L .reg %fr11L
b5R .reg %fr11R
b6 .reg %fr12
b6L .reg %fr12L
b6R .reg %fr12R
b7 .reg %fr13
b7L .reg %fr13L
b7R .reg %fr13R
;
; Integer registers. c1/c2/c3 are the three accumulator words that the
; comba routines rotate through the C1,C2,C3 macro parameters. ht, lt,
; m and c3 live in %r3-%r6, which the comba routines save on entry and
; restore on exit.
;
c1 .reg %r21 ; only reg
temp1 .reg %r20 ; only reg
temp2 .reg %r19 ; only reg
temp3 .reg %r31 ; only reg
m1 .reg %r28 ; second cross product in SQR_ADD_C2 / MUL_ADD_C
c2 .reg %r23 ; accumulator word
high_one .reg %r1 ; constant 1 << 32, built with DEPDI,Z at routine entry
ht .reg %r6 ; high partial product
lt .reg %r5 ; low partial product
m .reg %r4 ; cross (middle) partial product
c3 .reg %r3 ; accumulator word
;
; SQR_ADD_C A0L,A0R,C1,C2,C3
;
; Squares the 64-bit value whose upper 32 bits are in A0L and lower
; 32 bits in A0R, and adds the 128-bit result into the accumulator
; words C1 (low), C2 (high), with the final carry going into C3.
;
; Uses ftemp1-ftemp3, m, lt, ht, temp1-temp3 and the scratch slots
; -24(%sp), -16(%sp) and -8(%sp). high_mask must already hold
; 0xffffffff80000000.
;
SQR_ADD_C .macro A0L,A0R,C1,C2,C3
XMPYU A0L,A0R,ftemp1 ; m = hi*lo
FSTD ftemp1,-24(%sp) ; store m
XMPYU A0R,A0R,ftemp2 ; lt = lo*lo
FSTD ftemp2,-16(%sp) ; store lt
XMPYU A0L,A0L,ftemp3 ; ht = hi*hi
FSTD ftemp3,-8(%sp) ; store ht
LDD -24(%sp),m ; load m
AND m,high_mask,temp2 ; temp2 = m & Mask (upper 33 bits of m)
DEPD,Z m,30,31,temp3 ; temp3 = m << 32+1 (low part of 2*m, moved up)
LDD -16(%sp),lt ; lt
LDD -8(%sp),ht ; ht
EXTRD,U temp2,32,33,temp1 ; temp1 = (m & Mask) >> 32-1 (high part of 2*m)
ADD temp3,lt,lt ; lt = lt + (2*m << 32), sets the carry
ADD,L ht,temp1,ht ; ht += temp1 (ADD,L does not touch the carry)
ADD,DC ht,%r0,ht ; ht += carry from the lt addition
ADD C1,lt,C1 ; c1 = c1 + lt
ADD,DC ht,%r0,ht ; ht += carry from the c1 addition
ADD C2,ht,C2 ; c2 = c2 + ht
ADD,DC C3,%r0,C3 ; c3 += carry from the c2 addition
.endm
;
; SQR_ADD_C2 A0L,A0R,A1L,A1R,C1,C2,C3
;
; Multiplies the two 64-bit values (A0L:A0R) and (A1L:A1R), doubles the
; 128-bit product and adds it into the accumulator words C1 (low),
; C2 (high), with carries going into C3.
;
; Uses ftemp1-ftemp4, m, m1, lt, ht, temp1, temp3 and the scratch slots
; -32(%sp) .. -8(%sp). high_one must already hold 1 << 32.
;
SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3
XMPYU A0L,A1R,ftemp1 ; m1 = a0.hi * a1.lo
FSTD ftemp1,-16(%sp) ;
XMPYU A0R,A1L,ftemp2 ; m = a0.lo * a1.hi
FSTD ftemp2,-8(%sp) ;
XMPYU A0R,A1R,ftemp3 ; lt = a0.lo * a1.lo
FSTD ftemp3,-32(%sp)
XMPYU A0L,A1L,ftemp4 ; ht = a0.hi * a1.hi
FSTD ftemp4,-24(%sp) ;
LDD -8(%sp),m ; m (register %r4)
LDD -16(%sp),m1 ; m1 (register %r28)
ADD,L m,m1,m ; m = m + m1
DEPD,Z m,31,32,temp3 ; temp3 = (m+m1) << 32
LDD -24(%sp),ht ; ht (register %r6)
CMPCLR,*>>= m,m1,%r0 ; skip the next insn unless the sum wrapped (m < m1)
ADD,L ht,high_one,ht ; ht += high_one (carry out of m+m1)
EXTRD,U m,31,32,temp1 ; temp1 = m >> 32
LDD -32(%sp),lt ; lt
ADD,L ht,temp1,ht ; ht += m>>32
ADD lt,temp3,lt ; lt = lt + (m << 32), sets the carry
ADD,DC ht,%r0,ht ; ht += carry
ADD ht,ht,ht ; ht=ht+ht;
ADD,DC C3,%r0,C3 ; add in carry (c3++)
ADD lt,lt,lt ; lt=lt+lt;
ADD,DC ht,%r0,ht ; add in carry (ht++)
ADD C1,lt,C1 ; c1=c1+lt
ADD,DC,*NUV ht,%r0,ht ; add in carry (ht++); nullify next insn if no overflow
LDO 1(C3),C3 ; bump c3 if overflow,nullify otherwise
ADD C2,ht,C2 ; c2 = c2 + ht
ADD,DC C3,%r0,C3 ; add in carry (c3++)
.endm
;
;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
; arg0 = r_ptr
; arg1 = a_ptr
;
; Writes the 16-word square of the 8-word number a[0..7] to r[0..15].
; The result is produced one output word at a time: every product that
; contributes to that word is added into a three-word accumulator
; (c1,c2,c3, rotated for each output word) by SQR_ADD_C (a[i]*a[i]) or
; SQR_ADD_C2 (2*a[i]*a[j]); the low accumulator word is then stored and
; cleared.
;
; %r3-%r6 (c3, m, lt, ht) are callee-save and are saved in the first
; 32 bytes of the 128-byte frame. The macros use -32(%sp)..-8(%sp) as
; scratch after the stack pointer has been bumped.
;
; (This listing previously had the instructions of the old 32-bit
; bn_div64 routine fused onto the front of each line; only the
; bn_sqr_comba8 instructions are kept here.)
;
bn_sqr_comba8
.PROC
.CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
.EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
.ENTRY
.align 64
STD %r3,0(%sp) ; save r3
STD %r4,8(%sp) ; save r4
STD %r5,16(%sp) ; save r5
STD %r6,24(%sp) ; save r6
;
; Zero out carries
;
COPY %r0,c1
COPY %r0,c2
COPY %r0,c3
LDO 128(%sp),%sp ; bump stack
DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
;
; Load up all of the values we are going to use
;
FLDD 0(a_ptr),a0
FLDD 8(a_ptr),a1
FLDD 16(a_ptr),a2
FLDD 24(a_ptr),a3
FLDD 32(a_ptr),a4
FLDD 40(a_ptr),a5
FLDD 48(a_ptr),a6
FLDD 56(a_ptr),a7
SQR_ADD_C a0L,a0R,c1,c2,c3
STD c1,0(r_ptr) ; r[0] = c1;
COPY %r0,c1
SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
STD c2,8(r_ptr) ; r[1] = c2;
COPY %r0,c2
SQR_ADD_C a1L,a1R,c3,c1,c2
SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
STD c3,16(r_ptr) ; r[2] = c3;
COPY %r0,c3
SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
STD c1,24(r_ptr) ; r[3] = c1;
COPY %r0,c1
SQR_ADD_C a2L,a2R,c2,c3,c1
SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1
STD c2,32(r_ptr) ; r[4] = c2;
COPY %r0,c2
SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2
SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2
SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
STD c3,40(r_ptr) ; r[5] = c3;
COPY %r0,c3
SQR_ADD_C a3L,a3R,c1,c2,c3
SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3
SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3
SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3
STD c1,48(r_ptr) ; r[6] = c1;
COPY %r0,c1
SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1
SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1
SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1
SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1
STD c2,56(r_ptr) ; r[7] = c2;
COPY %r0,c2
SQR_ADD_C a4L,a4R,c3,c1,c2
SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2
SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2
SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2
STD c3,64(r_ptr) ; r[8] = c3;
COPY %r0,c3
SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3
SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3
SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3
STD c1,72(r_ptr) ; r[9] = c1;
COPY %r0,c1
SQR_ADD_C a5L,a5R,c2,c3,c1
SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1
SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1
STD c2,80(r_ptr) ; r[10] = c2;
COPY %r0,c2
SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2
SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2
STD c3,88(r_ptr) ; r[11] = c3;
COPY %r0,c3
SQR_ADD_C a6L,a6R,c1,c2,c3
SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3
STD c1,96(r_ptr) ; r[12] = c1;
COPY %r0,c1
SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1
STD c2,104(r_ptr) ; r[13] = c2;
COPY %r0,c2
SQR_ADD_C a7L,a7R,c3,c1,c2
STD c3, 112(r_ptr) ; r[14] = c3
STD c1, 120(r_ptr) ; r[15] = c1
.EXIT
LDD -104(%sp),%r6 ; restore r6
LDD -112(%sp),%r5 ; restore r5
LDD -120(%sp),%r4 ; restore r4
BVE (%rp)
LDD,MB -128(%sp),%r3 ; (delay slot) restore r3 and pop the frame
.PROCEND
;-----------------------------------------------------------------------------
;
;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
; arg0 = r_ptr
; arg1 = a_ptr
;
; Writes the 8-word square of the 4-word number a[0..3] to r[0..7],
; one output word at a time, using the three-word accumulator
; c1,c2,c3 with SQR_ADD_C (a[i]*a[i]) and SQR_ADD_C2 (2*a[i]*a[j]).
;
; Only a[0..3] are loaded: the input has four words, so offsets
; 32..56 from a_ptr lie outside it and a4-a7 are never used here.
;
; %r3-%r6 (c3, m, lt, ht) are callee-save and are saved in the first
; 32 bytes of the 128-byte frame.
;
bn_sqr_comba4
.proc
.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
.EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
.entry
.align 64
STD %r3,0(%sp) ; save r3
STD %r4,8(%sp) ; save r4
STD %r5,16(%sp) ; save r5
STD %r6,24(%sp) ; save r6
;
; Zero out carries
;
COPY %r0,c1
COPY %r0,c2
COPY %r0,c3
LDO 128(%sp),%sp ; bump stack
DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
;
; Load up all of the values we are going to use
;
FLDD 0(a_ptr),a0
FLDD 8(a_ptr),a1
FLDD 16(a_ptr),a2
FLDD 24(a_ptr),a3
SQR_ADD_C a0L,a0R,c1,c2,c3
STD c1,0(r_ptr) ; r[0] = c1;
COPY %r0,c1
SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
STD c2,8(r_ptr) ; r[1] = c2;
COPY %r0,c2
SQR_ADD_C a1L,a1R,c3,c1,c2
SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
STD c3,16(r_ptr) ; r[2] = c3;
COPY %r0,c3
SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
STD c1,24(r_ptr) ; r[3] = c1;
COPY %r0,c1
SQR_ADD_C a2L,a2R,c2,c3,c1
SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
STD c2,32(r_ptr) ; r[4] = c2;
COPY %r0,c2
SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
STD c3,40(r_ptr) ; r[5] = c3;
COPY %r0,c3
SQR_ADD_C a3L,a3R,c1,c2,c3
STD c1,48(r_ptr) ; r[6] = c1;
STD c2,56(r_ptr) ; r[7] = c2;
.EXIT
LDD -104(%sp),%r6 ; restore r6
LDD -112(%sp),%r5 ; restore r5
LDD -120(%sp),%r4 ; restore r4
BVE (%rp)
LDD,MB -128(%sp),%r3 ; (delay slot) restore r3 and pop the frame
.PROCEND
;---------------------------------------------------------------------------
;
; MUL_ADD_C A0L,A0R,B0L,B0R,C1,C2,C3
;
; Multiplies the two 64-bit values (A0L:A0R) and (B0L:B0R) and adds the
; 128-bit product into the accumulator words C1 (low), C2 (high), with
; the final carry going into C3.
;
; Uses ftemp1-ftemp4, m, m1, lt, ht, temp1, temp3 and the scratch slots
; -32(%sp) .. -8(%sp). high_one must already hold 1 << 32.
;
MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3
XMPYU A0L,B0R,ftemp1 ; m1 = a.hi * b.lo
FSTD ftemp1,-16(%sp) ;
XMPYU A0R,B0L,ftemp2 ; m = a.lo * b.hi
FSTD ftemp2,-8(%sp) ;
XMPYU A0R,B0R,ftemp3 ; lt = a.lo * b.lo
FSTD ftemp3,-32(%sp)
XMPYU A0L,B0L,ftemp4 ; ht = a.hi * b.hi
FSTD ftemp4,-24(%sp) ;
LDD -8(%sp),m ; m (register %r4)
LDD -16(%sp),m1 ; m1 (register %r28)
ADD,L m,m1,m ; m = m + m1
DEPD,Z m,31,32,temp3 ; temp3 = (m+m1) << 32
LDD -24(%sp),ht ; ht (register %r6)
CMPCLR,*>>= m,m1,%r0 ; skip the next insn unless the sum wrapped (m < m1)
ADD,L ht,high_one,ht ; ht += high_one (carry out of m+m1)
EXTRD,U m,31,32,temp1 ; temp1 = m >> 32
LDD -32(%sp),lt ; lt
ADD,L ht,temp1,ht ; ht += m>>32
ADD lt,temp3,lt ; lt = lt + (m << 32), sets the carry
ADD,DC ht,%r0,ht ; ht += carry
ADD C1,lt,C1 ; c1=c1+lt
ADD,DC ht,%r0,ht ; ht += carry from the c1 addition
ADD C2,ht,C2 ; c2 = c2 + ht
ADD,DC C3,%r0,C3 ; add in carry (c3++)
.endm
;
;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
; arg0 = r_ptr
; arg1 = a_ptr
; arg2 = b_ptr
;
; Writes the 16-word product of the 8-word numbers a[0..7] and b[0..7]
; to r[0..15]. For each output word k, every a[i]*b[j] with i+j == k is
; added into the three-word accumulator (c1,c2,c3, rotated for each
; output word) by MUL_ADD_C; the low accumulator word is then stored
; and cleared.
;
; %r3-%r6 and %fr12/%fr13 (b6/b7) are callee-save and are saved in the
; first 48 bytes of the 128-byte frame before the stack is bumped.
;
bn_mul_comba8
.proc
.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
.EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
.entry
.align 64
STD %r3,0(%sp) ; save r3
STD %r4,8(%sp) ; save r4
STD %r5,16(%sp) ; save r5
STD %r6,24(%sp) ; save r6
FSTD %fr12,32(%sp) ; save fr12 (b6)
FSTD %fr13,40(%sp) ; save fr13 (b7)
;
; Zero out carries
;
COPY %r0,c1
COPY %r0,c2
COPY %r0,c3
LDO 128(%sp),%sp ; bump stack
DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
;
; Load up all of the values we are going to use
;
FLDD 0(a_ptr),a0
FLDD 8(a_ptr),a1
FLDD 16(a_ptr),a2
FLDD 24(a_ptr),a3
FLDD 32(a_ptr),a4
FLDD 40(a_ptr),a5
FLDD 48(a_ptr),a6
FLDD 56(a_ptr),a7
FLDD 0(b_ptr),b0
FLDD 8(b_ptr),b1
FLDD 16(b_ptr),b2
FLDD 24(b_ptr),b3
FLDD 32(b_ptr),b4
FLDD 40(b_ptr),b5
FLDD 48(b_ptr),b6
FLDD 56(b_ptr),b7
MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
STD c1,0(r_ptr) ; r[0] = c1
COPY %r0,c1
MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
STD c2,8(r_ptr) ; r[1] = c2
COPY %r0,c2
MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
STD c3,16(r_ptr) ; r[2] = c3
COPY %r0,c3
MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
STD c1,24(r_ptr) ; r[3] = c1
COPY %r0,c1
MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1
MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1
STD c2,32(r_ptr) ; r[4] = c2
COPY %r0,c2
MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2
MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2
MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2
MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2
STD c3,40(r_ptr) ; r[5] = c3
COPY %r0,c3
MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3
MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3
MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3
MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3
MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3
MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3
STD c1,48(r_ptr) ; r[6] = c1
COPY %r0,c1
MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1
MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1
MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1
MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1
MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1
MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1
MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1
MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1
STD c2,56(r_ptr) ; r[7] = c2
COPY %r0,c2
MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2
MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2
MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2
MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2
MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2
MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2
MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2
STD c3,64(r_ptr) ; r[8] = c3
COPY %r0,c3
MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3
MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3
MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3
MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3
MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3
MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3
STD c1,72(r_ptr) ; r[9] = c1
COPY %r0,c1
MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1
MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1
MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1
MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1
MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1
STD c2,80(r_ptr) ; r[10] = c2
COPY %r0,c2
MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2
MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2
MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2
MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2
STD c3,88(r_ptr) ; r[11] = c3
COPY %r0,c3
MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3
MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3
MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3
STD c1,96(r_ptr) ; r[12] = c1
COPY %r0,c1
MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1
MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1
STD c2,104(r_ptr) ; r[13] = c2
COPY %r0,c2
MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2
STD c3,112(r_ptr) ; r[14] = c3
STD c1,120(r_ptr) ; r[15] = c1
.EXIT
FLDD -88(%sp),%fr13 ; restore fr13 (b7)
FLDD -96(%sp),%fr12 ; restore fr12 (b6)
LDD -104(%sp),%r6 ; restore r6
LDD -112(%sp),%r5 ; restore r5
LDD -120(%sp),%r4 ; restore r4
BVE (%rp)
LDD,MB -128(%sp),%r3 ; (delay slot) restore r3 and pop the frame
.PROCEND
;-----------------------------------------------------------------------------
;
;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
; arg0 = r_ptr
; arg1 = a_ptr
; arg2 = b_ptr
;
; Writes the 8-word product of the 4-word numbers a[0..3] and b[0..3]
; to r[0..7]. For each output word k, every a[i]*b[j] with i+j == k is
; added into the three-word accumulator (c1,c2,c3, rotated for each
; output word) by MUL_ADD_C; the low accumulator word is then stored
; and cleared.
;
; %r3-%r6 are callee-save and are saved in the 128-byte frame.
; %fr12/%fr13 are saved and restored as well, although this routine
; only loads b0-b3 and never writes them.
;
bn_mul_comba4
.proc
.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
.EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
.entry
.align 64
STD %r3,0(%sp) ; save r3
STD %r4,8(%sp) ; save r4
STD %r5,16(%sp) ; save r5
STD %r6,24(%sp) ; save r6
FSTD %fr12,32(%sp) ; save fr12
FSTD %fr13,40(%sp) ; save fr13
;
; Zero out carries
;
COPY %r0,c1
COPY %r0,c2
COPY %r0,c3
LDO 128(%sp),%sp ; bump stack
DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
;
; Load up all of the values we are going to use
;
FLDD 0(a_ptr),a0
FLDD 8(a_ptr),a1
FLDD 16(a_ptr),a2
FLDD 24(a_ptr),a3
FLDD 0(b_ptr),b0
FLDD 8(b_ptr),b1
FLDD 16(b_ptr),b2
FLDD 24(b_ptr),b3
MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
STD c1,0(r_ptr) ; r[0] = c1
COPY %r0,c1
MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
STD c2,8(r_ptr) ; r[1] = c2
COPY %r0,c2
MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
STD c3,16(r_ptr) ; r[2] = c3
COPY %r0,c3
MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
STD c1,24(r_ptr) ; r[3] = c1
COPY %r0,c1
MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
STD c2,32(r_ptr) ; r[4] = c2
COPY %r0,c2
MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
STD c3,40(r_ptr) ; r[5] = c3
COPY %r0,c3
MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
STD c1,48(r_ptr) ; r[6] = c1
STD c2,56(r_ptr) ; r[7] = c2
.EXIT
FLDD -88(%sp),%fr13 ; restore fr13
FLDD -96(%sp),%fr12 ; restore fr12
LDD -104(%sp),%r6 ; restore r6
LDD -112(%sp),%r5 ; restore r5
LDD -120(%sp),%r4 ; restore r4
BVE (%rp)
LDD,MB -128(%sp),%r3 ; (delay slot) restore r3 and pop the frame
.PROCEND
.SPACE $TEXT$
.SUBSPA $CODE$
.SPACE $PRIVATE$,SORT=16
.IMPORT $global$,DATA
.SPACE $TEXT$
.SUBSPA $CODE$
.SUBSPA $LIT$,QUAD=0,ALIGN=8,ACCESS=0x2c,SORT=16
; NUL-terminated message literal; its address is loaded into %r25
; (LDO RR'C$7) just before the call to abort earlier in this file.
C$7
.ALIGN 8
.STRINGZ "Division would overflow (%d)\n"
.END
.SPACE $PRIVATE$
.SUBSPA $DATA$,QUAD=1,ALIGN=8,ACCESS=31
.SUBSPA $BSS$,QUAD=1,ALIGN=8,ACCESS=31,ZERO,SORT=82
.SPACE $TEXT$
.SUBSPA $LIT$,QUAD=0,ALIGN=8,ACCESS=44
.SUBSPA $CODE$,QUAD=0,ALIGN=8,ACCESS=44,CODE_ONLY
.IMPORT $global$,DATA
.IMPORT $$dyncall,MILLICODE
; gcc_compiled.:
.SPACE $TEXT$
.SUBSPA $CODE$
;
; bn_mul_add_words (32-bit words, compiler-generated code)
;
; For each of the num words: rp[i] = low32(ap[i]*w + rp[i] + carry),
; carry = high32(...). Returns the final carry in %r28.
;
; %r26 = rp (ARGW0), %r25 = ap (ARGW1), %r24 = num (ARGW2), %r23 = w (ARGW3)
;
; The loop body is unrolled four times; the word count in %r31 is
; decremented and tested after every word.
; NOTE(review): num is only tested for reaching zero after a decrement,
; so there is no early exit for num <= 0 -- confirm callers never pass it.
;
.align 4
.EXPORT bn_mul_add_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR
bn_mul_add_words
.PROC
.CALLINFO FRAME=64,CALLS,SAVE_RP,ENTRY_GR=4
.ENTRY
stw %r2,-20(0,%r30) ; save return pointer
stwm %r4,64(0,%r30) ; save r4 and allocate the 64-byte frame
copy %r24,%r31 ; r31 = remaining word count
stw %r3,-60(0,%r30) ; save r3
ldi 0,%r20 ; carry = 0
ldo 12(%r26),%r2 ; r2 = rp + 12 bytes
stw %r23,-16(0,%r30) ; spill w so it can be loaded into an FP register
copy %r25,%r3 ; r3 = ap
ldo 12(%r3),%r1 ; r1 = ap + 12 bytes
fldws -16(0,%r30),%fr8L ; fr8L = w
L$0010
; ---- word 0 of the group: ap at 0(%r3), rp at 0(%r26)
copy %r20,%r25 ; r25 = incoming carry
ldi 0,%r24
fldws 0(0,%r3),%fr9L ; fr9L = ap word
ldw 0(0,%r26),%r19 ; r19 = rp word
xmpyu %fr8L,%fr9L,%fr9 ; fr9 = w * ap word (64-bit product)
fstds %fr9,-16(0,%r30) ; move the product to integer registers via the stack
copy %r19,%r23
ldw -16(0,%r30),%r28 ; r28 = high word of the product
ldw -12(0,%r30),%r29 ; r29 = low word of the product
ldi 0,%r22
add %r23,%r29,%r29 ; low += rp word
addc %r22,%r28,%r28 ; high += carry out
add %r25,%r29,%r29 ; low += incoming carry
addc %r24,%r28,%r28 ; high += carry out
copy %r28,%r21
ldi 0,%r20
copy %r21,%r20 ; new carry = high word
addib,= -1,%r31,L$0011 ; count--; leave when it reaches zero
stw %r29,0(0,%r26) ; (delay slot) store the low word back to rp
; ---- word 1 of the group: ap at -8(%r1), rp at -8(%r2)
copy %r20,%r25
ldi 0,%r24
fldws -8(0,%r1),%fr9L
ldw -8(0,%r2),%r19
xmpyu %fr8L,%fr9L,%fr9
fstds %fr9,-16(0,%r30)
copy %r19,%r23
ldw -16(0,%r30),%r28
ldw -12(0,%r30),%r29
ldi 0,%r22
add %r23,%r29,%r29
addc %r22,%r28,%r28
add %r25,%r29,%r29
addc %r24,%r28,%r28
copy %r28,%r21
ldi 0,%r20
copy %r21,%r20
addib,= -1,%r31,L$0011
stw %r29,-8(0,%r2)
; ---- word 2 of the group: ap at -4(%r1), rp at -4(%r2)
copy %r20,%r25
ldi 0,%r24
fldws -4(0,%r1),%fr9L
ldw -4(0,%r2),%r19
xmpyu %fr8L,%fr9L,%fr9
fstds %fr9,-16(0,%r30)
copy %r19,%r23
ldw -16(0,%r30),%r28
ldw -12(0,%r30),%r29
ldi 0,%r22
add %r23,%r29,%r29
addc %r22,%r28,%r28
add %r25,%r29,%r29
addc %r24,%r28,%r28
copy %r28,%r21
ldi 0,%r20
copy %r21,%r20
addib,= -1,%r31,L$0011
stw %r29,-4(0,%r2)
; ---- word 3 of the group: ap at 0(%r1), rp at 0(%r2)
copy %r20,%r25
ldi 0,%r24
fldws 0(0,%r1),%fr9L
ldw 0(0,%r2),%r19
xmpyu %fr8L,%fr9L,%fr9
fstds %fr9,-16(0,%r30)
copy %r19,%r23
ldw -16(0,%r30),%r28
ldw -12(0,%r30),%r29
ldi 0,%r22
add %r23,%r29,%r29
addc %r22,%r28,%r28
add %r25,%r29,%r29
addc %r24,%r28,%r28
copy %r28,%r21
ldi 0,%r20
copy %r21,%r20
addib,= -1,%r31,L$0011
stw %r29,0(0,%r2)
; ---- advance all four pointers by four words and go round again
ldo 16(%r1),%r1
ldo 16(%r3),%r3
ldo 16(%r2),%r2
bl L$0010,0
ldo 16(%r26),%r26 ; (delay slot)
L$0011
copy %r20,%r28 ; return value = final carry
ldw -84(0,%r30),%r2 ; restore return pointer
ldw -60(0,%r30),%r3 ; restore r3
bv 0(%r2)
ldwm -64(0,%r30),%r4 ; (delay slot) restore r4 and release the frame
.EXIT
.PROCEND
;
; bn_mul_words (32-bit words, compiler-generated code)
;
; For each of the num words: rp[i] = low32(ap[i]*w + carry),
; carry = high32(...). The running carry is kept in %r28, which is
; also the return value register.
;
; %r26 = rp (ARGW0), %r25 = ap (ARGW1), %r24 = num (ARGW2), %r23 = w (ARGW3)
;
; The loop body is unrolled four times; the word count in %r19 is
; decremented and tested after every word.
; NOTE(review): num is only tested for reaching zero after a decrement,
; so there is no early exit for num <= 0 -- confirm callers never pass it.
;
.align 4
.EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR
bn_mul_words
.PROC
.CALLINFO FRAME=64,CALLS,SAVE_RP,ENTRY_GR=3
.ENTRY
stw %r2,-20(0,%r30) ; save return pointer (r2 is reused as a pointer below)
copy %r25,%r2 ; r2 = ap
stwm %r4,64(0,%r30) ; save r4 and allocate the 64-byte frame
copy %r24,%r19 ; r19 = remaining word count
ldi 0,%r28 ; carry = 0
stw %r23,-16(0,%r30) ; spill w so it can be loaded into an FP register
ldo 12(%r26),%r31 ; r31 = rp + 12 bytes
ldo 12(%r2),%r29 ; r29 = ap + 12 bytes
fldws -16(0,%r30),%fr8L ; fr8L = w
L$0026
; ---- word 0 of the group: ap at 0(%r2), rp at 0(%r26)
fldws 0(0,%r2),%fr9L ; fr9L = ap word
xmpyu %fr8L,%fr9L,%fr9 ; fr9 = w * ap word (64-bit product)
fstds %fr9,-16(0,%r30) ; move the product to integer registers via the stack
copy %r28,%r21 ; r21 = incoming carry
ldi 0,%r20
ldw -16(0,%r30),%r24 ; r24 = high word of the product
ldw -12(0,%r30),%r25 ; r25 = low word of the product
add %r21,%r25,%r25 ; low += incoming carry
addc %r20,%r24,%r24 ; high += carry out
copy %r24,%r23
ldi 0,%r22
copy %r23,%r28 ; new carry = high word
addib,= -1,%r19,L$0027 ; count--; leave when it reaches zero
stw %r25,0(0,%r26) ; (delay slot) store the low word to rp
; ---- word 1 of the group: ap at -8(%r29), rp at -8(%r31)
fldws -8(0,%r29),%fr9L
xmpyu %fr8L,%fr9L,%fr9
fstds %fr9,-16(0,%r30)
copy %r28,%r21
ldi 0,%r20
ldw -16(0,%r30),%r24
ldw -12(0,%r30),%r25
add %r21,%r25,%r25
addc %r20,%r24,%r24
copy %r24,%r23
ldi 0,%r22
copy %r23,%r28
addib,= -1,%r19,L$0027
stw %r25,-8(0,%r31)
; ---- word 2 of the group: ap at -4(%r29), rp at -4(%r31)
fldws -4(0,%r29),%fr9L
xmpyu %fr8L,%fr9L,%fr9
fstds %fr9,-16(0,%r30)
copy %r28,%r21
ldi 0,%r20
ldw -16(0,%r30),%r24
ldw -12(0,%r30),%r25
add %r21,%r25,%r25
addc %r20,%r24,%r24
copy %r24,%r23
ldi 0,%r22
copy %r23,%r28
addib,= -1,%r19,L$0027
stw %r25,-4(0,%r31)
; ---- word 3 of the group: ap at 0(%r29), rp at 0(%r31)
fldws 0(0,%r29),%fr9L
xmpyu %fr8L,%fr9L,%fr9
fstds %fr9,-16(0,%r30)
copy %r28,%r21
ldi 0,%r20
ldw -16(0,%r30),%r24
ldw -12(0,%r30),%r25
add %r21,%r25,%r25
addc %r20,%r24,%r24
copy %r24,%r23
ldi 0,%r22
copy %r23,%r28
addib,= -1,%r19,L$0027
stw %r25,0(0,%r31)
; ---- advance all four pointers by four words and go round again
ldo 16(%r29),%r29
ldo 16(%r2),%r2
ldo 16(%r31),%r31
bl L$0026,0
ldo 16(%r26),%r26 ; (delay slot)
L$0027
ldw -84(0,%r30),%r2 ; restore return pointer
bv 0(%r2)
ldwm -64(0,%r30),%r4 ; (delay slot) restore r4 and release the frame
.EXIT
.PROCEND
;
; bn_sqr_words (32-bit words, compiler-generated code)
;
; For each of the num input words, stores the 64-bit square as two
; output words: the low word first, then the high word.
;
; %r26 = output pointer (ARGW0), %r25 = input pointer (ARGW1),
; %r24 = num (ARGW2)
;
; The loop body is unrolled four times; the word count in %r24 is
; decremented and tested after every input word. No frame is
; allocated; the product is passed through -16(%r30).
; NOTE(review): num is only tested for reaching zero after a decrement,
; so there is no early exit for num <= 0 -- confirm callers never pass it.
;
.align 4
.EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR
bn_sqr_words
.PROC
.CALLINFO FRAME=0,NO_CALLS
.ENTRY
ldo 28(%r26),%r19 ; r19 = output pointer + 28 bytes
ldo 12(%r25),%r28 ; r28 = input pointer + 12 bytes
L$0042
; ---- input word 0 of the group -> output words 0 and 1
fldws 0(0,%r25),%fr8L
fldws 0(0,%r25),%fr8R ; both halves of fr8 hold the same input word
xmpyu %fr8L,%fr8R,%fr8 ; fr8 = word * word (64-bit product)
fstds %fr8,-16(0,%r30)
ldw -16(0,%r30),%r22 ; r22 = high word of the square
ldw -12(0,%r30),%r23 ; r23 = low word of the square
stw %r23,0(0,%r26) ; store the low word
copy %r22,%r21
ldi 0,%r20
addib,= -1,%r24,L$0049 ; count--; leave when it reaches zero
stw %r21,-24(0,%r19) ; (delay slot) store the high word
; ---- input word 1 of the group -> output words 2 and 3
fldws -8(0,%r28),%fr8L
fldws -8(0,%r28),%fr8R
xmpyu %fr8L,%fr8R,%fr8
fstds %fr8,-16(0,%r30)
ldw -16(0,%r30),%r22
ldw -12(0,%r30),%r23
stw %r23,-20(0,%r19)
copy %r22,%r21
ldi 0,%r20
addib,= -1,%r24,L$0049
stw %r21,-16(0,%r19)
; ---- input word 2 of the group -> output words 4 and 5
fldws -4(0,%r28),%fr8L
fldws -4(0,%r28),%fr8R
xmpyu %fr8L,%fr8R,%fr8
fstds %fr8,-16(0,%r30)
ldw -16(0,%r30),%r22
ldw -12(0,%r30),%r23
stw %r23,-12(0,%r19)
copy %r22,%r21
ldi 0,%r20
addib,= -1,%r24,L$0049
stw %r21,-8(0,%r19)
; ---- input word 3 of the group -> output words 6 and 7
fldws 0(0,%r28),%fr8L
fldws 0(0,%r28),%fr8R
xmpyu %fr8L,%fr8R,%fr8
fstds %fr8,-16(0,%r30)
ldw -16(0,%r30),%r22
ldw -12(0,%r30),%r23
stw %r23,-4(0,%r19)
copy %r22,%r21
ldi 0,%r20
addib,= -1,%r24,L$0049
stw %r21,0(0,%r19)
; ---- advance: input pointers by 4 words, output pointers by 8 words
ldo 16(%r28),%r28
ldo 16(%r25),%r25
ldo 32(%r19),%r19
bl L$0042,0
ldo 32(%r26),%r26 ; (delay slot)
L$0049
bv,n 0(%r2) ; return (delay slot nullified)
.EXIT
.PROCEND
.IMPORT BN_num_bits_word,CODE
.IMPORT fprintf,CODE
.IMPORT __iob,DATA
.SPACE $TEXT$
.SUBSPA $LIT$
; Format string literal (explicit newline and NUL bytes); bn_div64 loads
; its address into %r25 for the fprintf call on its overflow path.
.align 4
L$C0000
.STRING "Division would overflow (%d)\x0a\x00"
.IMPORT abort,CODE
.SPACE $TEXT$
.SUBSPA $CODE$
;
; bn_div64 (32-bit words, compiler-generated code)
;
; Takes three word arguments and returns a word in %r28:
; %r26 (ARGW0, kept in %r3) and %r25 (ARGW1, kept in %r4) are the two
; words being divided, %r24 (ARGW2, kept in %r5) is the divisor.
; Presumably (%r26:%r25) is a 64-bit dividend with %r26 as the upper
; word -- confirm against the C prototype.
;
; Returns -1 when the divisor is zero. Calls BN_num_bits_word on the
; divisor and, when its size check fails, prints the L$C0000 message
; with fprintf and calls abort. The quotient is built in two passes of
; the L$0055 loop (counter %r6 starts at 2), each producing a 16-bit
; digit in %r29.
;
.align 4
.EXPORT bn_div64,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR
bn_div64
.PROC
.CALLINFO FRAME=128,CALLS,SAVE_RP,ENTRY_GR=8
.ENTRY
; ---- prologue: save rp and r3-r8, allocate the 128-byte frame
stw %r2,-20(0,%r30)
stwm %r8,128(0,%r30)
stw %r7,-124(0,%r30)
stw %r4,-112(0,%r30)
stw %r3,-108(0,%r30)
copy %r26,%r3 ; r3 = first argument
copy %r25,%r4 ; r4 = second argument
stw %r6,-120(0,%r30)
ldi 0,%r7 ; r7 = 0, accumulates the first quotient digit
stw %r5,-116(0,%r30)
movb,<> %r24,%r5,L$0051 ; r5 = divisor; continue at L$0051 if it is non-zero
ldi 2,%r6 ; (delay slot) r6 = 2, pass counter
bl L$0068,0 ; divisor == 0: go straight to the epilogue ...
ldi -1,%r28 ; (delay slot) ... returning -1
L$0051
; ---- r24 = BN_num_bits_word(divisor), then the overflow check
.CALL ARGW0=GR
bl BN_num_bits_word,%r2
copy %r5,%r26 ; (delay slot) argument = divisor
copy %r28,%r24
ldi 32,%r19
comb,= %r19,%r24,L$0052 ; bit count == 32: skip the check
subi 31,%r24,%r19
mtsar %r19
zvdepi 1,32,%r19 ; r19 = 1 shifted left by the bit count
comb,>>= %r19,%r3,L$0052 ; (unsigned) r19 >= r3: check passes
addil LR'__iob-$global$+32,%r27
ldo RR'__iob-$global$+32(%r1),%r26 ; first fprintf argument: __iob + 32 (presumably stderr -- confirm)
ldil LR'L$C0000,%r25
.CALL ARGW0=GR,ARGW1=GR,ARGW2=GR
bl fprintf,%r2
ldo RR'L$C0000(%r25),%r25 ; (delay slot) second fprintf argument: format string
.CALL
bl abort,%r2
nop
L$0052
comb,>> %r5,%r3,L$0053 ; (unsigned) divisor > r3: skip the subtraction
subi 32,%r24,%r24 ; (delay slot) r24 = 32 - bit count = shift amount
sub %r3,%r5,%r3 ; r3 -= divisor
L$0053
; ---- shift divisor and both dividend words left by r24 bits (skipped when r24 == 0)
comib,= 0,%r24,L$0054
subi 31,%r24,%r19
mtsar %r19
zvdep %r5,32,%r5
zvdep %r3,32,%r21
subi 32,%r24,%r20
mtsar %r20
vshd 0,%r4,%r20
or %r21,%r20,%r3
mtsar %r19
zvdep %r4,32,%r4
L$0054
extru %r5,15,16,%r23 ; r23 = upper 16 bits of the divisor
extru %r5,31,16,%r28 ; r28 = lower 16 bits of the divisor
L$0055
; ---- one pass: estimate a 16-bit quotient digit in r29
extru %r3,15,16,%r19 ; r19 = upper 16 bits of r3
comb,<> %r23,%r19,L$0058
copy %r3,%r26 ; (delay slot) dividend argument for $$divU
bl L$0059,0 ; upper halves equal: use the maximum digit
zdepi -1,31,16,%r29 ; (delay slot) r29 = 0xffff
L$0058
.IMPORT $$divU,MILLICODE
bl $$divU,%r31 ; millicode divide of r26 by r25; result used from r29
copy %r23,%r25 ; (delay slot) divisor argument = r23
L$0059
; r22 = r29 * r28, r21 = r29 * r23 (low 32 bits of each product)
stw %r29,-16(0,%r30)
fldws -16(0,%r30),%fr10L
stw %r28,-16(0,%r30)
fldws -16(0,%r30),%fr10R
stw %r23,-16(0,%r30)
xmpyu %fr10L,%fr10R,%fr8
fldws -16(0,%r30),%fr10R
fstws %fr8R,-16(0,%r30)
xmpyu %fr10L,%fr10R,%fr9
ldw -16(0,%r30),%r8
fstws %fr9R,-16(0,%r30)
copy %r8,%r22
ldw -16(0,%r30),%r8
extru %r4,15,16,%r24 ; r24 = upper 16 bits of r4
copy %r8,%r21
L$0060
; ---- correction loop: decrement the digit until the estimate fits
sub %r3,%r21,%r20
copy %r20,%r19
depi 0,31,16,%r19
comib,<> 0,%r19,L$0061
zdep %r20,15,16,%r19
addl %r19,%r24,%r19
comb,>>= %r19,%r22,L$0061
sub %r22,%r28,%r22
sub %r21,%r23,%r21
bl L$0060,0
ldo -1(%r29),%r29 ; (delay slot) digit--
L$0061
; ---- recompute digit * divisor halves and subtract from r3:r4
stw %r29,-16(0,%r30)
fldws -16(0,%r30),%fr10L
stw %r28,-16(0,%r30)
fldws -16(0,%r30),%fr10R
xmpyu %fr10L,%fr10R,%fr8
fstws %fr8R,-16(0,%r30)
ldw -16(0,%r30),%r8
stw %r23,-16(0,%r30)
fldws -16(0,%r30),%fr10R
copy %r8,%r19
xmpyu %fr10L,%fr10R,%fr8
fstws %fr8R,-16(0,%r30)
extru %r19,15,16,%r20
ldw -16(0,%r30),%r8
zdep %r19,15,16,%r19
addl %r8,%r20,%r20
comclr,<<= %r19,%r4,0
addi 1,%r20,%r20
comb,<<= %r20,%r3,L$0066
sub %r4,%r19,%r4
addl %r3,%r5,%r3
ldo -1(%r29),%r29
L$0066
addib,= -1,%r6,L$0056 ; pass counter--; finish after the second pass
sub %r3,%r20,%r3 ; (delay slot)
zdep %r29,15,16,%r7 ; r7 = first digit moved to the upper 16 bits
shd %r3,%r4,16,%r3
bl L$0055,0
zdep %r4,15,16,%r4 ; (delay slot)
L$0056
or %r7,%r29,%r28 ; return value = first digit (upper half) | second digit
L$0068
; ---- epilogue: restore rp and r3-r8, release the frame
ldw -148(0,%r30),%r2
ldw -124(0,%r30),%r7
ldw -120(0,%r30),%r6
ldw -116(0,%r30),%r5
ldw -112(0,%r30),%r4
ldw -108(0,%r30),%r3
bv 0(%r2)
ldwm -128(0,%r30),%r8
.EXIT
.PROCEND
;
; PA-RISC 64-bit implementation of bn_asm code
;
; This code is approximately 2x faster than the C version
; for RSA/DSA.
;
; See http://devresource.hp.com/ for more details on the PA-RISC
; architecture. Also see the book "PA-RISC 2.0 Architecture"
; by Gerry Kane for information on the instruction set architecture.
;
; Code written by Chris Ruemmler (with some help from the HP C
; compiler).
;
; The code compiles with HP's assembler
;
.level 2.0W
.space $TEXT$
.subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY
;
; Global Register definitions used for the routines.
;
; Some information about HP's runtime architecture for 64-bits.
;
; "Caller save" means the calling function must save the register
; if it wants the register to be preserved.
; "Callee save" means if a function uses the register, it must save
; the value before using it.
;
; For the floating point registers
;
; "caller save" registers: fr4-fr11, fr22-fr31
; "callee save" registers: fr12-fr21
; "special" registers: fr0-fr3 (status and exception registers)
;
; For the integer registers
; value zero : r0
; "caller save" registers: r1,r19-r26
; "callee save" registers: r3-r18
; return register : r2 (rp)
; return values ; r28 (ret0,ret1)
; Stack pointer ; r30 (sp)
; global data pointer ; r27 (dp)
; argument pointer ; r29 (ap)
; millicode return ptr ; r31 (also a caller save register)
;
; Arguments to the routines
;
; Some aliases share a register because different routines take
; different arguments in the same position: b_ptr and num are both
; %r24, w and n are both %r23.
;
r_ptr .reg %r26
a_ptr .reg %r25
b_ptr .reg %r24
num .reg %r24
w .reg %r23
n .reg %r23
;
; Globals used in some routines
;
top_overflow .reg %r29 ; set to 1 << 32 by the word routines
high_mask .reg %r22 ; value 0xffffffff80000000L
;------------------------------------------------------------------------------
;
; bn_mul_add_words
;
;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr,
; int num, BN_ULONG w)
;
; arg0 = r_ptr
; arg1 = a_ptr
; arg2 = num
; arg3 = w
;
; Local register definitions
;
; Floating point registers that receive the four 32x32 partial
; products for each of the two words handled per unrolled iteration
; (names ending in _1 belong to the second word).
fm1 .reg %fr22 ; ht * fw_l
fm .reg %fr23 ; lt * fw_h
ht_temp .reg %fr24 ; ht * fw_h
ht_temp_1 .reg %fr25
lt_temp .reg %fr26 ; lt * fw_l
lt_temp_1 .reg %fr27
fm1_1 .reg %fr28
fm_1 .reg %fr29
; The multiplier w, as upper (fw_h) and lower (fw_l) 32-bit halves.
fw_h .reg %fr7L
fw_l .reg %fr7R
fw .reg %fr7
; The two source words loaded from a_ptr, again as upper/lower halves.
fht_0 .reg %fr8L
flt_0 .reg %fr8R
t_float_0 .reg %fr8
fht_1 .reg %fr9L
flt_1 .reg %fr9R
t_float_1 .reg %fr9
; Integer working registers. Those in %r3-%r9 are callee-save and are
; saved on entry by the routines that use them.
tmp_0 .reg %r31
tmp_1 .reg %r21
m_0 .reg %r20
m_1 .reg %r19
ht_0 .reg %r1
ht_1 .reg %r3
lt_0 .reg %r4
lt_1 .reg %r5
m1_0 .reg %r6
m1_1 .reg %r7
rp_val .reg %r8 ; word loaded from 0(r_ptr)
rp_val_1 .reg %r9 ; word loaded from 8(r_ptr)
;
; For each of the num 64-bit words: rp[i] = low64(ap[i]*w + rp[i] + c),
; c = high64(...). The running carry c is kept in %ret0 and is the
; return value; 0 is returned when num <= 0.
;
; Each 64x64 product is assembled from four 32x32 XMPYU products that
; are passed from the FP unit to the integer unit through the stack
; slots -64(%sp) .. -8(%sp). %r3-%r9 are saved at 0..48 of the frame
; and w at 56, all before the stack pointer is bumped by 128.
;
bn_mul_add_words
.export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN
.proc
.callinfo frame=128
.entry
.align 64
STD %r3,0(%sp) ; save r3
STD %r4,8(%sp) ; save r4
NOP ; Needed to make the loop 16-byte aligned
NOP ; Needed to make the loop 16-byte aligned
STD %r5,16(%sp) ; save r5
STD %r6,24(%sp) ; save r6
STD %r7,32(%sp) ; save r7
STD %r8,40(%sp) ; save r8
STD %r9,48(%sp) ; save r9
COPY %r0,%ret0 ; return 0 by default
DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
STD w,56(%sp) ; store w on stack
CMPIB,>= 0,num,bn_mul_add_words_exit ; if (num <= 0) then exit
LDO 128(%sp),%sp ; (delay slot) bump stack
;
; The loop is unrolled twice, so if there is only 1 number
; then go straight to the cleanup code.
;
CMPIB,= 1,num,bn_mul_add_words_single_top
FLDD -72(%sp),fw ; (delay slot) load up w into fp register fw (fw_h/fw_l)
;
; This loop is unrolled 2 times (64-byte aligned as well)
;
; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
; two 32-bit multiplies can be issued per cycle.
;
bn_mul_add_words_unroll2
FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8) ht(L)/lt(R)
FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr9) ht(L)/lt(R)
LDD 0(r_ptr),rp_val ; rp[0]
LDD 8(r_ptr),rp_val_1 ; rp[1]
XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l
FSTD fm1,-16(%sp) ; -16(sp) = m1[0]
FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1]
XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h
XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h
FSTD fm,-8(%sp) ; -8(sp) = m[0]
FSTD fm_1,-40(%sp) ; -40(sp) = m[1]
XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h
FSTD ht_temp,-24(%sp) ; -24(sp) = ht_temp
FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht_temp_1
XMPYU flt_0,fw_l,lt_temp ; lt_temp = flt_0*fw_l
XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp_1 = flt_1*fw_l
FSTD lt_temp,-32(%sp) ; -32(sp) = lt_temp
FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt_temp_1
LDD -8(%sp),m_0 ; m[0]
LDD -40(%sp),m_1 ; m[1]
LDD -16(%sp),m1_0 ; m1[0]
LDD -48(%sp),m1_1 ; m1[1]
LDD -24(%sp),ht_0 ; ht[0]
LDD -56(%sp),ht_1 ; ht[1]
ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0];
ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1];
LDD -32(%sp),lt_0 ; lt[0]
LDD -64(%sp),lt_1 ; lt[1]
CMPCLR,*>>= tmp_0,m1_0, %r0 ; skip next insn unless the sum wrapped (tmp_0 < m1[0])
ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32)
CMPCLR,*>>= tmp_1,m1_1,%r0 ; skip next insn unless the sum wrapped (tmp_1 < m1[1])
ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32)
EXTRD,U tmp_0,31,32,m_0 ; m[0] = tmp_0>>32
DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = tmp_0<<32
EXTRD,U tmp_1,31,32,m_1 ; m[1] = tmp_1>>32
DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = tmp_1<<32
ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32)
ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32)
ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0];
ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry
ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1];
ADD,DC ht_1,%r0,ht_1 ; ht[1] += carry
ADD %ret0,lt_0,lt_0 ; lt[0] = lt[0] + c;
ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry
ADD lt_0,rp_val,lt_0 ; lt[0] = lt[0]+rp[0]
ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry
LDO -2(num),num ; num = num - 2;
ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + ht_0 (c);
ADD,DC ht_1,%r0,ht_1 ; ht[1] += carry
STD lt_0,0(r_ptr) ; rp[0] = lt[0]
ADD lt_1,rp_val_1,lt_1 ; lt[1] = lt[1]+rp[1]
ADD,DC ht_1,%r0,%ret0 ; c = ht[1] + carry
LDO 16(a_ptr),a_ptr ; a_ptr += 2
STD lt_1,8(r_ptr) ; rp[1] = lt[1]
CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do
LDO 16(r_ptr),r_ptr ; (delay slot) r_ptr += 2
CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one
;
; Single-word tail; also entered directly when num == 1
;
bn_mul_add_words_single_top
FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8) ht(L)/lt(R)
LDD 0(r_ptr),rp_val ; rp[0]
LDO 8(a_ptr),a_ptr ; a_ptr++
XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
FSTD fm1,-16(%sp) ; -16(sp) = m1
XMPYU flt_0,fw_h,fm ; m = lt*fw_h
FSTD fm,-8(%sp) ; -8(sp) = m
XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
FSTD ht_temp,-24(%sp) ; -24(sp) = ht
XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
FSTD lt_temp,-32(%sp) ; -32(sp) = lt
LDD -8(%sp),m_0 ; m
LDD -16(%sp),m1_0 ; m1
ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
LDD -24(%sp),ht_0 ; ht
LDD -32(%sp),lt_0 ; lt
CMPCLR,*>>= tmp_0,m1_0,%r0 ; skip next insn unless the sum wrapped (tmp_0 < m1)
ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
EXTRD,U tmp_0,31,32,m_0 ; m = tmp_0>>32
DEPD,Z tmp_0,31,32,m1_0 ; m1 = tmp_0<<32
ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
ADD lt_0,m1_0,tmp_0 ; tmp_0 = lt+m1;
ADD,DC ht_0,%r0,ht_0 ; ht += carry
ADD %ret0,tmp_0,lt_0 ; lt = tmp_0 + c;
ADD,DC ht_0,%r0,ht_0 ; ht += carry
ADD lt_0,rp_val,lt_0 ; lt = lt+rp[0]
ADD,DC ht_0,%r0,%ret0 ; c = ht + carry
STD lt_0,0(r_ptr) ; rp[0] = lt
bn_mul_add_words_exit
.EXIT
LDD -80(%sp),%r9 ; restore r9
LDD -88(%sp),%r8 ; restore r8
LDD -96(%sp),%r7 ; restore r7
LDD -104(%sp),%r6 ; restore r6
LDD -112(%sp),%r5 ; restore r5
LDD -120(%sp),%r4 ; restore r4
BVE (%rp)
LDD,MB -128(%sp),%r3 ; restore r3
.PROCEND ;in=23,24,25,26,29;out=28;
;----------------------------------------------------------------------------
;
;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
;
; arg0 = rp
; arg1 = ap
; arg2 = num
; arg3 = w
bn_mul_words
.proc
.callinfo frame=128
.entry
.EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
.align 64
STD %r3,0(%sp) ; save r3
STD %r4,8(%sp) ; save r4
STD %r5,16(%sp) ; save r5
STD %r6,24(%sp) ; save r6
STD %r7,32(%sp) ; save r7
COPY %r0,%ret0 ; return 0 by default
DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
STD w,56(%sp) ; w on stack
CMPIB,>= 0,num,bn_mul_words_exit
LDO 128(%sp),%sp ; bump stack
;
; See if only 1 word to do, thus just do cleanup
;
CMPIB,= 1,num,bn_mul_words_single_top
FLDD -72(%sp),fw ; load up w into fp register fw (fw_h/fw_l)
;
; This loop is unrolled 2 times (64-byte aligned as well)
;
; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
; two 32-bit multiplies can be issued per cycle.
;
bn_mul_words_unroll2
FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R)
XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
XMPYU fht_1,fw_l,fm1_1 ; m1[1] = ht*fw_l
FSTD fm1,-16(%sp) ; -16(sp) = m1
FSTD fm1_1,-48(%sp) ; -48(sp) = m1
XMPYU flt_0,fw_h,fm ; m = lt*fw_h
XMPYU flt_1,fw_h,fm_1 ; m = lt*fw_h
FSTD fm,-8(%sp) ; -8(sp) = m
FSTD fm_1,-40(%sp) ; -40(sp) = m
XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp = ht*fw_h
FSTD ht_temp,-24(%sp) ; -24(sp) = ht
FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht
XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l
FSTD lt_temp,-32(%sp) ; -32(sp) = lt
FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt
LDD -8(%sp),m_0
LDD -40(%sp),m_1
LDD -16(%sp),m1_0
LDD -48(%sp),m1_1
LDD -24(%sp),ht_0
LDD -56(%sp),ht_1
ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m + m1;
ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m + m1;
LDD -32(%sp),lt_0
LDD -64(%sp),lt_1
CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m < m1)
ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m < m1)
ADD,L ht_1,top_overflow,ht_1 ; ht += (1<<32)
EXTRD,U tmp_0,31,32,m_0 ; m>>32
DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
EXTRD,U tmp_1,31,32,m_1 ; m>>32
DEPD,Z tmp_1,31,32,m1_1 ; m1 = m<<32
ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
ADD,L ht_1,m_1,ht_1 ; ht+= (m>>32)
ADD lt_0,m1_0,lt_0 ; lt = lt+m1;
ADD,DC ht_0,%r0,ht_0 ; ht++
ADD lt_1,m1_1,lt_1 ; lt = lt+m1;
ADD,DC ht_1,%r0,ht_1 ; ht++
ADD %ret0,lt_0,lt_0 ; lt = lt + c (ret0);
ADD,DC ht_0,%r0,ht_0 ; ht++
ADD ht_0,lt_1,lt_1 ; lt = lt + c (ht_0)
ADD,DC ht_1,%r0,ht_1 ; ht++
STD lt_0,0(r_ptr) ; rp[0] = lt
STD lt_1,8(r_ptr) ; rp[1] = lt
COPY ht_1,%ret0 ; carry = ht
LDO -2(num),num ; num = num - 2;
LDO 16(a_ptr),a_ptr ; ap += 2
CMPIB,<= 2,num,bn_mul_words_unroll2
LDO 16(r_ptr),r_ptr ; rp++
CMPIB,=,N 0,num,bn_mul_words_exit ; are we done?
;
; Top of loop aligned on 64-byte boundary
;
bn_mul_words_single_top
FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
FSTD fm1,-16(%sp) ; -16(sp) = m1
XMPYU flt_0,fw_h,fm ; m = lt*fw_h
FSTD fm,-8(%sp) ; -8(sp) = m
XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
FSTD ht_temp,-24(%sp) ; -24(sp) = ht
XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
FSTD lt_temp,-32(%sp) ; -32(sp) = lt
LDD -8(%sp),m_0
LDD -16(%sp),m1_0
ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
LDD -24(%sp),ht_0
LDD -32(%sp),lt_0
CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1)
ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
EXTRD,U tmp_0,31,32,m_0 ; m>>32
DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
ADD lt_0,m1_0,lt_0 ; lt= lt+m1;
ADD,DC ht_0,%r0,ht_0 ; ht++
ADD %ret0,lt_0,lt_0 ; lt = lt + c;
ADD,DC ht_0,%r0,ht_0 ; ht++
COPY ht_0,%ret0 ; copy carry
STD lt_0,0(r_ptr) ; rp[0] = lt
bn_mul_words_exit
.EXIT
LDD -96(%sp),%r7 ; restore r7
LDD -104(%sp),%r6 ; restore r6
LDD -112(%sp),%r5 ; restore r5
LDD -120(%sp),%r4 ; restore r4
BVE (%rp)
LDD,MB -128(%sp),%r3 ; restore r3
.PROCEND ;in=23,24,25,26,29;out=28;
;----------------------------------------------------------------------------
;
;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
;
; For each of the num input words, stores the 128-bit square of ap[i] as
; two words: rp[2*i] = low 64 bits, rp[2*i+1] = high 64 bits.
;
; arg0 = rp
; arg1 = ap
; arg2 = num
;
; With a = ht:lt (32-bit halves) and m = ht*lt:
;   a^2 = ht^2 * 2^64 + 2*m * 2^32 + lt^2
; so the low word gets lt^2 + (m << 33) and the high word gets
; ht^2 + (m >> 31) plus the carry out of the low word.
;
; Stack use (offsets relative to %sp after the 128-byte bump):
;   -128..-112(%sp)  saved %r3..%r5
;   -56..-8(%sp)     scratch slots for the partial products
;
; NOTE(review): r_ptr, a_ptr, num, high_mask, m_0, lt_0, ... are .reg
; aliases defined outside this part of the file.
;
bn_sqr_words
.proc
.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
.EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
.entry
.align 64
STD %r3,0(%sp) ; save r3
STD %r4,8(%sp) ; save r4
NOP ; NOTE(review): presumably scheduling/alignment padding
STD %r5,16(%sp) ; save r5
CMPIB,>= 0,num,bn_sqr_words_exit ; nothing to do if num <= 0
LDO 128(%sp),%sp ; bump stack (delay slot; the exit path relies on it)
;
; If only 1, then go straight to cleanup
;
CMPIB,= 1,num,bn_sqr_words_single_top
DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L (delay slot)
;
; This loop is unrolled 2 times (64-byte aligned as well)
;
bn_sqr_words_unroll2
FLDD 0(a_ptr),t_float_0 ; a[0]
FLDD 8(a_ptr),t_float_1 ; a[1]
XMPYU fht_0,flt_0,fm ; m[0]
XMPYU fht_1,flt_1,fm_1 ; m[1]
FSTD fm,-24(%sp) ; store m[0]
FSTD fm_1,-56(%sp) ; store m[1]
XMPYU flt_0,flt_0,lt_temp ; lt[0]
XMPYU flt_1,flt_1,lt_temp_1 ; lt[1]
FSTD lt_temp,-16(%sp) ; store lt[0]
FSTD lt_temp_1,-48(%sp) ; store lt[1]
XMPYU fht_0,fht_0,ht_temp ; ht[0]
XMPYU fht_1,fht_1,ht_temp_1 ; ht[1]
FSTD ht_temp,-8(%sp) ; store ht[0]
FSTD ht_temp_1,-40(%sp) ; store ht[1]
LDD -24(%sp),m_0
LDD -56(%sp),m_1
AND m_0,high_mask,tmp_0 ; m[0] & Mask
AND m_1,high_mask,tmp_1 ; m[1] & Mask
DEPD,Z m_0,30,31,m_0 ; m[0] << 32+1
DEPD,Z m_1,30,31,m_1 ; m[1] << 32+1
LDD -16(%sp),lt_0
LDD -48(%sp),lt_1
EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m[0]&Mask >> 32-1
EXTRD,U tmp_1,32,33,tmp_1 ; tmp_1 = m[1]&Mask >> 32-1
LDD -8(%sp),ht_0
LDD -40(%sp),ht_1
ADD,L ht_0,tmp_0,ht_0 ; ht[0] += tmp_0
ADD,L ht_1,tmp_1,ht_1 ; ht[1] += tmp_1
ADD lt_0,m_0,lt_0 ; lt = lt+m
ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry
STD lt_0,0(r_ptr) ; rp[0] = lt[0]
STD ht_0,8(r_ptr) ; rp[1] = ht[0]
ADD lt_1,m_1,lt_1 ; lt = lt+m
ADD,DC ht_1,%r0,ht_1 ; ht[1] += carry
STD lt_1,16(r_ptr) ; rp[2] = lt[1]
STD ht_1,24(r_ptr) ; rp[3] = ht[1]
LDO -2(num),num ; num = num - 2;
LDO 16(a_ptr),a_ptr ; ap += 2
CMPIB,<= 2,num,bn_sqr_words_unroll2 ; loop while at least 2 words remain
LDO 32(r_ptr),r_ptr ; rp += 4 (delay slot)
CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done?
;
; Single-word tail: handles the last (or only) word
;
bn_sqr_words_single_top
FLDD 0(a_ptr),t_float_0 ; load a[0] as 64-bit value: ht(L)/lt(R)
XMPYU fht_0,flt_0,fm ; m
FSTD fm,-24(%sp) ; store m
XMPYU flt_0,flt_0,lt_temp ; lt
FSTD lt_temp,-16(%sp) ; store lt
XMPYU fht_0,fht_0,ht_temp ; ht
FSTD ht_temp,-8(%sp) ; store ht
LDD -24(%sp),m_0 ; load m
AND m_0,high_mask,tmp_0 ; m & Mask
DEPD,Z m_0,30,31,m_0 ; m << 32+1
LDD -16(%sp),lt_0 ; lt
LDD -8(%sp),ht_0 ; ht
EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m&Mask >> 32-1
ADD m_0,lt_0,lt_0 ; lt = lt+m (sets carry)
ADD,L ht_0,tmp_0,ht_0 ; ht += tmp_0 (logical add, placed between the ADD and its ADD,DC)
ADD,DC ht_0,%r0,ht_0 ; ht += carry from lt
STD lt_0,0(r_ptr) ; rp[0] = lt
STD ht_0,8(r_ptr) ; rp[1] = ht
bn_sqr_words_exit
.EXIT
LDD -112(%sp),%r5 ; restore r5
LDD -120(%sp),%r4 ; restore r4
BVE (%rp) ; return
LDD,MB -128(%sp),%r3 ; restore r3 and pop the 128-byte frame (delay slot)
.PROCEND ;in=23,24,25,26,29;out=28;
;----------------------------------------------------------------------------
;
;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
;
; Word-wise addition with carry propagation: r[i] = a[i] + b[i] + carry.
; The carry out of the last word (0 or 1) is returned in %ret0; returns 0
; when n <= 0. No stack frame is used and only %r20-%r22 are written
; besides the arguments and %ret0.
;
; arg0 = rp
; arg1 = ap
; arg2 = bp
; arg3 = n
;
; NOTE(review): r_ptr, a_ptr, b_ptr and n are .reg aliases defined outside
; this part of the file.
t .reg %r22
b .reg %r21
l .reg %r20
bn_add_words
.proc
.entry
.callinfo
.EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
.align 64
CMPIB,>= 0,n,bn_add_words_exit ; nothing to do if n <= 0
COPY %r0,%ret0 ; return 0 by default (delay slot)
;
; If 2 or more numbers do the loop
;
CMPIB,= 1,n,bn_add_words_single_top
NOP
;
; This loop is unrolled 2 times (64-byte aligned as well)
;
bn_add_words_unroll2
LDD 0(a_ptr),t
LDD 0(b_ptr),b
ADD t,%ret0,t ; t = a[0]+c;
ADD,DC %r0,%r0,%ret0 ; set c to carry
ADD t,b,l ; l = t + b[0]
ADD,DC %ret0,%r0,%ret0 ; c+= carry
STD l,0(r_ptr)
LDD 8(a_ptr),t
LDD 8(b_ptr),b
ADD t,%ret0,t ; t = a[1]+c;
ADD,DC %r0,%r0,%ret0 ; set c to carry
ADD t,b,l ; l = t + b[1]
ADD,DC %ret0,%r0,%ret0 ; c+= carry
STD l,8(r_ptr)
LDO -2(n),n ; n -= 2
LDO 16(a_ptr),a_ptr ; ap += 2
LDO 16(b_ptr),b_ptr ; bp += 2
CMPIB,<= 2,n,bn_add_words_unroll2 ; loop while at least 2 words remain
LDO 16(r_ptr),r_ptr ; rp += 2 (delay slot)
CMPIB,=,N 0,n,bn_add_words_exit ; are we done?
; Single-word tail: handles the last (or only) word
bn_add_words_single_top
LDD 0(a_ptr),t
LDD 0(b_ptr),b
ADD t,%ret0,t ; t = t+c;
ADD,DC %r0,%r0,%ret0 ; set c to carry (could use CMPCLR??)
ADD t,b,l ; l = t + b[0]
ADD,DC %ret0,%r0,%ret0 ; c+= carry
STD l,0(r_ptr)
bn_add_words_exit
.EXIT
BVE (%rp) ; return, carry in %ret0
NOP
.PROCEND ;in=23,24,25,26,29;out=28;
;----------------------------------------------------------------------------
;
;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
;
; Word-wise subtraction with borrow propagation: r[i] = a[i] - b[i] - c.
; The borrow out of the last word is returned in %ret0; returns 0 when
; n <= 0. No stack frame is used.
;
; The borrow is derived from compares rather than from the carry bit:
;   sub_tmp2 = (t1 > t2) ? 0 : 1      (unsigned compare)
;   if (t1 != t2) c = sub_tmp2        (when t1 == t2 the old c is kept)
;
; arg0 = rp
; arg1 = ap
; arg2 = bp
; arg3 = n
;
; NOTE(review): r_ptr, a_ptr, b_ptr and n are .reg aliases defined outside
; this part of the file.
t1 .reg %r22
t2 .reg %r21
sub_tmp1 .reg %r20
sub_tmp2 .reg %r19
bn_sub_words
.proc
.callinfo
.EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
.entry
.align 64
CMPIB,>= 0,n,bn_sub_words_exit ; nothing to do if n <= 0
COPY %r0,%ret0 ; return 0 by default (delay slot)
;
; If 2 or more numbers do the loop
;
CMPIB,= 1,n,bn_sub_words_single_top
NOP
;
; This loop is unrolled 2 times (64-byte aligned as well)
;
bn_sub_words_unroll2
LDD 0(a_ptr),t1
LDD 0(b_ptr),t2
SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c;
CMPCLR,*>> t1,t2,sub_tmp2 ; sub_tmp2 = 0; skip next if t1 > t2
LDO 1(%r0),sub_tmp2 ; sub_tmp2 = 1 (t1 <= t2)
CMPCLR,*= t1,t2,%r0 ; if t1 == t2 keep the incoming borrow
COPY sub_tmp2,%ret0 ; else c = sub_tmp2
STD sub_tmp1,0(r_ptr) ; rp[0] = t3
LDD 8(a_ptr),t1
LDD 8(b_ptr),t2
SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c;
CMPCLR,*>> t1,t2,sub_tmp2 ; sub_tmp2 = 0; skip next if t1 > t2
LDO 1(%r0),sub_tmp2 ; sub_tmp2 = 1 (t1 <= t2)
CMPCLR,*= t1,t2,%r0 ; if t1 == t2 keep the incoming borrow
COPY sub_tmp2,%ret0 ; else c = sub_tmp2
STD sub_tmp1,8(r_ptr) ; rp[1] = t3
LDO -2(n),n ; n -= 2
LDO 16(a_ptr),a_ptr ; ap += 2
LDO 16(b_ptr),b_ptr ; bp += 2
CMPIB,<= 2,n,bn_sub_words_unroll2 ; loop while at least 2 words remain
LDO 16(r_ptr),r_ptr ; rp += 2 (delay slot)
CMPIB,=,N 0,n,bn_sub_words_exit ; are we done?
; Single-word tail: handles the last (or only) word
bn_sub_words_single_top
LDD 0(a_ptr),t1
LDD 0(b_ptr),t2
SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c;
CMPCLR,*>> t1,t2,sub_tmp2 ; sub_tmp2 = 0; skip next if t1 > t2
LDO 1(%r0),sub_tmp2 ; sub_tmp2 = 1 (t1 <= t2)
CMPCLR,*= t1,t2,%r0 ; if t1 == t2 keep the incoming borrow
COPY sub_tmp2,%ret0 ; else c = sub_tmp2
STD sub_tmp1,0(r_ptr) ; rp[0] = t3
bn_sub_words_exit
.EXIT
BVE (%rp) ; return, borrow in %ret0
NOP
.PROCEND ;in=23,24,25,26,29;out=28;
;------------------------------------------------------------------------------
;
; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d)
;
; Computes the 64-bit quotient of the two-word value h:l divided by d and
; returns it in %ret0. Returns -1 when d == 0. If BN_num_bits_word(d) is
; some i != 64 and h > (1 << i), the routine prints
; "Division would overflow (%d)\n" through fprintf and calls abort.
;
; arg0 = h
; arg1 = l
; arg2 = d
;
; This is mainly just modified assembly from the compiler, thus the
; lack of variable names.
;
; Register roles as used below:
;   %r3 = d (normalized)      %r4 = h        %r5 = l
;   %r6 = dh (d >> 32)        %r8 = dl (low 32 bits of d)
;   %r9 = loop count (2)      %r10 = ret     %r29 = q
;   %r31 = th                 %r23 = tl      %r19 = 0xffffffff00000000
;
; Frame: sp is bumped by 352; rp is kept at -368(%r30), %r3..%r10 at
; -352..-296(%r30), gp (%r27) at -288(%r30), and -280..-232(%r30) are
; scratch slots for moving values between general and fp registers.
;
;------------------------------------------------------------------------------
bn_div_words
.proc
.callinfo CALLER,FRAME=272,ENTRY_GR=%r10,SAVE_RP,ARGS_SAVED,ORDERING_AWARE
.EXPORT bn_div_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
.IMPORT BN_num_bits_word,CODE,NO_RELOCATION
.IMPORT __iob,DATA
.IMPORT fprintf,CODE,NO_RELOCATION
.IMPORT abort,CODE,NO_RELOCATION
.IMPORT $$div2U,MILLICODE
.entry
STD %r2,-16(%r30) ; save return pointer
STD,MA %r3,352(%r30) ; save r3, then bump sp by 352
STD %r4,-344(%r30) ; save r4
STD %r5,-336(%r30) ; save r5
STD %r6,-328(%r30) ; save r6
STD %r7,-320(%r30) ; save r7
STD %r8,-312(%r30) ; save r8
STD %r9,-304(%r30) ; save r9
STD %r10,-296(%r30) ; save r10
STD %r27,-288(%r30) ; save gp
COPY %r24,%r3 ; save d
COPY %r26,%r4 ; save h (high 64-bits)
LDO -1(%r0),%ret0 ; return -1 by default
CMPB,*= %r0,%arg2,$D3 ; if (d == 0) return -1
COPY %r25,%r5 ; save l (low 64-bits) (delay slot)
LDO -48(%r30),%r29 ; create ap
.CALL ;in=26,29;out=28;
B,L BN_num_bits_word,%r2 ; i = BN_num_bits_word(d)
COPY %r3,%r26 ; arg0 = d (delay slot)
LDD -288(%r30),%r27 ; restore gp
LDI 64,%r21
CMPB,= %r21,%ret0,$00000012 ;if (i == 64) (forward)
COPY %ret0,%r24 ; i (delay slot)
MTSARCM %r24 ; shift amount register = complement of i
DEPDI,Z -1,%sar,1,%r29 ; r29 = 1 << i
CMPB,*<<,N %r29,%r4,bn_div_err_case ; if (h > 1<<i) (forward)
$00000012
SUBI 64,%r24,%r31 ; i = 64 - i;
CMPCLR,*<< %r4,%r3,%r0 ; if (h >= d)
SUB %r4,%r3,%r4 ; h -= d
CMPB,= %r31,%r0,$0000001A ; if (i)
COPY %r0,%r10 ; ret = 0 (delay slot)
MTSARCM %r31 ; i to shift
DEPD,Z %r3,%sar,64,%r3 ; d <<= i;
SUBI 64,%r31,%r19 ; 64 - i; redundant
MTSAR %r19 ; (64 -i) to shift
SHRPD %r4,%r5,%sar,%r4 ; h = (h<<i) | (l>>(64-i))
MTSARCM %r31 ; i to shift
DEPD,Z %r5,%sar,64,%r5 ; l <<= i;
$0000001A
DEPDI,Z -1,31,32,%r19 ; r19 = 0xffffffff00000000 (high-half mask)
EXTRD,U %r3,31,32,%r6 ; dh = d >> 32
EXTRD,U %r3,63,32,%r8 ; dl = d & 0xffffffff
LDO 2(%r0),%r9 ; count = 2
STD %r3,-280(%r30) ; "d" to stack
; Outer loop: runs twice, producing 32 quotient bits per pass
$0000001C
DEPDI,Z -1,63,32,%r29 ; q = 0xffffffff (used when (h>>32) == dh)
EXTRD,U %r4,31,32,%r31 ; h >> 32
CMPB,*=,N %r31,%r6,$D2 ; if ((h>>32) != dh)(forward) div
COPY %r4,%r26
EXTRD,U %r4,31,32,%r25
COPY %r6,%r24
.CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL)
B,L $$div2U,%r2 ; NOTE(review): presumably q = h / dh via the divide millicode
EXTRD,U %r6,31,32,%r23 ; (delay slot)
DEPD %r28,31,32,%r29 ; merge millicode result halves into q
$D2
STD %r29,-272(%r30) ; q
AND %r5,%r19,%r24 ; l & 0xffffffff00000000;
EXTRD,U %r24,31,32,%r24 ; r24 = (l & high mask) >> 32
FLDD -272(%r30),%fr7 ; q
FLDD -280(%r30),%fr8 ; d
XMPYU %fr8L,%fr7L,%fr10 ; dh * (q >> 32)
FSTD %fr10,-256(%r30)
XMPYU %fr8L,%fr7R,%fr22 ; dh * (q & 0xffffffff)
FSTD %fr22,-264(%r30)
XMPYU %fr8R,%fr7L,%fr11 ; dl * (q >> 32)
XMPYU %fr8R,%fr7R,%fr23 ; dl * (q & 0xffffffff)
FSTD %fr11,-232(%r30)
FSTD %fr23,-240(%r30)
LDD -256(%r30),%r28
DEPD,Z %r28,31,32,%r2
LDD -264(%r30),%r20
ADD,L %r20,%r2,%r31 ; th = q * dh (low 64 bits)
LDD -232(%r30),%r22
DEPD,Z %r22,31,32,%r22
LDD -240(%r30),%r21
B $00000024 ; enter loop
ADD,L %r21,%r22,%r23 ; tl = q * dl (low 64 bits) (delay slot)
; Correction loop: decrement q until the estimate fits
$0000002A
LDO -1(%r29),%r29 ; q--
SUB %r23,%r8,%r23 ; tl -= dl
$00000024
SUB %r4,%r31,%r25 ; t = h - th
AND %r25,%r19,%r26 ; t & high mask
CMPB,*<>,N %r0,%r26,$00000046 ; break if high half of t is non-zero (forward)
DEPD,Z %r25,31,32,%r20 ; t << 32
OR %r20,%r24,%r21 ; (t << 32) | (l >> 32)
CMPB,*<<,N %r21,%r23,$0000002A ; loop again while that value < tl (backward)
SUB %r31,%r6,%r31 ; th -= dh (delay slot, only when looping)
;-------------Break path---------------------
$00000046
DEPD,Z %r23,31,32,%r25 ;tl
EXTRD,U %r23,31,32,%r26 ;t = tl >> 32
AND %r25,%r19,%r24 ;tl = (tl<<32) & 0xffffffff00000000
ADD,L %r31,%r26,%r31 ;th += t;
CMPCLR,*>>= %r5,%r24,%r0 ;if (l<tl)
LDO 1(%r31),%r31 ; th++;
CMPB,*<<=,N %r31,%r4,$00000036 ;if (h < th) (forward)
LDO -1(%r29),%r29 ;q--;
ADD,L %r4,%r3,%r4 ;h += d;
$00000036
ADDIB,=,N -1,%r9,$D1 ;if (--count == 0) break (forward)
SUB %r5,%r24,%r28 ; l -= tl;
SUB %r4,%r31,%r24 ; h -= th;
SHRPD %r24,%r28,32,%r4 ; h = ((h<<32)|(l>>32));
DEPD,Z %r29,31,32,%r10 ; ret = q<<32
b $0000001C
DEPD,Z %r28,31,32,%r5 ; l = l << 32 (delay slot)
$D1
OR %r10,%r29,%r28 ; ret |= q
; Common epilogue; $D3 is also the d == 0 exit with %ret0 == -1
$D3
LDD -368(%r30),%r2 ; reload return pointer
$D0
LDD -296(%r30),%r10 ; restore r10
LDD -304(%r30),%r9 ; restore r9
LDD -312(%r30),%r8 ; restore r8
LDD -320(%r30),%r7 ; restore r7
LDD -328(%r30),%r6 ; restore r6
LDD -336(%r30),%r5 ; restore r5
LDD -344(%r30),%r4 ; restore r4
BVE (%r2) ; return
.EXIT
LDD,MB -352(%r30),%r3 ; restore r3 and pop the frame (delay slot)
; Overflow error path: fprintf(__iob + 64, C$4, i) followed by abort()
; NOTE(review): __iob + 64 is presumably the stderr FILE entry - confirm
bn_div_err_case
MFIA %r6 ; r6 = address of this instruction
ADDIL L'bn_div_words-bn_div_err_case,%r6,%r1
LDO R'bn_div_words-bn_div_err_case(%r1),%r6 ; r6 = address of bn_div_words
ADDIL LT'__iob,%r27,%r1
LDD RT'__iob(%r1),%r26 ; r26 = &__iob (via the linkage table)
ADDIL L'C$4-bn_div_words,%r6,%r1
LDO R'C$4-bn_div_words(%r1),%r25 ; r25 = address of the format string C$4
LDO 64(%r26),%r26 ; r26 = __iob + 64
.CALL ;in=24,25,26,29;out=28;
B,L fprintf,%r2
LDO -48(%r30),%r29 ; create ap (delay slot)
LDD -288(%r30),%r27 ; restore gp
.CALL ;in=29;
B,L abort,%r2
LDO -48(%r30),%r29 ; create ap (delay slot)
LDD -288(%r30),%r27 ; restore gp
B $D0 ; if abort returns, leave through the normal epilogue
LDD -368(%r30),%r2 ; reload return pointer (delay slot)
.PROCEND ;in=24,25,26,29;out=28;
;----------------------------------------------------------------------------
;
; Register aliases for the comba routines and their macros
; (SQR_ADD_C, SQR_ADD_C2, MUL_ADD_C).
;
; Registers to hold 64-bit values to manipulate. The "L" part
; of the register corresponds to the upper 32-bits, while the "R"
; part corresponds to the lower 32-bits
;
; Note, that when using b6 and b7, the code must save these before
; using them because they are callee save registers
;
;
; Floating point registers to use to save values that
; are manipulated. These don't collide with the ftemp registers
; defined below and are all caller save registers
;
a0 .reg %fr22
a0L .reg %fr22L
a0R .reg %fr22R
a1 .reg %fr23
a1L .reg %fr23L
a1R .reg %fr23R
a2 .reg %fr24
a2L .reg %fr24L
a2R .reg %fr24R
a3 .reg %fr25
a3L .reg %fr25L
a3R .reg %fr25R
a4 .reg %fr26
a4L .reg %fr26L
a4R .reg %fr26R
a5 .reg %fr27
a5L .reg %fr27L
a5R .reg %fr27R
a6 .reg %fr28
a6L .reg %fr28L
a6R .reg %fr28R
a7 .reg %fr29
a7L .reg %fr29L
a7R .reg %fr29R
b0 .reg %fr30
b0L .reg %fr30L
b0R .reg %fr30R
b1 .reg %fr31
b1L .reg %fr31L
b1R .reg %fr31R
;
; Temporary floating point variables, these are all caller save
; registers
;
ftemp1 .reg %fr4
ftemp2 .reg %fr5
ftemp3 .reg %fr6
ftemp4 .reg %fr7
;
; The B set of registers when used.
;
b2 .reg %fr8
b2L .reg %fr8L
b2R .reg %fr8R
b3 .reg %fr9
b3L .reg %fr9L
b3R .reg %fr9R
b4 .reg %fr10
b4L .reg %fr10L
b4R .reg %fr10R
b5 .reg %fr11
b5L .reg %fr11L
b5R .reg %fr11R
b6 .reg %fr12
b6L .reg %fr12L
b6R .reg %fr12R
b7 .reg %fr13
b7L .reg %fr13L
b7R .reg %fr13R
;
; General registers used by the macros: c1/c2/c3 form the rotating
; three-word column accumulator; m, m1, lt, ht and temp1-3 hold partial
; products. c3, m, lt and ht live in %r3-%r6, which the comba routines
; save on entry and restore on exit.
;
c1 .reg %r21 ; only reg
temp1 .reg %r20 ; only reg
temp2 .reg %r19 ; only reg
temp3 .reg %r31 ; only reg
m1 .reg %r28
c2 .reg %r23
high_one .reg %r1 ; holds 1 << 32 (set up by each comba routine)
ht .reg %r6
lt .reg %r5
m .reg %r4
c3 .reg %r3
;
; SQR_ADD_C A0L,A0R,C1,C2,C3
;
; Adds the 128-bit square of the 64-bit value A0L:A0R (upper:lower 32-bit
; halves) into the three-word accumulator: C1 receives the low word, C2
; the high word, and C3 the carry out of C2.
;
; Requires high_mask to hold 0xffffffff80000000 and %sp to have been
; bumped so that -24..-8(%sp) are free scratch slots.
; Writes ftemp1-ftemp3, m, lt, ht, temp1-temp3.
;
SQR_ADD_C .macro A0L,A0R,C1,C2,C3
XMPYU A0L,A0R,ftemp1 ; m = ht*lt
FSTD ftemp1,-24(%sp) ; store m
XMPYU A0R,A0R,ftemp2 ; lt = lt*lt
FSTD ftemp2,-16(%sp) ; store lt
XMPYU A0L,A0L,ftemp3 ; ht = ht*ht
FSTD ftemp3,-8(%sp) ; store ht
LDD -24(%sp),m ; load m
AND m,high_mask,temp2 ; m & Mask
DEPD,Z m,30,31,temp3 ; m << 32+1
LDD -16(%sp),lt ; lt
LDD -8(%sp),ht ; ht
EXTRD,U temp2,32,33,temp1 ; temp1 = m&Mask >> 32-1
ADD temp3,lt,lt ; lt = lt+m (sets carry)
ADD,L ht,temp1,ht ; ht += temp1 (logical add, placed between the ADD and its ADD,DC)
ADD,DC ht,%r0,ht ; ht += carry from lt
ADD C1,lt,C1 ; c1=c1+lt
ADD,DC ht,%r0,ht ; ht += carry from c1
ADD C2,ht,C2 ; c2=c2+ht
ADD,DC C3,%r0,C3 ; c3 += carry from c2
.endm
;
; SQR_ADD_C2 A0L,A0R,A1L,A1R,C1,C2,C3
;
; Adds twice the 128-bit product of A0L:A0R and A1L:A1R into the
; three-word accumulator (C1 low word, C2 high word, C3 carries).
;
; Requires high_one to hold 1 << 32 and %sp to have been bumped so that
; -32..-8(%sp) are free scratch slots.
; Writes ftemp1-ftemp4, m, m1, lt, ht, temp1, temp3.
;
SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3
XMPYU A0L,A1R,ftemp1 ; m1 = A0 high * A1 low
FSTD ftemp1,-16(%sp) ;
XMPYU A0R,A1L,ftemp2 ; m = A0 low * A1 high
FSTD ftemp2,-8(%sp) ;
XMPYU A0R,A1R,ftemp3 ; lt = A0 low * A1 low
FSTD ftemp3,-32(%sp)
XMPYU A0L,A1L,ftemp4 ; ht = A0 high * A1 high
FSTD ftemp4,-24(%sp) ;
LDD -8(%sp),m ; load m
LDD -16(%sp),m1 ; load m1
ADD,L m,m1,m ; m+m1
DEPD,Z m,31,32,temp3 ; (m+m1<<32)
LDD -24(%sp),ht ; load ht
CMPCLR,*>>= m,m1,%r0 ; skip next unless (m+m1) wrapped, i.e. sum < m1
ADD,L ht,high_one,ht ; ht+=high_one
EXTRD,U m,31,32,temp1 ; m >> 32
LDD -32(%sp),lt ; lt
ADD,L ht,temp1,ht ; ht+= m>>32
ADD lt,temp3,lt ; lt = lt+m1
ADD,DC ht,%r0,ht ; ht += carry from lt
ADD ht,ht,ht ; ht=ht+ht;
ADD,DC C3,%r0,C3 ; add in carry (c3++)
ADD lt,lt,lt ; lt=lt+lt;
ADD,DC ht,%r0,ht ; add in carry (ht++)
ADD C1,lt,C1 ; c1=c1+lt
ADD,DC,*NUV ht,%r0,ht ; add in carry (ht++); skip next if no overflow
LDO 1(C3),C3 ; bump c3 if overflow,nullify otherwise
ADD C2,ht,C2 ; c2 = c2 + ht
ADD,DC C3,%r0,C3 ; add in carry (c3++)
.endm
;
;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
;
; Squares the 8-word number a[0..7] and stores the 16-word result in
; r[0..15]. Result words are produced one column at a time: every
; product a[i]*a[j] with i+j == k is accumulated (SQR_ADD_C for i == j,
; SQR_ADD_C2 for i != j) before r[k] is stored, and the roles of
; c1/c2/c3 rotate from one column to the next.
;
; arg0 = r_ptr
; arg1 = a_ptr
;
; Stack use (offsets relative to %sp after the 128-byte bump):
;   -128..-104(%sp)  saved %r3..%r6
;   -32..-8(%sp)     scratch slots used by the macros
;
bn_sqr_comba8
.PROC
.CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
.EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
.ENTRY
.align 64
STD %r3,0(%sp) ; save r3
STD %r4,8(%sp) ; save r4
STD %r5,16(%sp) ; save r5
STD %r6,24(%sp) ; save r6
;
; Zero out carries
;
COPY %r0,c1
COPY %r0,c2
COPY %r0,c3
LDO 128(%sp),%sp ; bump stack
DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
;
; Load up all of the values we are going to use
;
FLDD 0(a_ptr),a0
FLDD 8(a_ptr),a1
FLDD 16(a_ptr),a2
FLDD 24(a_ptr),a3
FLDD 32(a_ptr),a4
FLDD 40(a_ptr),a5
FLDD 48(a_ptr),a6
FLDD 56(a_ptr),a7
SQR_ADD_C a0L,a0R,c1,c2,c3
STD c1,0(r_ptr) ; r[0] = c1;
COPY %r0,c1
SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
STD c2,8(r_ptr) ; r[1] = c2;
COPY %r0,c2
SQR_ADD_C a1L,a1R,c3,c1,c2
SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
STD c3,16(r_ptr) ; r[2] = c3;
COPY %r0,c3
SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
STD c1,24(r_ptr) ; r[3] = c1;
COPY %r0,c1
SQR_ADD_C a2L,a2R,c2,c3,c1
SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1
STD c2,32(r_ptr) ; r[4] = c2;
COPY %r0,c2
SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2
SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2
SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
STD c3,40(r_ptr) ; r[5] = c3;
COPY %r0,c3
SQR_ADD_C a3L,a3R,c1,c2,c3
SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3
SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3
SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3
STD c1,48(r_ptr) ; r[6] = c1;
COPY %r0,c1
SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1
SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1
SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1
SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1
STD c2,56(r_ptr) ; r[7] = c2;
COPY %r0,c2
SQR_ADD_C a4L,a4R,c3,c1,c2
SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2
SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2
SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2
STD c3,64(r_ptr) ; r[8] = c3;
COPY %r0,c3
SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3
SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3
SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3
STD c1,72(r_ptr) ; r[9] = c1;
COPY %r0,c1
SQR_ADD_C a5L,a5R,c2,c3,c1
SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1
SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1
STD c2,80(r_ptr) ; r[10] = c2;
COPY %r0,c2
SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2
SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2
STD c3,88(r_ptr) ; r[11] = c3;
COPY %r0,c3
SQR_ADD_C a6L,a6R,c1,c2,c3
SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3
STD c1,96(r_ptr) ; r[12] = c1;
COPY %r0,c1
SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1
STD c2,104(r_ptr) ; r[13] = c2;
COPY %r0,c2
SQR_ADD_C a7L,a7R,c3,c1,c2
STD c3, 112(r_ptr) ; r[14] = c3
STD c1, 120(r_ptr) ; r[15] = c1
.EXIT
LDD -104(%sp),%r6 ; restore r6
LDD -112(%sp),%r5 ; restore r5
LDD -120(%sp),%r4 ; restore r4
BVE (%rp) ; return
LDD,MB -128(%sp),%r3 ; restore r3 and pop the 128-byte frame (delay slot)
.PROCEND
;-----------------------------------------------------------------------------
;
;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
;
; Squares the 4-word number a[0..3] and stores the 8-word result in
; r[0..7]. Result words are produced one column at a time: every
; product a[i]*a[j] with i+j == k is accumulated (SQR_ADD_C for i == j,
; SQR_ADD_C2 for i != j) before r[k] is stored, and the roles of
; c1/c2/c3 rotate from one column to the next.
;
; arg0 = r_ptr
; arg1 = a_ptr
;
; Stack use (offsets relative to %sp after the 128-byte bump):
;   -128..-104(%sp)  saved %r3..%r6
;   -32..-8(%sp)     scratch slots used by the macros
;
bn_sqr_comba4
.proc
.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
.EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
.entry
.align 64
STD %r3,0(%sp) ; save r3
STD %r4,8(%sp) ; save r4
STD %r5,16(%sp) ; save r5
STD %r6,24(%sp) ; save r6
;
; Zero out carries
;
COPY %r0,c1
COPY %r0,c2
COPY %r0,c3
LDO 128(%sp),%sp ; bump stack
DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
;
; Load up all of the values we are going to use.
; Only a[0..3] exist for the 4-word variant, so nothing beyond
; 24(a_ptr) may be read.
;
FLDD 0(a_ptr),a0
FLDD 8(a_ptr),a1
FLDD 16(a_ptr),a2
FLDD 24(a_ptr),a3
SQR_ADD_C a0L,a0R,c1,c2,c3
STD c1,0(r_ptr) ; r[0] = c1;
COPY %r0,c1
SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
STD c2,8(r_ptr) ; r[1] = c2;
COPY %r0,c2
SQR_ADD_C a1L,a1R,c3,c1,c2
SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
STD c3,16(r_ptr) ; r[2] = c3;
COPY %r0,c3
SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
STD c1,24(r_ptr) ; r[3] = c1;
COPY %r0,c1
SQR_ADD_C a2L,a2R,c2,c3,c1
SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
STD c2,32(r_ptr) ; r[4] = c2;
COPY %r0,c2
SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
STD c3,40(r_ptr) ; r[5] = c3;
COPY %r0,c3
SQR_ADD_C a3L,a3R,c1,c2,c3
STD c1,48(r_ptr) ; r[6] = c1;
STD c2,56(r_ptr) ; r[7] = c2;
.EXIT
LDD -104(%sp),%r6 ; restore r6
LDD -112(%sp),%r5 ; restore r5
LDD -120(%sp),%r4 ; restore r4
BVE (%rp) ; return
LDD,MB -128(%sp),%r3 ; restore r3 and pop the 128-byte frame (delay slot)
.PROCEND
;---------------------------------------------------------------------------
;
; MUL_ADD_C A0L,A0R,B0L,B0R,C1,C2,C3
;
; Adds the 128-bit product of A0L:A0R and B0L:B0R (upper:lower 32-bit
; halves) into the three-word accumulator: C1 receives the low word, C2
; the high word, and C3 the carry out of C2.
;
; Requires high_one to hold 1 << 32 and %sp to have been bumped so that
; -32..-8(%sp) are free scratch slots.
; Writes ftemp1-ftemp4, m, m1, lt, ht, temp1, temp3.
;
MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3
XMPYU A0L,B0R,ftemp1 ; m1 = A high * B low
FSTD ftemp1,-16(%sp) ;
XMPYU A0R,B0L,ftemp2 ; m = A low * B high
FSTD ftemp2,-8(%sp) ;
XMPYU A0R,B0R,ftemp3 ; lt = A low * B low
FSTD ftemp3,-32(%sp)
XMPYU A0L,B0L,ftemp4 ; ht = A high * B high
FSTD ftemp4,-24(%sp) ;
LDD -8(%sp),m ; load m
LDD -16(%sp),m1 ; load m1
ADD,L m,m1,m ; m+m1
DEPD,Z m,31,32,temp3 ; (m+m1<<32)
LDD -24(%sp),ht ; load ht
CMPCLR,*>>= m,m1,%r0 ; skip next unless (m+m1) wrapped, i.e. sum < m1
ADD,L ht,high_one,ht ; ht+=high_one
EXTRD,U m,31,32,temp1 ; m >> 32
LDD -32(%sp),lt ; lt
ADD,L ht,temp1,ht ; ht+= m>>32
ADD lt,temp3,lt ; lt = lt+m1
ADD,DC ht,%r0,ht ; ht += carry from lt
ADD C1,lt,C1 ; c1=c1+lt
ADD,DC ht,%r0,ht ; ht += carry from c1
ADD C2,ht,C2 ; c2 = c2 + ht
ADD,DC C3,%r0,C3 ; add in carry (c3++)
.endm
;
;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
;
; Multiplies the 8-word numbers a[0..7] and b[0..7] and stores the
; 16-word product in r[0..15]. Result words are produced one column at
; a time: every product a[i]*b[j] with i+j == k is accumulated with
; MUL_ADD_C before r[k] is stored, and the roles of c1/c2/c3 rotate from
; one column to the next.
;
; arg0 = r_ptr
; arg1 = a_ptr
; arg2 = b_ptr
;
; Stack use (offsets relative to %sp after the 128-byte bump):
;   -128..-104(%sp)  saved %r3..%r6
;   -96, -88(%sp)    saved %fr12, %fr13 (used here as b6 and b7)
;   -32..-8(%sp)     scratch slots used by MUL_ADD_C
;
bn_mul_comba8
.proc
.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
.EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
.entry
.align 64
STD %r3,0(%sp) ; save r3
STD %r4,8(%sp) ; save r4
STD %r5,16(%sp) ; save r5
STD %r6,24(%sp) ; save r6
FSTD %fr12,32(%sp) ; save fr12 (b6)
FSTD %fr13,40(%sp) ; save fr13 (b7)
;
; Zero out carries
;
COPY %r0,c1
COPY %r0,c2
COPY %r0,c3
LDO 128(%sp),%sp ; bump stack
DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
;
; Load up all of the values we are going to use
;
FLDD 0(a_ptr),a0
FLDD 8(a_ptr),a1
FLDD 16(a_ptr),a2
FLDD 24(a_ptr),a3
FLDD 32(a_ptr),a4
FLDD 40(a_ptr),a5
FLDD 48(a_ptr),a6
FLDD 56(a_ptr),a7
FLDD 0(b_ptr),b0
FLDD 8(b_ptr),b1
FLDD 16(b_ptr),b2
FLDD 24(b_ptr),b3
FLDD 32(b_ptr),b4
FLDD 40(b_ptr),b5
FLDD 48(b_ptr),b6
FLDD 56(b_ptr),b7
MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
STD c1,0(r_ptr) ; r[0] = c1
COPY %r0,c1
MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
STD c2,8(r_ptr) ; r[1] = c2
COPY %r0,c2
MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
STD c3,16(r_ptr) ; r[2] = c3
COPY %r0,c3
MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
STD c1,24(r_ptr) ; r[3] = c1
COPY %r0,c1
MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1
MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1
STD c2,32(r_ptr) ; r[4] = c2
COPY %r0,c2
MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2
MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2
MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2
MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2
STD c3,40(r_ptr) ; r[5] = c3
COPY %r0,c3
MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3
MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3
MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3
MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3
MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3
MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3
STD c1,48(r_ptr) ; r[6] = c1
COPY %r0,c1
MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1
MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1
MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1
MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1
MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1
MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1
MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1
MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1
STD c2,56(r_ptr) ; r[7] = c2
COPY %r0,c2
MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2
MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2
MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2
MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2
MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2
MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2
MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2
STD c3,64(r_ptr) ; r[8] = c3
COPY %r0,c3
MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3
MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3
MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3
MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3
MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3
MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3
STD c1,72(r_ptr) ; r[9] = c1
COPY %r0,c1
MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1
MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1
MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1
MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1
MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1
STD c2,80(r_ptr) ; r[10] = c2
COPY %r0,c2
MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2
MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2
MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2
MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2
STD c3,88(r_ptr) ; r[11] = c3
COPY %r0,c3
MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3
MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3
MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3
STD c1,96(r_ptr) ; r[12] = c1
COPY %r0,c1
MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1
MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1
STD c2,104(r_ptr) ; r[13] = c2
COPY %r0,c2
MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2
STD c3,112(r_ptr) ; r[14] = c3
STD c1,120(r_ptr) ; r[15] = c1
.EXIT
FLDD -88(%sp),%fr13 ; restore fr13
FLDD -96(%sp),%fr12 ; restore fr12
LDD -104(%sp),%r6 ; restore r6
LDD -112(%sp),%r5 ; restore r5
LDD -120(%sp),%r4 ; restore r4
BVE (%rp) ; return
LDD,MB -128(%sp),%r3 ; restore r3 and pop the 128-byte frame (delay slot)
.PROCEND
;-----------------------------------------------------------------------------
;
;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
;
; Multiplies the 4-word numbers a[0..3] and b[0..3] and stores the
; 8-word product in r[0..7]. Result words are produced one column at a
; time: every product a[i]*b[j] with i+j == k is accumulated with
; MUL_ADD_C before r[k] is stored, and the roles of c1/c2/c3 rotate from
; one column to the next.
;
; arg0 = r_ptr
; arg1 = a_ptr
; arg2 = b_ptr
;
; Only a0-a3, b0-b3 and ftemp1-ftemp4 are used as fp registers here, so
; the callee-save pair %fr12/%fr13 (b6/b7) is never written and needs
; no save/restore.
;
; Stack use (offsets relative to %sp after the 128-byte bump):
;   -128..-104(%sp)  saved %r3..%r6
;   -32..-8(%sp)     scratch slots used by MUL_ADD_C
;
bn_mul_comba4
.proc
.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
.EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
.entry
.align 64
STD %r3,0(%sp) ; save r3
STD %r4,8(%sp) ; save r4
STD %r5,16(%sp) ; save r5
STD %r6,24(%sp) ; save r6
;
; Zero out carries
;
COPY %r0,c1
COPY %r0,c2
COPY %r0,c3
LDO 128(%sp),%sp ; bump stack
DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
;
; Load up all of the values we are going to use
;
FLDD 0(a_ptr),a0
FLDD 8(a_ptr),a1
FLDD 16(a_ptr),a2
FLDD 24(a_ptr),a3
FLDD 0(b_ptr),b0
FLDD 8(b_ptr),b1
FLDD 16(b_ptr),b2
FLDD 24(b_ptr),b3
MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
STD c1,0(r_ptr) ; r[0] = c1
COPY %r0,c1
MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
STD c2,8(r_ptr) ; r[1] = c2
COPY %r0,c2
MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
STD c3,16(r_ptr) ; r[2] = c3
COPY %r0,c3
MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
STD c1,24(r_ptr) ; r[3] = c1
COPY %r0,c1
MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
STD c2,32(r_ptr) ; r[4] = c2
COPY %r0,c2
MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
STD c3,40(r_ptr) ; r[5] = c3
COPY %r0,c3
MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
STD c1,48(r_ptr) ; r[6] = c1
STD c2,56(r_ptr) ; r[7] = c2
.EXIT
LDD -104(%sp),%r6 ; restore r6
LDD -112(%sp),%r5 ; restore r5
LDD -120(%sp),%r4 ; restore r4
BVE (%rp) ; return
LDD,MB -128(%sp),%r3 ; restore r3 and pop the 128-byte frame (delay slot)
.PROCEND
;
; Space/subspace declarations and read-only literal data.
; C$4 is the format string that the bn_div_words overflow error path
; passes to fprintf.
;
.SPACE $TEXT$
.SUBSPA $CODE$
.SPACE $PRIVATE$,SORT=16
.IMPORT $global$,DATA
.SPACE $TEXT$
.SUBSPA $CODE$
.SUBSPA $LIT$,QUAD=0,ALIGN=8,ACCESS=0x2c,SORT=16
C$4
.ALIGN 8
.STRINGZ "Division would overflow (%d)\n"
.END
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册