提交 d0c2ebf4 编写于 作者: R Richard Levitte

A patch from HP for better performance.

Submitted by Kevin Steves <ks@hp.se> 3 months ago...
上级 e17b7128
...@@ -212,12 +212,21 @@ my %table=( ...@@ -212,12 +212,21 @@ my %table=(
# crypto/sha/sha_lcl.h. # crypto/sha/sha_lcl.h.
# <appro@fy.chalmers.se> # <appro@fy.chalmers.se>
# #
"hpux-parisc-cc","cc:-Ae +O3 +ESlit -z -DB_ENDIAN -DBN_DIV2W -DMD32_XARRAY:::-ldld:BN_LLONG DES_PTR DES_UNROLL DES_RISC1::::::::::dl", #!#"hpux-parisc-cc","cc:-Ae +O3 +ESlit -z -DB_ENDIAN -DBN_DIV2W -DMD32_XARRAY:::-ldld:BN_LLONG DES_PTR DES_UNROLL DES_RISC1::::::::::dl",
# Since there is mention of this in shlib/hpux10-cc.sh # Since there is mention of this in shlib/hpux10-cc.sh
"hpux-parisc-cc-o4","cc:-Ae +O4 +ESlit -z -DB_ENDIAN -DBN_DIV2W -DMD32_XARRAY:::-ldld:BN_LLONG DES_PTR DES_UNROLL DES_RISC1::::::::::dl", "hpux-parisc-cc-o4","cc:-Ae +O4 +ESlit -z -DB_ENDIAN -DBN_DIV2W -DMD32_XARRAY:::-ldld:BN_LLONG DES_PTR DES_UNROLL DES_RISC1::::::::::dl",
"hpux-parisc-gcc","gcc:-O3 -DB_ENDIAN -DBN_DIV2W:::-ldld:BN_LLONG DES_PTR DES_UNROLL DES_RISC1::::::::::dl", "hpux-parisc-gcc","gcc:-O3 -DB_ENDIAN -DBN_DIV2W:::-ldld:BN_LLONG DES_PTR DES_UNROLL DES_RISC1::::::::::dl",
"hpux64-parisc-cc","cc:-Ae +DD64 +O3 +ESlit -z -DB_ENDIAN -DMD32_XARRAY::-D_REENTRANT:-ldld:SIXTY_FOUR_BIT_LONG MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC1 DES_INT::::::::::dl", "hpux64-parisc-cc","cc:-Ae +DD64 +O3 +ESlit -z -DB_ENDIAN -DMD32_XARRAY::-D_REENTRANT:-ldld:SIXTY_FOUR_BIT_LONG MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC1 DES_INT::::::::::dl",
# More attempts at unified 10.X and 11.X targets for HP C compiler.
#
# Chris Ruemmler <ruemmler@cup.hp.com>
# Kevin Steves <ks@hp.se>
"hpux-parisc-cc","cc:+O3 +Optrs_strongly_typed +Olibcalls -Ae +ESlit -DB_ENDIAN -DBN_DIV2W -DMD32_XARRAY::-D_REENTRANT:-ldl:MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC1 DES_INT::::::::::dl",
"hpux-parisc2-cc","cc:+DA2.0 +DS2.0 +O3 +Optrs_strongly_typed +Olibcalls -Ae +ESlit -DB_ENDIAN -DMD32_XARRAY::-D_REENTRANT:-ldl:SIXTY_FOUR_BIT MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC1 DES_INT:asm/pa-risc2.o:::::::::dl",
"hpux64-parisc2-cc","cc:+DD64 +O3 +Optrs_strongly_typed +Olibcalls -Ae +ESlit -DB_ENDIAN -DMD32_XARRAY::-D_REENTRANT:-ldl:SIXTY_FOUR_BIT_LONG MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC1 DES_INT:asm/pa-risc2W.o:::::::::dl",
"hpux-parisc1_1-cc","cc:+DA1.1 +DS1.1 +O3 +Optrs_strongly_typed +Olibcalls -Ae +ESlit -DB_ENDIAN -DMD32_XARRAY::-D_REENTRANT:-ldl:MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC1 DES_INT::::::::::dl",
# HPUX 9.X config. # HPUX 9.X config.
# Don't use the bundled cc. It is broken. Use HP ANSI C if possible, or # Don't use the bundled cc. It is broken. Use HP ANSI C if possible, or
# egcs. gcc 2.8.1 is also broken. # egcs. gcc 2.8.1 is also broken.
......
...@@ -15,9 +15,9 @@ On the 2 alpha C compilers I had access to, it was not possible to do ...@@ -15,9 +15,9 @@ On the 2 alpha C compilers I had access to, it was not possible to do
were 64 bits). So the hand assember gives access to the 128 bit result and were 64 bits). So the hand assember gives access to the 128 bit result and
a 2 times speedup :-). a 2 times speedup :-).
There are 2 versions of assember for the HP PA-RISC. There are 3 versions of assember for the HP PA-RISC.
pa-risc.s is the origional one which works fine.
pa-risc2.s is a new version that often generates warnings but if the pa-risc.s is the origional one which works fine and generated using gcc :-)
tests pass, it gives performance that is over 2 times faster than
pa-risc.s. pa-risc2W.s and pa-risc2.s are 64 and 32-bit PA-RISC 2.0 implementations
Both were generated using gcc :-) by Chris Ruemmler from HP (with some help from the HP C compiler).
此差异已折叠。
.SPACE $PRIVATE$
.SUBSPA $DATA$,QUAD=1,ALIGN=8,ACCESS=31
.SUBSPA $BSS$,QUAD=1,ALIGN=8,ACCESS=31,ZERO,SORT=82
.SPACE $TEXT$
.SUBSPA $LIT$,QUAD=0,ALIGN=8,ACCESS=44
.SUBSPA $CODE$,QUAD=0,ALIGN=8,ACCESS=44,CODE_ONLY
.IMPORT $global$,DATA
.IMPORT $$dyncall,MILLICODE
; gcc_compiled.:
.SPACE $TEXT$
.SUBSPA $CODE$
.align 4
.EXPORT bn_mul_add_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR
bn_mul_add_words
.PROC
.CALLINFO FRAME=64,CALLS,SAVE_RP,ENTRY_GR=4
.ENTRY
stw %r2,-20(0,%r30)
stwm %r4,64(0,%r30)
copy %r24,%r31
stw %r3,-60(0,%r30)
ldi 0,%r20
ldo 12(%r26),%r2
stw %r23,-16(0,%r30)
copy %r25,%r3
ldo 12(%r3),%r1
fldws -16(0,%r30),%fr8L
L$0010
copy %r20,%r25
ldi 0,%r24
fldws 0(0,%r3),%fr9L
ldw 0(0,%r26),%r19
xmpyu %fr8L,%fr9L,%fr9
fstds %fr9,-16(0,%r30)
copy %r19,%r23
ldw -16(0,%r30),%r28
ldw -12(0,%r30),%r29
ldi 0,%r22
add %r23,%r29,%r29
addc %r22,%r28,%r28
add %r25,%r29,%r29
addc %r24,%r28,%r28
copy %r28,%r21
ldi 0,%r20
copy %r21,%r20
addib,= -1,%r31,L$0011
stw %r29,0(0,%r26)
copy %r20,%r25
ldi 0,%r24
fldws -8(0,%r1),%fr9L
ldw -8(0,%r2),%r19
xmpyu %fr8L,%fr9L,%fr9
fstds %fr9,-16(0,%r30)
copy %r19,%r23
ldw -16(0,%r30),%r28
ldw -12(0,%r30),%r29
ldi 0,%r22
add %r23,%r29,%r29
addc %r22,%r28,%r28
add %r25,%r29,%r29
addc %r24,%r28,%r28
copy %r28,%r21
ldi 0,%r20
copy %r21,%r20
addib,= -1,%r31,L$0011
stw %r29,-8(0,%r2)
copy %r20,%r25
ldi 0,%r24
fldws -4(0,%r1),%fr9L
ldw -4(0,%r2),%r19
xmpyu %fr8L,%fr9L,%fr9
fstds %fr9,-16(0,%r30)
copy %r19,%r23
ldw -16(0,%r30),%r28
ldw -12(0,%r30),%r29
ldi 0,%r22
add %r23,%r29,%r29
addc %r22,%r28,%r28
add %r25,%r29,%r29
addc %r24,%r28,%r28
copy %r28,%r21
ldi 0,%r20
copy %r21,%r20
addib,= -1,%r31,L$0011
stw %r29,-4(0,%r2)
copy %r20,%r25
ldi 0,%r24
fldws 0(0,%r1),%fr9L
ldw 0(0,%r2),%r19
xmpyu %fr8L,%fr9L,%fr9
fstds %fr9,-16(0,%r30)
copy %r19,%r23
ldw -16(0,%r30),%r28
ldw -12(0,%r30),%r29
ldi 0,%r22
add %r23,%r29,%r29
addc %r22,%r28,%r28
add %r25,%r29,%r29
addc %r24,%r28,%r28
copy %r28,%r21
ldi 0,%r20
copy %r21,%r20
addib,= -1,%r31,L$0011
stw %r29,0(0,%r2)
ldo 16(%r1),%r1
ldo 16(%r3),%r3
ldo 16(%r2),%r2
bl L$0010,0
ldo 16(%r26),%r26
L$0011
copy %r20,%r28
ldw -84(0,%r30),%r2
ldw -60(0,%r30),%r3
bv 0(%r2)
ldwm -64(0,%r30),%r4
.EXIT
.PROCEND
.align 4
.EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR
bn_mul_words
.PROC
.CALLINFO FRAME=64,CALLS,SAVE_RP,ENTRY_GR=3
.ENTRY
stw %r2,-20(0,%r30)
copy %r25,%r2
stwm %r4,64(0,%r30)
copy %r24,%r19
ldi 0,%r28
stw %r23,-16(0,%r30)
ldo 12(%r26),%r31
ldo 12(%r2),%r29
fldws -16(0,%r30),%fr8L
L$0026
fldws 0(0,%r2),%fr9L
xmpyu %fr8L,%fr9L,%fr9
fstds %fr9,-16(0,%r30)
copy %r28,%r21
ldi 0,%r20
ldw -16(0,%r30),%r24
ldw -12(0,%r30),%r25
add %r21,%r25,%r25
addc %r20,%r24,%r24
copy %r24,%r23
ldi 0,%r22
copy %r23,%r28
addib,= -1,%r19,L$0027
stw %r25,0(0,%r26)
fldws -8(0,%r29),%fr9L
xmpyu %fr8L,%fr9L,%fr9
fstds %fr9,-16(0,%r30)
copy %r28,%r21
ldi 0,%r20
ldw -16(0,%r30),%r24
ldw -12(0,%r30),%r25
add %r21,%r25,%r25
addc %r20,%r24,%r24
copy %r24,%r23
ldi 0,%r22
copy %r23,%r28
addib,= -1,%r19,L$0027
stw %r25,-8(0,%r31)
fldws -4(0,%r29),%fr9L
xmpyu %fr8L,%fr9L,%fr9
fstds %fr9,-16(0,%r30)
copy %r28,%r21
ldi 0,%r20
ldw -16(0,%r30),%r24
ldw -12(0,%r30),%r25
add %r21,%r25,%r25
addc %r20,%r24,%r24
copy %r24,%r23
ldi 0,%r22
copy %r23,%r28
addib,= -1,%r19,L$0027
stw %r25,-4(0,%r31)
fldws 0(0,%r29),%fr9L
xmpyu %fr8L,%fr9L,%fr9
fstds %fr9,-16(0,%r30)
copy %r28,%r21
ldi 0,%r20
ldw -16(0,%r30),%r24
ldw -12(0,%r30),%r25
add %r21,%r25,%r25
addc %r20,%r24,%r24
copy %r24,%r23
ldi 0,%r22
copy %r23,%r28
addib,= -1,%r19,L$0027
stw %r25,0(0,%r31)
ldo 16(%r29),%r29
ldo 16(%r2),%r2
ldo 16(%r31),%r31
bl L$0026,0
ldo 16(%r26),%r26
L$0027
ldw -84(0,%r30),%r2
bv 0(%r2)
ldwm -64(0,%r30),%r4
.EXIT
.PROCEND
.align 4
.EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR
bn_sqr_words
.PROC
.CALLINFO FRAME=0,NO_CALLS
.ENTRY
ldo 28(%r26),%r19
ldo 12(%r25),%r28
L$0042
fldws 0(0,%r25),%fr8L
fldws 0(0,%r25),%fr8R
xmpyu %fr8L,%fr8R,%fr8
fstds %fr8,-16(0,%r30)
ldw -16(0,%r30),%r22
ldw -12(0,%r30),%r23
stw %r23,0(0,%r26)
copy %r22,%r21
ldi 0,%r20
addib,= -1,%r24,L$0049
stw %r21,-24(0,%r19)
fldws -8(0,%r28),%fr8L
fldws -8(0,%r28),%fr8R
xmpyu %fr8L,%fr8R,%fr8
fstds %fr8,-16(0,%r30)
ldw -16(0,%r30),%r22
ldw -12(0,%r30),%r23
stw %r23,-20(0,%r19)
copy %r22,%r21
ldi 0,%r20
addib,= -1,%r24,L$0049
stw %r21,-16(0,%r19)
fldws -4(0,%r28),%fr8L
fldws -4(0,%r28),%fr8R
xmpyu %fr8L,%fr8R,%fr8
fstds %fr8,-16(0,%r30)
ldw -16(0,%r30),%r22
ldw -12(0,%r30),%r23
stw %r23,-12(0,%r19)
copy %r22,%r21
ldi 0,%r20
addib,= -1,%r24,L$0049
stw %r21,-8(0,%r19)
fldws 0(0,%r28),%fr8L
fldws 0(0,%r28),%fr8R
xmpyu %fr8L,%fr8R,%fr8
fstds %fr8,-16(0,%r30)
ldw -16(0,%r30),%r22
ldw -12(0,%r30),%r23
stw %r23,-4(0,%r19)
copy %r22,%r21
ldi 0,%r20
addib,= -1,%r24,L$0049
stw %r21,0(0,%r19)
ldo 16(%r28),%r28
ldo 16(%r25),%r25
ldo 32(%r19),%r19
bl L$0042,0
ldo 32(%r26),%r26
L$0049
bv,n 0(%r2)
.EXIT
.PROCEND
.IMPORT BN_num_bits_word,CODE
.IMPORT fprintf,CODE
.IMPORT __iob,DATA
.SPACE $TEXT$
.SUBSPA $LIT$
.align 4
L$C0000
.STRING "Division would overflow (%d)\x0a\x00"
.IMPORT abort,CODE
.SPACE $TEXT$
.SUBSPA $CODE$
.align 4
.EXPORT bn_div64,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR
bn_div64
.PROC
.CALLINFO FRAME=128,CALLS,SAVE_RP,ENTRY_GR=8
.ENTRY
stw %r2,-20(0,%r30)
stwm %r8,128(0,%r30)
stw %r7,-124(0,%r30)
stw %r4,-112(0,%r30)
stw %r3,-108(0,%r30)
copy %r26,%r3
copy %r25,%r4
stw %r6,-120(0,%r30)
ldi 0,%r7
stw %r5,-116(0,%r30)
movb,<> %r24,%r5,L$0051
ldi 2,%r6
bl L$0068,0
ldi -1,%r28
L$0051
.CALL ARGW0=GR
bl BN_num_bits_word,%r2
copy %r5,%r26
copy %r28,%r24
ldi 32,%r19
comb,= %r19,%r24,L$0052
subi 31,%r24,%r19
mtsar %r19
zvdepi 1,32,%r19
comb,>>= %r19,%r3,L$0052
addil LR'__iob-$global$+32,%r27
ldo RR'__iob-$global$+32(%r1),%r26
ldil LR'L$C0000,%r25
.CALL ARGW0=GR,ARGW1=GR,ARGW2=GR
bl fprintf,%r2
ldo RR'L$C0000(%r25),%r25
.CALL
bl abort,%r2
nop
L$0052
comb,>> %r5,%r3,L$0053
subi 32,%r24,%r24
sub %r3,%r5,%r3
L$0053
comib,= 0,%r24,L$0054
subi 31,%r24,%r19
mtsar %r19
zvdep %r5,32,%r5
zvdep %r3,32,%r21
subi 32,%r24,%r20
mtsar %r20
vshd 0,%r4,%r20
or %r21,%r20,%r3
mtsar %r19
zvdep %r4,32,%r4
L$0054
extru %r5,15,16,%r23
extru %r5,31,16,%r28
L$0055
extru %r3,15,16,%r19
comb,<> %r23,%r19,L$0058
copy %r3,%r26
bl L$0059,0
zdepi -1,31,16,%r29
L$0058
.IMPORT $$divU,MILLICODE
bl $$divU,%r31
copy %r23,%r25
L$0059
stw %r29,-16(0,%r30)
fldws -16(0,%r30),%fr10L
stw %r28,-16(0,%r30)
fldws -16(0,%r30),%fr10R
stw %r23,-16(0,%r30)
xmpyu %fr10L,%fr10R,%fr8
fldws -16(0,%r30),%fr10R
fstws %fr8R,-16(0,%r30)
xmpyu %fr10L,%fr10R,%fr9
ldw -16(0,%r30),%r8
fstws %fr9R,-16(0,%r30)
copy %r8,%r22
ldw -16(0,%r30),%r8
extru %r4,15,16,%r24
copy %r8,%r21
L$0060
sub %r3,%r21,%r20
copy %r20,%r19
depi 0,31,16,%r19
comib,<> 0,%r19,L$0061
zdep %r20,15,16,%r19
addl %r19,%r24,%r19
comb,>>= %r19,%r22,L$0061
sub %r22,%r28,%r22
sub %r21,%r23,%r21
bl L$0060,0
ldo -1(%r29),%r29
L$0061
stw %r29,-16(0,%r30)
fldws -16(0,%r30),%fr10L
stw %r28,-16(0,%r30)
fldws -16(0,%r30),%fr10R
xmpyu %fr10L,%fr10R,%fr8
fstws %fr8R,-16(0,%r30)
ldw -16(0,%r30),%r8
stw %r23,-16(0,%r30)
fldws -16(0,%r30),%fr10R
copy %r8,%r19
xmpyu %fr10L,%fr10R,%fr8
fstws %fr8R,-16(0,%r30)
extru %r19,15,16,%r20
ldw -16(0,%r30),%r8
zdep %r19,15,16,%r19
addl %r8,%r20,%r20
comclr,<<= %r19,%r4,0
addi 1,%r20,%r20
comb,<<= %r20,%r3,L$0066
sub %r4,%r19,%r4
addl %r3,%r5,%r3
ldo -1(%r29),%r29
L$0066
addib,= -1,%r6,L$0056
sub %r3,%r20,%r3
zdep %r29,15,16,%r7
shd %r3,%r4,16,%r3
bl L$0055,0
zdep %r4,15,16,%r4
L$0056
or %r7,%r29,%r28
L$0068
ldw -148(0,%r30),%r2
ldw -124(0,%r30),%r7
ldw -120(0,%r30),%r6
ldw -116(0,%r30),%r5
ldw -112(0,%r30),%r4
ldw -108(0,%r30),%r3
bv 0(%r2)
ldwm -128(0,%r30),%r8
.EXIT
.PROCEND
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册