提交 3ba1ef82 编写于 作者: A Andy Polyakov

bn/asm/x86[_64]-mont*.pl: implement slightly alternative page-walking.

Original strategy for page-walking was adjust stack pointer and then
touch pages in order. This kind of asks for double-fault, because
if touch fails, then signal will be delivered to frame above adjusted
stack pointer. But touching pages prior adjusting stack pointer would
upset valgrind. As compromise let's adjust stack pointer in pages,
touching top of the stack. This still asks for double-fault, but at
least prevents corruption of neighbour stack if allocation is to
overstep the guard page.

Also omit predict-non-taken hints as they reportedly trigger illegal
instructions in some VM setups.
Reviewed-by: NRichard Levitte <levitte@openssl.org>
上级 fe34735c
......@@ -73,27 +73,26 @@ $frame=32; # size of above frame rounded up to 16n
&lea ("esi",&wparam(0)); # put aside pointer to argument block
&lea ("edx",&wparam(1)); # load ap
&mov ("ebp","esp"); # saved stack pointer!
&add ("edi",2); # extra two words on top of tp
&neg ("edi");
&lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2))
&lea ("ebp",&DWP(-$frame,"esp","edi",4)); # future alloca($frame+4*(num+2))
&neg ("edi");
# minimize cache contention by arraning 2K window between stack
# pointer and ap argument [np is also position sensitive vector,
# but it's assumed to be near ap, as it's allocated at ~same
# time].
&mov ("eax","esp");
&mov ("eax","ebp");
&sub ("eax","edx");
&and ("eax",2047);
&sub ("esp","eax"); # this aligns sp and ap modulo 2048
&sub ("ebp","eax"); # this aligns sp and ap modulo 2048
&xor ("edx","esp");
&xor ("edx","ebp");
&and ("edx",2048);
&xor ("edx",2048);
&sub ("esp","edx"); # this splits them apart modulo 4096
&sub ("ebp","edx"); # this splits them apart modulo 4096
&and ("esp",-64); # align to cache line
&and ("ebp",-64); # align to cache line
# An OS-agnostic version of __chkstk.
#
......@@ -103,20 +102,28 @@ $frame=32; # size of above frame rounded up to 16n
# be punishable by SEGV. But page walking can do good even on
# other OSes, because it guarantees that villain thread hits
# the guard page before it can make damage to innocent one...
&mov ("eax","ebp");
&sub ("eax","esp");
&mov ("eax","esp");
&sub ("eax","ebp");
&and ("eax",-4096);
&set_label("page_walk");
&mov ("edx",&DWP(0,"esp","eax"));
&sub ("eax",4096);
&data_byte(0x2e);
&jnc (&label("page_walk"));
&mov ("edx","esp"); # saved stack pointer!
&lea ("esp",&DWP(0,"ebp","eax"));
&mov ("eax",&DWP(0,"esp"));
&cmp ("esp","ebp");
&ja (&label("page_walk"));
&jmp (&label("page_walk_done"));
&set_label("page_walk",16);
&lea ("esp",&DWP(-4096,"esp"));
&mov ("eax",&DWP(0,"esp"));
&cmp ("esp","ebp");
&ja (&label("page_walk"));
&set_label("page_walk_done");
################################# load argument block...
&mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
&mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
&mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
&mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
&mov ("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np
&mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
#&mov ("edi",&DWP(5*4,"esi"));# int num
......@@ -124,11 +131,11 @@ $frame=32; # size of above frame rounded up to 16n
&mov ($_rp,"eax"); # ... save a copy of argument block
&mov ($_ap,"ebx");
&mov ($_bp,"ecx");
&mov ($_np,"edx");
&mov ($_np,"ebp");
&mov ($_n0,"esi");
&lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling
#&mov ($_num,$num); # redundant as $num is not reused
&mov ($_sp,"ebp"); # saved stack pointer!
&mov ($_sp,"edx"); # saved stack pointer!
if($sse2) {
$acc0="mm0"; # mmx register bank layout
......
......@@ -104,6 +104,8 @@ $code=<<___;
.type bn_mul_mont,\@function,6
.align 16
bn_mul_mont:
mov ${num}d,${num}d
mov %rsp,%rax
test \$3,${num}d
jnz .Lmul_enter
cmp \$8,${num}d
......@@ -128,15 +130,12 @@ $code.=<<___;
push %r14
push %r15
mov ${num}d,${num}d
lea 2($num),%r10
neg $num
mov %rsp,%r11
neg %r10
lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+2))
and \$-1024,%rsp # minimize TLB usage
lea -16(%rsp,$num,8),%r10 # future alloca(8*(num+2))
neg $num # restore $num
and \$-1024,%r10 # minimize TLB usage
mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
.Lmul_body:
# An OS-agnostic version of __chkstk.
#
# Some OSes (Windows) insist on stack being "wired" to
......@@ -145,14 +144,24 @@ $code.=<<___;
# be punishable by SEGV. But page walking can do good even on
# other OSes, because it guarantees that villain thread hits
# the guard page before it can make damage to innocent one...
sub %rsp,%r11
sub %r10,%r11
and \$-4096,%r11
lea (%r10,%r11),%rsp
mov (%rsp),%r11
cmp %r10,%rsp
ja .Lmul_page_walk
jmp .Lmul_page_walk_done
.align 16
.Lmul_page_walk:
mov (%rsp,%r11),%r10
sub \$4096,%r11
.byte 0x66,0x2e # predict non-taken
jnc .Lmul_page_walk
lea -4096(%rsp),%rsp
mov (%rsp),%r11
cmp %r10,%rsp
ja .Lmul_page_walk
.Lmul_page_walk_done:
mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
.Lmul_body:
mov $bp,%r12 # reassign $bp
___
$bp="%r12";
......@@ -323,13 +332,13 @@ $code.=<<___;
mov 8(%rsp,$num,8),%rsi # restore %rsp
mov \$1,%rax
mov (%rsi),%r15
mov 8(%rsi),%r14
mov 16(%rsi),%r13
mov 24(%rsi),%r12
mov 32(%rsi),%rbp
mov 40(%rsi),%rbx
lea 48(%rsi),%rsp
mov -48(%rsi),%r15
mov -40(%rsi),%r14
mov -32(%rsi),%r13
mov -24(%rsi),%r12
mov -16(%rsi),%rbp
mov -8(%rsi),%rbx
lea (%rsi),%rsp
.Lmul_epilogue:
ret
.size bn_mul_mont,.-bn_mul_mont
......@@ -341,6 +350,8 @@ $code.=<<___;
.type bn_mul4x_mont,\@function,6
.align 16
bn_mul4x_mont:
mov ${num}d,${num}d
mov %rsp,%rax
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
......@@ -356,23 +367,29 @@ $code.=<<___;
push %r14
push %r15
mov ${num}d,${num}d
lea 4($num),%r10
neg $num
mov %rsp,%r11
neg %r10
lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+4))
and \$-1024,%rsp # minimize TLB usage
lea -32(%rsp,$num,8),%r10 # future alloca(8*(num+4))
neg $num # restore
and \$-1024,%r10 # minimize TLB usage
mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
.Lmul4x_body:
sub %rsp,%r11
sub %r10,%r11
and \$-4096,%r11
lea (%r10,%r11),%rsp
mov (%rsp),%r11
cmp %r10,%rsp
ja .Lmul4x_page_walk
jmp .Lmul4x_page_walk_done
.Lmul4x_page_walk:
mov (%rsp,%r11),%r10
sub \$4096,%r11
.byte 0x2e # predict non-taken
jnc .Lmul4x_page_walk
lea -4096(%rsp),%rsp
mov (%rsp),%r11
cmp %r10,%rsp
ja .Lmul4x_page_walk
.Lmul4x_page_walk_done:
mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
.Lmul4x_body:
mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
mov %rdx,%r12 # reassign $bp
___
......@@ -751,13 +768,13 @@ ___
$code.=<<___;
mov 8(%rsp,$num,8),%rsi # restore %rsp
mov \$1,%rax
mov (%rsi),%r15
mov 8(%rsi),%r14
mov 16(%rsi),%r13
mov 24(%rsi),%r12
mov 32(%rsi),%rbp
mov 40(%rsi),%rbx
lea 48(%rsi),%rsp
mov -48(%rsi),%r15
mov -40(%rsi),%r14
mov -32(%rsi),%r13
mov -24(%rsi),%r12
mov -16(%rsi),%rbp
mov -8(%rsi),%rbx
lea (%rsi),%rsp
.Lmul4x_epilogue:
ret
.size bn_mul4x_mont,.-bn_mul4x_mont
......@@ -787,14 +804,15 @@ $code.=<<___;
.type bn_sqr8x_mont,\@function,6
.align 32
bn_sqr8x_mont:
.Lsqr8x_enter:
mov %rsp,%rax
.Lsqr8x_enter:
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
.Lsqr8x_prologue:
mov ${num}d,%r10d
shl \$3,${num}d # convert $num to bytes
......@@ -807,33 +825,42 @@ bn_sqr8x_mont:
# do its job.
#
lea -64(%rsp,$num,2),%r11
mov %rsp,%rbp
mov ($n0),$n0 # *n0
sub $aptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lsqr8x_sp_alt
sub %r11,%rsp # align with $aptr
lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
sub %r11,%rbp # align with $aptr
lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num)
jmp .Lsqr8x_sp_done
.align 32
.Lsqr8x_sp_alt:
lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
sub %r11,%rsp
sub %r11,%rbp
.Lsqr8x_sp_done:
and \$-64,%rsp
mov %rax,%r11
sub %rsp,%r11
and \$-64,%rbp
mov %rsp,%r11
sub %rbp,%r11
and \$-4096,%r11
lea (%rbp,%r11),%rsp
mov (%rsp),%r10
cmp %rbp,%rsp
ja .Lsqr8x_page_walk
jmp .Lsqr8x_page_walk_done
.align 16
.Lsqr8x_page_walk:
mov (%rsp,%r11),%r10
sub \$4096,%r11
.byte 0x2e # predict non-taken
jnc .Lsqr8x_page_walk
lea -4096(%rsp),%rsp
mov (%rsp),%r10
cmp %rbp,%rsp
ja .Lsqr8x_page_walk
.Lsqr8x_page_walk_done:
mov $num,%r10
neg $num
......@@ -957,30 +984,38 @@ $code.=<<___;
.type bn_mulx4x_mont,\@function,6
.align 32
bn_mulx4x_mont:
.Lmulx4x_enter:
mov %rsp,%rax
.Lmulx4x_enter:
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
.Lmulx4x_prologue:
shl \$3,${num}d # convert $num to bytes
.byte 0x67
xor %r10,%r10
sub $num,%r10 # -$num
mov ($n0),$n0 # *n0
lea -72(%rsp,%r10),%rsp # alloca(frame+$num+8)
and \$-128,%rsp
mov %rax,%r11
sub %rsp,%r11
lea -72(%rsp,%r10),%rbp # future alloca(frame+$num+8)
and \$-128,%rbp
mov %rsp,%r11
sub %rbp,%r11
and \$-4096,%r11
lea (%rbp,%r11),%rsp
mov (%rsp),%r10
cmp %rbp,%rsp
ja .Lmulx4x_page_walk
jmp .Lmulx4x_page_walk_done
.align 16
.Lmulx4x_page_walk:
mov (%rsp,%r11),%r10
sub \$4096,%r11
.byte 0x66,0x2e # predict non-taken
jnc .Lmulx4x_page_walk
lea -4096(%rsp),%rsp
mov (%rsp),%r10
cmp %rbp,%rsp
ja .Lmulx4x_page_walk
.Lmulx4x_page_walk_done:
lea ($bp,$num),%r10
##############################################################
......@@ -1341,22 +1376,8 @@ mul_handler:
mov 192($context),%r10 # pull $num
mov 8(%rax,%r10,8),%rax # pull saved stack pointer
lea 48(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
mov -40(%rax),%r14
mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15
jmp .Lcommon_seh_tail
jmp .Lcommon_pop_regs
.size mul_handler,.-mul_handler
.type sqr_handler,\@abi-omnipotent
......@@ -1384,15 +1405,21 @@ sqr_handler:
cmp %r10,%rbx # context->Rip<.Lsqr_body
jb .Lcommon_seh_tail
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # body label
cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue
jb .Lcommon_pop_regs
mov 152($context),%rax # pull context->Rsp
mov 4(%r11),%r10d # HandlerData[1]
mov 8(%r11),%r10d # HandlerData[2]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue
jae .Lcommon_seh_tail
mov 40(%rax),%rax # pull saved stack pointer
.Lcommon_pop_regs:
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
......@@ -1479,13 +1506,15 @@ $code.=<<___;
.LSEH_info_bn_sqr8x_mont:
.byte 9,0,0,0
.rva sqr_handler
.rva .Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[]
.rva .Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[]
.align 8
___
$code.=<<___ if ($addx);
.LSEH_info_bn_mulx4x_mont:
.byte 9,0,0,0
.rva sqr_handler
.rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
.rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
.align 8
___
}
......
......@@ -93,6 +93,8 @@ $code=<<___;
.type bn_mul_mont_gather5,\@function,6
.align 64
bn_mul_mont_gather5:
mov ${num}d,${num}d
mov %rsp,%rax
test \$7,${num}d
jnz .Lmul_enter
___
......@@ -104,10 +106,7 @@ $code.=<<___;
.align 16
.Lmul_enter:
mov ${num}d,${num}d
mov %rsp,%rax
movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument
lea .Linc(%rip),%r10
push %rbx
push %rbp
push %r12
......@@ -115,13 +114,12 @@ $code.=<<___;
push %r14
push %r15
lea 2($num),%r11
neg %r11
lea -264(%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)+256+8)
and \$-1024,%rsp # minimize TLB usage
neg $num
mov %rsp,%r11
lea -280(%rsp,$num,8),%r10 # future alloca(8*(num+2)+256+8)
neg $num # restore $num
and \$-1024,%r10 # minimize TLB usage
mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
.Lmul_body:
# An OS-agnostic version of __chkstk.
#
# Some OSes (Windows) insist on stack being "wired" to
......@@ -130,13 +128,24 @@ $code.=<<___;
# be punishable by SEGV. But page walking can do good even on
# other OSes, because it guarantees that villain thread hits
# the guard page before it can make damage to innocent one...
sub %rsp,%rax
and \$-4096,%rax
sub %r10,%r11
and \$-4096,%r11
lea (%r10,%r11),%rsp
mov (%rsp),%r11
cmp %r10,%rsp
ja .Lmul_page_walk
jmp .Lmul_page_walk_done
.Lmul_page_walk:
mov (%rsp,%rax),%r11
sub \$4096,%rax
.byte 0x2e # predict non-taken
jnc .Lmul_page_walk
lea -4096(%rsp),%rsp
mov (%rsp),%r11
cmp %r10,%rsp
ja .Lmul_page_walk
.Lmul_page_walk_done:
lea .Linc(%rip),%r10
mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
.Lmul_body:
lea 128($bp),%r12 # reassign $bp (+size optimization)
___
......@@ -442,6 +451,8 @@ $code.=<<___;
.type bn_mul4x_mont_gather5,\@function,6
.align 32
bn_mul4x_mont_gather5:
.byte 0x67
mov %rsp,%rax
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
......@@ -450,14 +461,13 @@ $code.=<<___ if ($addx);
je .Lmulx4x_enter
___
$code.=<<___;
.byte 0x67
mov %rsp,%rax
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
.Lmul4x_prologue:
.byte 0x67
shl \$3,${num}d # convert $num to bytes
......@@ -474,32 +484,40 @@ $code.=<<___;
# calculated from 7th argument, the index.]
#
lea -320(%rsp,$num,2),%r11
mov %rsp,%rbp
sub $rp,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lmul4xsp_alt
sub %r11,%rsp # align with $rp
lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
sub %r11,%rbp # align with $rp
lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
jmp .Lmul4xsp_done
.align 32
.Lmul4xsp_alt:
lea 4096-320(,$num,2),%r10
lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
sub %r11,%rsp
sub %r11,%rbp
.Lmul4xsp_done:
and \$-64,%rsp
mov %rax,%r11
sub %rsp,%r11
and \$-64,%rbp
mov %rsp,%r11
sub %rbp,%r11
and \$-4096,%r11
lea (%rbp,%r11),%rsp
mov (%rsp),%r10
cmp %rbp,%rsp
ja .Lmul4x_page_walk
jmp .Lmul4x_page_walk_done
.Lmul4x_page_walk:
mov (%rsp,%r11),%r10
sub \$4096,%r11
.byte 0x2e # predict non-taken
jnc .Lmul4x_page_walk
lea -4096(%rsp),%rsp
mov (%rsp),%r10
cmp %rbp,%rsp
ja .Lmul4x_page_walk
.Lmul4x_page_walk_done:
neg $num
......@@ -1043,6 +1061,7 @@ $code.=<<___;
.type bn_power5,\@function,6
.align 32
bn_power5:
mov %rsp,%rax
___
$code.=<<___ if ($addx);
mov OPENSSL_ia32cap_P+8(%rip),%r11d
......@@ -1051,13 +1070,13 @@ $code.=<<___ if ($addx);
je .Lpowerx5_enter
___
$code.=<<___;
mov %rsp,%rax
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
.Lpower5_prologue:
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10d # 3*$num
......@@ -1072,32 +1091,40 @@ $code.=<<___;
# calculated from 7th argument, the index.]
#
lea -320(%rsp,$num,2),%r11
mov %rsp,%rbp
sub $rptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lpwr_sp_alt
sub %r11,%rsp # align with $aptr
lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
sub %r11,%rbp # align with $aptr
lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
jmp .Lpwr_sp_done
.align 32
.Lpwr_sp_alt:
lea 4096-320(,$num,2),%r10
lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
sub %r11,%rsp
sub %r11,%rbp
.Lpwr_sp_done:
and \$-64,%rsp
mov %rax,%r11
sub %rsp,%r11
and \$-64,%rbp
mov %rsp,%r11
sub %rbp,%r11
and \$-4096,%r11
lea (%rbp,%r11),%rsp
mov (%rsp),%r10
cmp %rbp,%rsp
ja .Lpwr_page_walk
jmp .Lpwr_page_walk_done
.Lpwr_page_walk:
mov (%rsp,%r11),%r10
sub \$4096,%r11
.byte 0x2e # predict non-taken
jnc .Lpwr_page_walk
lea -4096(%rsp),%rsp
mov (%rsp),%r10
cmp %rbp,%rsp
ja .Lpwr_page_walk
.Lpwr_page_walk_done:
mov $num,%r10
neg $num
......@@ -2037,6 +2064,7 @@ bn_from_mont8x:
push %r13
push %r14
push %r15
.Lfrom_prologue:
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10 # 3*$num in bytes
......@@ -2051,32 +2079,40 @@ bn_from_mont8x:
# last operation, we use the opportunity to cleanse it.
#
lea -320(%rsp,$num,2),%r11
mov %rsp,%rbp
sub $rptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lfrom_sp_alt
sub %r11,%rsp # align with $aptr
lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
sub %r11,%rbp # align with $aptr
lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
jmp .Lfrom_sp_done
.align 32
.Lfrom_sp_alt:
lea 4096-320(,$num,2),%r10
lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
sub %r11,%rsp
sub %r11,%rbp
.Lfrom_sp_done:
and \$-64,%rsp
mov %rax,%r11
sub %rsp,%r11
and \$-64,%rbp
mov %rsp,%r11
sub %rbp,%r11
and \$-4096,%r11
lea (%rbp,%r11),%rsp
mov (%rsp),%r10
cmp %rbp,%rsp
ja .Lfrom_page_walk
jmp .Lfrom_page_walk_done
.Lfrom_page_walk:
mov (%rsp,%r11),%r10
sub \$4096,%r11
.byte 0x2e # predict non-taken
jnc .Lfrom_page_walk
lea -4096(%rsp),%rsp
mov (%rsp),%r10
cmp %rbp,%rsp
ja .Lfrom_page_walk
.Lfrom_page_walk_done:
mov $num,%r10
neg $num
......@@ -2182,14 +2218,15 @@ $code.=<<___;
.type bn_mulx4x_mont_gather5,\@function,6
.align 32
bn_mulx4x_mont_gather5:
.Lmulx4x_enter:
mov %rsp,%rax
.Lmulx4x_enter:
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
.Lmulx4x_prologue:
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10 # 3*$num in bytes
......@@ -2206,31 +2243,39 @@ bn_mulx4x_mont_gather5:
# calculated from 7th argument, the index.]
#
lea -320(%rsp,$num,2),%r11
mov %rsp,%rbp
sub $rp,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lmulx4xsp_alt
sub %r11,%rsp # align with $aptr
lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
sub %r11,%rbp # align with $aptr
lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
jmp .Lmulx4xsp_done
.Lmulx4xsp_alt:
lea 4096-320(,$num,2),%r10
lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
sub %r11,%rsp
sub %r11,%rbp
.Lmulx4xsp_done:
and \$-64,%rsp # ensure alignment
mov %rax,%r11
sub %rsp,%r11
and \$-64,%rbp # ensure alignment
mov %rsp,%r11
sub %rbp,%r11
and \$-4096,%r11
lea (%rbp,%r11),%rsp
mov (%rsp),%r10
cmp %rbp,%rsp
ja .Lmulx4x_page_walk
jmp .Lmulx4x_page_walk_done
.Lmulx4x_page_walk:
mov (%rsp,%r11),%r10
sub \$4096,%r11
.byte 0x2e # predict non-taken
jnc .Lmulx4x_page_walk
lea -4096(%rsp),%rsp
mov (%rsp),%r10
cmp %rbp,%rsp
ja .Lmulx4x_page_walk
.Lmulx4x_page_walk_done:
##############################################################
# Stack layout
......@@ -2638,14 +2683,15 @@ $code.=<<___;
.type bn_powerx5,\@function,6
.align 32
bn_powerx5:
.Lpowerx5_enter:
mov %rsp,%rax
.Lpowerx5_enter:
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
.Lpowerx5_prologue:
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10 # 3*$num in bytes
......@@ -2660,32 +2706,40 @@ bn_powerx5:
# calculated from 7th argument, the index.]
#
lea -320(%rsp,$num,2),%r11
mov %rsp,%rbp
sub $rptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lpwrx_sp_alt
sub %r11,%rsp # align with $aptr
lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
sub %r11,%rbp # align with $aptr
lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
jmp .Lpwrx_sp_done
.align 32
.Lpwrx_sp_alt:
lea 4096-320(,$num,2),%r10
lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
lea -320(%rbp,$num,2),%rbp # alloca(frame+2*$num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
sub %r11,%rsp
sub %r11,%rbp
.Lpwrx_sp_done:
and \$-64,%rsp
mov %rax,%r11
sub %rsp,%r11
and \$-64,%rbp
mov %rsp,%r11
sub %rbp,%r11
and \$-4096,%r11
lea (%rbp,%r11),%rsp
mov (%rsp),%r10
cmp %rbp,%rsp
ja .Lpwrx_page_walk
jmp .Lpwrx_page_walk_done
.Lpwrx_page_walk:
mov (%rsp,%r11),%r10
sub \$4096,%r11
.byte 0x2e # predict non-taken
jnc .Lpwrx_page_walk
lea -4096(%rsp),%rsp
mov (%rsp),%r10
cmp %rbp,%rsp
ja .Lpwrx_page_walk
.Lpwrx_page_walk_done:
mov $num,%r10
neg $num
......@@ -3616,9 +3670,14 @@ mul_handler:
cmp %r10,%rbx # context->Rip<end of prologue label
jb .Lcommon_seh_tail
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jb .Lcommon_pop_regs
mov 152($context),%rax # pull context->Rsp
mov 4(%r11),%r10d # HandlerData[1]
mov 8(%r11),%r10d # HandlerData[2]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
......@@ -3630,11 +3689,11 @@ mul_handler:
mov 192($context),%r10 # pull $num
mov 8(%rax,%r10,8),%rax # pull saved stack pointer
jmp .Lbody_proceed
jmp .Lcommon_pop_regs
.Lbody_40:
mov 40(%rax),%rax # pull saved stack pointer
.Lbody_proceed:
.Lcommon_pop_regs:
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
......@@ -3725,34 +3784,34 @@ $code.=<<___;
.LSEH_info_bn_mul_mont_gather5:
.byte 9,0,0,0
.rva mul_handler
.rva .Lmul_body,.Lmul_epilogue # HandlerData[]
.rva .Lmul_body,.Lmul_body,.Lmul_epilogue # HandlerData[]
.align 8
.LSEH_info_bn_mul4x_mont_gather5:
.byte 9,0,0,0
.rva mul_handler
.rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
.rva .Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
.align 8
.LSEH_info_bn_power5:
.byte 9,0,0,0
.rva mul_handler
.rva .Lpower5_body,.Lpower5_epilogue # HandlerData[]
.rva .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue # HandlerData[]
.align 8
.LSEH_info_bn_from_mont8x:
.byte 9,0,0,0
.rva mul_handler
.rva .Lfrom_body,.Lfrom_epilogue # HandlerData[]
.rva .Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue # HandlerData[]
___
$code.=<<___ if ($addx);
.align 8
.LSEH_info_bn_mulx4x_mont_gather5:
.byte 9,0,0,0
.rva mul_handler
.rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
.rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
.align 8
.LSEH_info_bn_powerx5:
.byte 9,0,0,0
.rva mul_handler
.rva .Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[]
.rva .Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[]
___
$code.=<<___;
.align 8
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册