提交 b9064221 编写于 作者: A Andy Polyakov

x86[_64]cpuid.pl: handle new extensions.

上级 a3e07010
...@@ -47,7 +47,7 @@ OPENSSL_rdtsc: ...@@ -47,7 +47,7 @@ OPENSSL_rdtsc:
.type OPENSSL_ia32_cpuid,\@abi-omnipotent .type OPENSSL_ia32_cpuid,\@abi-omnipotent
.align 16 .align 16
OPENSSL_ia32_cpuid: OPENSSL_ia32_cpuid:
mov %rbx,%r8 mov %rbx,%r8 # save %rbx
xor %eax,%eax xor %eax,%eax
cpuid cpuid
...@@ -79,7 +79,15 @@ OPENSSL_ia32_cpuid: ...@@ -79,7 +79,15 @@ OPENSSL_ia32_cpuid:
# AMD specific # AMD specific
mov \$0x80000000,%eax mov \$0x80000000,%eax
cpuid cpuid
cmp \$0x80000008,%eax cmp \$0x80000001,%eax
jb .Lintel
mov %eax,%r10d
mov \$0x80000001,%eax
cpuid
or %ecx,%r9d
and \$0x00000801,%r9d # isolate AMD XOP bit, 1<<11
cmp \$0x80000008,%r10d
jb .Lintel jb .Lintel
mov \$0x80000008,%eax mov \$0x80000008,%eax
...@@ -90,12 +98,12 @@ OPENSSL_ia32_cpuid: ...@@ -90,12 +98,12 @@ OPENSSL_ia32_cpuid:
mov \$1,%eax mov \$1,%eax
cpuid cpuid
bt \$28,%edx # test hyper-threading bit bt \$28,%edx # test hyper-threading bit
jnc .Ldone jnc .Lgeneric
shr \$16,%ebx # number of logical processors shr \$16,%ebx # number of logical processors
cmp %r10b,%bl cmp %r10b,%bl
ja .Ldone ja .Lgeneric
and \$0xefffffff,%edx # ~(1<<28) and \$0xefffffff,%edx # ~(1<<28)
jmp .Ldone jmp .Lgeneric
.Lintel: .Lintel:
cmp \$4,%r11d cmp \$4,%r11d
...@@ -121,21 +129,38 @@ OPENSSL_ia32_cpuid: ...@@ -121,21 +129,38 @@ OPENSSL_ia32_cpuid:
or \$0x40000000,%edx # use reserved bit to skip unrolled loop or \$0x40000000,%edx # use reserved bit to skip unrolled loop
.Lnotintel: .Lnotintel:
bt \$28,%edx # test hyper-threading bit bt \$28,%edx # test hyper-threading bit
jnc .Ldone jnc .Lgeneric
and \$0xefffffff,%edx # ~(1<<28) and \$0xefffffff,%edx # ~(1<<28)
cmp \$0,%r10d cmp \$0,%r10d
je .Ldone je .Lgeneric
or \$0x10000000,%edx # 1<<28 or \$0x10000000,%edx # 1<<28
shr \$16,%ebx shr \$16,%ebx
cmp \$1,%bl # see if cache is shared cmp \$1,%bl # see if cache is shared
ja .Ldone ja .Lgeneric
and \$0xefffffff,%edx # ~(1<<28) and \$0xefffffff,%edx # ~(1<<28)
.Ldone: .Lgeneric:
and \$0x00000800,%r9d # isolate AMD XOP flag
and \$0xfffff7ff,%ecx
or %r9d,%ecx # merge AMD XOP flag
shl \$32,%rcx shl \$32,%rcx
mov %edx,%eax mov %edx,%ebx
mov %r8,%rbx or %rcx,%rbx # compose capability vector in %rbx
or %rcx,%rax bt \$27+32,%rcx # check OSXSAVE bit
jnc .Lclear_avx
xor %ecx,%ecx # XCR0
.byte 0x0f,0x01,0xd0 # xgetbv
and \$6,%eax # isolate XMM and YMM state support
cmp \$6,%eax
je .Ldone
.Lclear_avx:
mov \$0xefffe7ff,%eax # ~(1<<28|1<<12|1<<11)
shl \$32,%rax
and %rax,%rbx # clear AVX, FMA and AMD XOP bits
.Ldone:
mov %rbx,%rax
mov %r8,%rbx # restore %rbx
ret ret
.size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid .size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
...@@ -250,7 +275,7 @@ OPENSSL_instrument_bus: ...@@ -250,7 +275,7 @@ OPENSSL_instrument_bus:
mov %eax,$lasttick # lasttick = tick mov %eax,$lasttick # lasttick = tick
mov \$0,$lastdiff # lastdiff = 0 mov \$0,$lastdiff # lastdiff = 0
clflush ($out) clflush ($out)
lock .byte 0xf0 # lock
add $lastdiff,($out) add $lastdiff,($out)
jmp .Loop jmp .Loop
.align 16 .align 16
...@@ -260,7 +285,7 @@ OPENSSL_instrument_bus: ...@@ -260,7 +285,7 @@ OPENSSL_instrument_bus:
mov %edx,$lasttick mov %edx,$lasttick
mov %eax,$lastdiff mov %eax,$lastdiff
clflush ($out) clflush ($out)
lock .byte 0xf0 # lock
add %eax,($out) add %eax,($out)
lea 4($out),$out lea 4($out),$out
sub \$1,$cnt sub \$1,$cnt
...@@ -284,7 +309,7 @@ OPENSSL_instrument_bus2: ...@@ -284,7 +309,7 @@ OPENSSL_instrument_bus2:
mov \$0,$lastdiff # lastdiff = 0 mov \$0,$lastdiff # lastdiff = 0
clflush ($out) clflush ($out)
lock .byte 0xf0 # lock
add $lastdiff,($out) add $lastdiff,($out)
rdtsc # collect 1st diff rdtsc # collect 1st diff
...@@ -294,7 +319,7 @@ OPENSSL_instrument_bus2: ...@@ -294,7 +319,7 @@ OPENSSL_instrument_bus2:
mov %eax,$lastdiff # lastdiff = diff mov %eax,$lastdiff # lastdiff = diff
.Loop2: .Loop2:
clflush ($out) clflush ($out)
lock .byte 0xf0 # lock
add %eax,($out) # accumulate diff add %eax,($out) # accumulate diff
sub \$1,$max sub \$1,$max
......
...@@ -20,7 +20,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } ...@@ -20,7 +20,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&pop ("eax"); &pop ("eax");
&xor ("ecx","eax"); &xor ("ecx","eax");
&bt ("ecx",21); &bt ("ecx",21);
&jnc (&label("done")); &jnc (&label("generic"));
&xor ("eax","eax"); &xor ("eax","eax");
&cpuid (); &cpuid ();
&mov ("edi","eax"); # max value for standard query level &mov ("edi","eax"); # max value for standard query level
...@@ -51,7 +51,14 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } ...@@ -51,7 +51,14 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
# AMD specific # AMD specific
&mov ("eax",0x80000000); &mov ("eax",0x80000000);
&cpuid (); &cpuid ();
&cmp ("eax",0x80000008); &cmp ("eax",0x80000001);
&jb (&label("intel"));
&mov ("esi","eax");
&mov ("eax",0x80000001);
&cpuid ();
&or ("ebp","ecx");
&and ("ebp",1<<11|1); # isolate XOP bit
&cmp ("esi",0x80000008);
&jb (&label("intel")); &jb (&label("intel"));
&mov ("eax",0x80000008); &mov ("eax",0x80000008);
...@@ -62,13 +69,13 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } ...@@ -62,13 +69,13 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&mov ("eax",1); &mov ("eax",1);
&cpuid (); &cpuid ();
&bt ("edx",28); &bt ("edx",28);
&jnc (&label("done")); &jnc (&label("generic"));
&shr ("ebx",16); &shr ("ebx",16);
&and ("ebx",0xff); &and ("ebx",0xff);
&cmp ("ebx","esi"); &cmp ("ebx","esi");
&ja (&label("done")); &ja (&label("generic"));
&and ("edx",0xefffffff); # clear hyper-threading bit &and ("edx",0xefffffff); # clear hyper-threading bit
&jmp (&label("done")); &jmp (&label("generic"));
&set_label("intel"); &set_label("intel");
&cmp ("edi",4); &cmp ("edi",4);
...@@ -93,19 +100,42 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } ...@@ -93,19 +100,42 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&or ("edx",1<<20); # use reserved bit to engage RC4_CHAR &or ("edx",1<<20); # use reserved bit to engage RC4_CHAR
&set_label("notP4"); &set_label("notP4");
&bt ("edx",28); # test hyper-threading bit &bt ("edx",28); # test hyper-threading bit
&jnc (&label("done")); &jnc (&label("generic"));
&and ("edx",0xefffffff); &and ("edx",0xefffffff);
&cmp ("edi",0); &cmp ("edi",0);
&je (&label("done")); &je (&label("generic"));
&or ("edx",0x10000000); &or ("edx",0x10000000);
&shr ("ebx",16); &shr ("ebx",16);
&cmp (&LB("ebx"),1); &cmp (&LB("ebx"),1);
&ja (&label("done")); &ja (&label("generic"));
&and ("edx",0xefffffff); # clear hyper-threading bit if not &and ("edx",0xefffffff); # clear hyper-threading bit if not
&set_label("generic");
&and ("ebp",1<<11); # isolate AMD XOP flag
&and ("ecx",~(1<<11));
&mov ("esi","edx");
&or ("ebp","ecx"); # merge AMD XOP flag
&bt ("ecx",26); # check XSAVE bit
&jnc (&label("done"));
&bt ("ecx",27); # check OSXSAVE bit
&jnc (&label("clear_xmm"));
&xor ("ecx","ecx");
&data_byte(0x0f,0x01,0xd0); # xgetbv
&and ("eax",6);
&cmp ("eax",6);
&je (&label("done"));
&cmp ("eax",2);
&je (&label("clear_avx"));
&set_label("clear_xmm");
&and ("ebp",~(1<<25|1<<1)); # clear AESNI and PCLMULQDQ bits
&and ("esi",~(1<<24)); # clear FXSR
&set_label("clear_avx");
&and ("ebp",~(1<<28|1<<12|1<<11));# clear AVX, FMA and AMD XOP bits
&set_label("done"); &set_label("done");
&mov ("eax","edx"); &mov ("eax","esi");
&mov ("edx","ecx"); &mov ("edx","ebp");
&function_end("OPENSSL_ia32_cpuid"); &function_end("OPENSSL_ia32_cpuid");
&external_label("OPENSSL_ia32cap_P"); &external_label("OPENSSL_ia32cap_P");
...@@ -199,8 +229,9 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } ...@@ -199,8 +229,9 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&bt (&DWP(0,"ecx"),1); &bt (&DWP(0,"ecx"),1);
&jnc (&label("no_x87")); &jnc (&label("no_x87"));
if ($sse2) { if ($sse2) {
&bt (&DWP(0,"ecx"),26); &and ("ecx",1<<26|1<<24); # check SSE2 and FXSR bits
&jnc (&label("no_sse2")); &cmp ("ecx",1<<26|1<<24);
&jne (&label("no_sse2"));
&pxor ("xmm0","xmm0"); &pxor ("xmm0","xmm0");
&pxor ("xmm1","xmm1"); &pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2"); &pxor ("xmm2","xmm2");
...@@ -331,7 +362,7 @@ my $max = "ebp"; ...@@ -331,7 +362,7 @@ my $max = "ebp";
&mov ($lasttick,"eax"); # lasttick = tick &mov ($lasttick,"eax"); # lasttick = tick
&mov ($lastdiff,0); # lastdiff = 0 &mov ($lastdiff,0); # lastdiff = 0
&clflush(&DWP(0,$out)); &clflush(&DWP(0,$out));
&lock (); &data_byte(0xf0); # lock
&add (&DWP(0,$out),$lastdiff); &add (&DWP(0,$out),$lastdiff);
&jmp (&label("loop")); &jmp (&label("loop"));
...@@ -342,7 +373,7 @@ my $max = "ebp"; ...@@ -342,7 +373,7 @@ my $max = "ebp";
&mov ($lasttick,"edx"); # lasttick = tick &mov ($lasttick,"edx"); # lasttick = tick
&mov ($lastdiff,"eax"); # lastdiff = diff &mov ($lastdiff,"eax"); # lastdiff = diff
&clflush(&DWP(0,$out)); &clflush(&DWP(0,$out));
&lock (); &data_byte(0xf0); # lock
&add (&DWP(0,$out),"eax"); # accumulate diff &add (&DWP(0,$out),"eax"); # accumulate diff
&lea ($out,&DWP(4,$out)); # ++$out &lea ($out,&DWP(4,$out)); # ++$out
&sub ($cnt,1); # --$cnt &sub ($cnt,1); # --$cnt
...@@ -371,7 +402,7 @@ my $max = "ebp"; ...@@ -371,7 +402,7 @@ my $max = "ebp";
&mov ($lastdiff,0); # lastdiff = 0 &mov ($lastdiff,0); # lastdiff = 0
&clflush(&DWP(0,$out)); &clflush(&DWP(0,$out));
&lock (); &data_byte(0xf0); # lock
&add (&DWP(0,$out),$lastdiff); &add (&DWP(0,$out),$lastdiff);
&rdtsc (); # collect 1st diff &rdtsc (); # collect 1st diff
...@@ -383,7 +414,7 @@ my $max = "ebp"; ...@@ -383,7 +414,7 @@ my $max = "ebp";
&set_label("loop2",16); &set_label("loop2",16);
&clflush(&DWP(0,$out)); &clflush(&DWP(0,$out));
&lock (); &data_byte(0xf0); # lock
&add (&DWP(0,$out),"eax"); # accumulate diff &add (&DWP(0,$out),"eax"); # accumulate diff
&sub ($max,1); &sub ($max,1);
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
=head1 NAME =head1 NAME
OPENSSL_ia32cap - finding the IA-32 processor capabilities OPENSSL_ia32cap - the IA-32 processor capabilities vector
=head1 SYNOPSIS =head1 SYNOPSIS
...@@ -18,30 +18,52 @@ input value (see Intel Application Note #241618). Naturally it's ...@@ -18,30 +18,52 @@ input value (see Intel Application Note #241618). Naturally it's
meaningful on x86 and x86_64 platforms only. The variable is normally meaningful on x86 and x86_64 platforms only. The variable is normally
set up automatically upon toolkit initialization, but can be set up automatically upon toolkit initialization, but can be
manipulated afterwards to modify crypto library behaviour. For the manipulated afterwards to modify crypto library behaviour. For the
moment of this writing seven bits are significant, namely: moment of this writing the following bits are significant:
1. bit #4 denoting presence of Time-Stamp Counter. =item bit #4 denoting presence of Time-Stamp Counter.
2. bit #20, reserved by Intel, is used to choose among RC4 code
paths; =item bit #19 denoting availability of CLFLUSH instruction;
3. bit #23 denoting MMX support;
4. bit #25 denoting SSE support; =item bit #20, reserved by Intel, is used to choose among RC4 code paths;
5. bit #26 denoting SSE2 support;
6. bit #28 denoting Hyperthreading, which is used to distinguish
=item bit #24, FXSR bit, denoting availability of XMM registers;
=item bit #25 denoting SSE support;
=item bit #26 denoting SSE2 support;
=item bit #28 denoting Hyperthreading, which is used to distinguish
cores with shared cache; cores with shared cache;
7. bit #30, reserved by Intel, is used to choose among RC4 code
=item bit #30, reserved by Intel, is used to choose among RC4 code
paths; paths;
8. bit #57 denoting Intel AES instruction set extension;
=item bit #33 denoting availability of PCLMULQDQ instruction;
=item bit #41 denoting SSSE3, Supplemental SSE3, support;
=item bit #43 denoting AMD XOP support (forced to zero on Intel);
=item bit #57 denoting AES-NI instruction set extension;
=item bit #59, OSXSAVE bit, denoting availability of YMM registers;
=item bit #60 denoting AVX extension;
For example, clearing bit #26 at run-time disables high-performance For example, clearing bit #26 at run-time disables high-performance
SSE2 code present in the crypto library. You might have to do this if SSE2 code present in the crypto library, while clearing bit #24
target OpenSSL application is executed on SSE2 capable CPU, but under disables SSE2 code operating on 128-bit XMM register bank. You might
control of OS which does not support SSE2 extensions. Even though you have to do the latter if target OpenSSL application is executed on SSE2
can manipulate the value programmatically, you most likely will find it capable CPU, but under control of OS that does not enable XMM
more appropriate to set up an environment variable with the same name registers. Even though you can manipulate the value programmatically,
prior to starting target application, e.g. on Intel P4 processor 'env you most likely will find it more appropriate to set up an environment
OPENSSL_ia32cap=0x12900010 apps/openssl', to achieve same effect variable with the same name prior to starting target application, e.g. on
without modifying the application source code. Alternatively you can Intel P4 processor 'env OPENSSL_ia32cap=0x16980010 apps/openssl', to
reconfigure the toolkit with no-sse2 option and recompile. achieve same effect without modifying the application source code.
Alternatively you can reconfigure the toolkit with no-sse2 option and
recompile.
Less intuitive is clearing bit #28. The truth is that it's not copied Less intuitive is clearing bit #28. The truth is that it's not copied
from CPUID output verbatim, but is adjusted to reflect whether or not from CPUID output verbatim, but is adjusted to reflect whether or not
...@@ -49,4 +71,3 @@ the data cache is actually shared between logical cores. This in turn ...@@ -49,4 +71,3 @@ the data cache is actually shared between logical cores. This in turn
affects the decision on whether or not expensive countermeasures affects the decision on whether or not expensive countermeasures
against cache-timing attacks are applied, most notably in AES assembler against cache-timing attacks are applied, most notably in AES assembler
module. module.
=cut
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册