diff --git a/crypto/x86_64cpuid.pl b/crypto/x86_64cpuid.pl index ecfcfc763c7ac68febbc2a4a419ffcb13514feb6..ba95f0b2298d4f3959d7e5c7e9f8abbe6a43bb3f 100644 --- a/crypto/x86_64cpuid.pl +++ b/crypto/x86_64cpuid.pl @@ -47,7 +47,7 @@ OPENSSL_rdtsc: .type OPENSSL_ia32_cpuid,\@abi-omnipotent .align 16 OPENSSL_ia32_cpuid: - mov %rbx,%r8 + mov %rbx,%r8 # save %rbx xor %eax,%eax cpuid @@ -79,7 +79,15 @@ OPENSSL_ia32_cpuid: # AMD specific mov \$0x80000000,%eax cpuid - cmp \$0x80000008,%eax + cmp \$0x80000001,%eax + jb .Lintel + mov %eax,%r10d + mov \$0x80000001,%eax + cpuid + or %ecx,%r9d + and \$0x00000801,%r9d # isolate AMD XOP bit, 1<<11 + + cmp \$0x80000008,%r10d jb .Lintel mov \$0x80000008,%eax @@ -90,12 +98,12 @@ OPENSSL_ia32_cpuid: mov \$1,%eax cpuid bt \$28,%edx # test hyper-threading bit - jnc .Ldone + jnc .Lgeneric shr \$16,%ebx # number of logical processors cmp %r10b,%bl - ja .Ldone + ja .Lgeneric and \$0xefffffff,%edx # ~(1<<28) - jmp .Ldone + jmp .Lgeneric .Lintel: cmp \$4,%r11d @@ -121,21 +129,38 @@ OPENSSL_ia32_cpuid: or \$0x40000000,%edx # use reserved bit to skip unrolled loop .Lnotintel: bt \$28,%edx # test hyper-threading bit - jnc .Ldone + jnc .Lgeneric and \$0xefffffff,%edx # ~(1<<28) cmp \$0,%r10d - je .Ldone + je .Lgeneric or \$0x10000000,%edx # 1<<28 shr \$16,%ebx cmp \$1,%bl # see if cache is shared - ja .Ldone + ja .Lgeneric and \$0xefffffff,%edx # ~(1<<28) -.Ldone: +.Lgeneric: + and \$0x00000800,%r9d # isolate AMD XOP flag + and \$0xfffff7ff,%ecx + or %r9d,%ecx # merge AMD XOP flag + shl \$32,%rcx - mov %edx,%eax - mov %r8,%rbx - or %rcx,%rax + mov %edx,%ebx + or %rcx,%rbx # compose capability vector in %rbx + bt \$27+32,%rcx # check OSXSAVE bit + jnc .Lclear_avx + xor %ecx,%ecx # XCR0 + .byte 0x0f,0x01,0xd0 # xgetbv + and \$6,%eax # isolate XMM and YMM state support + cmp \$6,%eax + je .Ldone +.Lclear_avx: + mov \$0xefffe7ff,%eax # ~(1<<28|1<<12|1<<11) + shl \$32,%rax + and %rax,%rbx # clear AVX, FMA and AMD XOP bits +.Ldone: + mov %rbx,%rax + mov 
%r8,%rbx # restore %rbx ret .size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid @@ -250,7 +275,7 @@ OPENSSL_instrument_bus: mov %eax,$lasttick # lasttick = tick mov \$0,$lastdiff # lastdiff = 0 clflush ($out) - lock + .byte 0xf0 # lock add $lastdiff,($out) jmp .Loop .align 16 @@ -260,7 +285,7 @@ OPENSSL_instrument_bus: mov %edx,$lasttick mov %eax,$lastdiff clflush ($out) - lock + .byte 0xf0 # lock add %eax,($out) lea 4($out),$out sub \$1,$cnt @@ -284,7 +309,7 @@ OPENSSL_instrument_bus2: mov \$0,$lastdiff # lastdiff = 0 clflush ($out) - lock + .byte 0xf0 # lock add $lastdiff,($out) rdtsc # collect 1st diff @@ -294,7 +319,7 @@ OPENSSL_instrument_bus2: mov %eax,$lastdiff # lastdiff = diff .Loop2: clflush ($out) - lock + .byte 0xf0 # lock add %eax,($out) # accumulate diff sub \$1,$max diff --git a/crypto/x86cpuid.pl b/crypto/x86cpuid.pl index 0513398739f0dc04a2f8e2c2fa514bee31fe62ee..f424c2debeed80da5038f4ec0740e918322a9a8a 100644 --- a/crypto/x86cpuid.pl +++ b/crypto/x86cpuid.pl @@ -20,7 +20,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } &pop ("eax"); &xor ("ecx","eax"); &bt ("ecx",21); - &jnc (&label("done")); + &jnc (&label("generic")); &xor ("eax","eax"); &cpuid (); &mov ("edi","eax"); # max value for standard query level @@ -51,7 +51,14 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } # AMD specific &mov ("eax",0x80000000); &cpuid (); - &cmp ("eax",0x80000008); + &cmp ("eax",0x80000001); + &jb (&label("intel")); + &mov ("esi","eax"); + &mov ("eax",0x80000001); + &cpuid (); + &or ("ebp","ecx"); + &and ("ebp",1<<11|1); # isolate XOP bit + &cmp ("esi",0x80000008); &jb (&label("intel")); &mov ("eax",0x80000008); @@ -62,13 +69,13 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } &mov ("eax",1); &cpuid (); &bt ("edx",28); - &jnc (&label("done")); + &jnc (&label("generic")); &shr ("ebx",16); &and ("ebx",0xff); &cmp ("ebx","esi"); - &ja (&label("done")); + &ja (&label("generic")); &and ("edx",0xefffffff); # clear hyper-threading bit - &jmp (&label("done")); 
+ &jmp (&label("generic")); &set_label("intel"); &cmp ("edi",4); @@ -93,19 +100,42 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } &or ("edx",1<<20); # use reserved bit to engage RC4_CHAR &set_label("notP4"); &bt ("edx",28); # test hyper-threading bit - &jnc (&label("done")); + &jnc (&label("generic")); &and ("edx",0xefffffff); &cmp ("edi",0); - &je (&label("done")); + &je (&label("generic")); &or ("edx",0x10000000); &shr ("ebx",16); &cmp (&LB("ebx"),1); - &ja (&label("done")); + &ja (&label("generic")); &and ("edx",0xefffffff); # clear hyper-threading bit if not + +&set_label("generic"); + &and ("ebp",1<<11); # isolate AMD XOP flag + &and ("ecx",~(1<<11)); + &mov ("esi","edx"); + &or ("ebp","ecx"); # merge AMD XOP flag + + &bt ("ecx",26); # check XSAVE bit + &jnc (&label("done")); + &bt ("ecx",27); # check OSXSAVE bit + &jnc (&label("clear_xmm")); + &xor ("ecx","ecx"); + &data_byte(0x0f,0x01,0xd0); # xgetbv + &and ("eax",6); + &cmp ("eax",6); + &je (&label("done")); + &cmp ("eax",2); + &je (&label("clear_avx")); +&set_label("clear_xmm"); + &and ("ebp",~(1<<25|1<<1)); # clear AESNI and PCLMULQDQ bits + &and ("esi",~(1<<24)); # clear FXSR +&set_label("clear_avx"); + &and ("ebp",~(1<<28|1<<12|1<<11));# clear AVX, FMA and AMD XOP bits &set_label("done"); - &mov ("eax","edx"); - &mov ("edx","ecx"); + &mov ("eax","esi"); + &mov ("edx","ebp"); &function_end("OPENSSL_ia32_cpuid"); &external_label("OPENSSL_ia32cap_P"); @@ -199,8 +229,9 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } &bt (&DWP(0,"ecx"),1); &jnc (&label("no_x87")); if ($sse2) { - &bt (&DWP(0,"ecx"),26); - &jnc (&label("no_sse2")); + &and ("ecx",1<<26|1<<24); # check SSE2 and FXSR bits + &cmp ("ecx",1<<26|1<<24); + &jne (&label("no_sse2")); &pxor ("xmm0","xmm0"); &pxor ("xmm1","xmm1"); &pxor ("xmm2","xmm2"); @@ -331,7 +362,7 @@ my $max = "ebp"; &mov ($lasttick,"eax"); # lasttick = tick &mov ($lastdiff,0); # lastdiff = 0 &clflush(&DWP(0,$out)); - &lock (); + &data_byte(0xf0); # lock &add 
(&DWP(0,$out),$lastdiff); &jmp (&label("loop")); @@ -342,7 +373,7 @@ my $max = "ebp"; &mov ($lasttick,"edx"); # lasttick = tick &mov ($lastdiff,"eax"); # lastdiff = diff &clflush(&DWP(0,$out)); - &lock (); + &data_byte(0xf0); # lock &add (&DWP(0,$out),"eax"); # accumulate diff &lea ($out,&DWP(4,$out)); # ++$out &sub ($cnt,1); # --$cnt @@ -371,7 +402,7 @@ my $max = "ebp"; &mov ($lastdiff,0); # lastdiff = 0 &clflush(&DWP(0,$out)); - &lock (); + &data_byte(0xf0); # lock &add (&DWP(0,$out),$lastdiff); &rdtsc (); # collect 1st diff @@ -383,7 +414,7 @@ my $max = "ebp"; &set_label("loop2",16); &clflush(&DWP(0,$out)); - &lock (); + &data_byte(0xf0); # lock &add (&DWP(0,$out),"eax"); # accumulate diff &sub ($max,1); diff --git a/doc/crypto/OPENSSL_ia32cap.pod b/doc/crypto/OPENSSL_ia32cap.pod index dca2e20aced6af9b526b7eb1eefb7abd9af84251..af6b4f3a4d1d87fa4a16ef76c84c4cd8abb0b8e9 100644 --- a/doc/crypto/OPENSSL_ia32cap.pod +++ b/doc/crypto/OPENSSL_ia32cap.pod @@ -2,7 +2,7 @@ =head1 NAME -OPENSSL_ia32cap - finding the IA-32 processor capabilities +OPENSSL_ia32cap - the IA-32 processor capabilities vector =head1 SYNOPSIS @@ -18,30 +18,52 @@ input value (see Intel Application Note #241618). Naturally it's meaningful on x86 and x86_64 platforms only. The variable is normally set up automatically upon toolkit initialization, but can be manipulated afterwards to modify crypto library behaviour. For the -moment of this writing seven bits are significant, namely: - -1. bit #4 denoting presence of Time-Stamp Counter. -2. bit #20, reserved by Intel, is used to choose among RC4 code - paths; -3. bit #23 denoting MMX support; -4. bit #25 denoting SSE support; -5. bit #26 denoting SSE2 support; -6. bit #28 denoting Hyperthreading, which is used to distiguish - cores with shared cache; -7. bit #30, reserved by Intel, is used to choose among RC4 code - paths; -8. 
bit #57 denoting Intel AES instruction set extension; +moment of this writing the following bits are significant: + +=item bit #4 denoting presence of Time-Stamp Counter. + +=item bit #19 denoting availability of CLFLUSH instruction; + +=item bit #20, reserved by Intel, is used to choose among RC4 code paths; + +=item bit #23 denoting MMX support; + +=item bit #24, FXSR bit, denoting availability of XMM registers; + +=item bit #25 denoting SSE support; + +=item bit #26 denoting SSE2 support; + +=item bit #28 denoting Hyperthreading, which is used to distinguish + cores with shared cache; + +=item bit #30, reserved by Intel, is used to choose among RC4 code + paths; + +=item bit #33 denoting availability of PCLMULQDQ instruction; + +=item bit #41 denoting SSSE3, Supplemental SSE3, support; + +=item bit #43 denoting AMD XOP support (forced to zero on Intel); + +=item bit #57 denoting AES-NI instruction set extension; + +=item bit #59, OSXSAVE bit, denoting availability of YMM registers; + +=item bit #60 denoting AVX extension; For example, clearing bit #26 at run-time disables high-performance -SSE2 code present in the crypto library. You might have to do this if -target OpenSSL application is executed on SSE2 capable CPU, but under -control of OS which does not support SSE2 extentions. Even though you -can manipulate the value programmatically, you most likely will find it -more appropriate to set up an environment variable with the same name -prior starting target application, e.g. on Intel P4 processor 'env -OPENSSL_ia32cap=0x12900010 apps/openssl', to achieve same effect -without modifying the application source code. Alternatively you can -reconfigure the toolkit with no-sse2 option and recompile. +SSE2 code present in the crypto library, while clearing bit #24 +disables SSE2 code operating on 128-bit XMM register bank. 
You might +have to do the latter if target OpenSSL application is executed on SSE2 +capable CPU, but under control of OS that does not enable XMM +registers. Even though you can manipulate the value programmatically, +you most likely will find it more appropriate to set up an environment +variable with the same name prior to starting the target application, e.g. on +Intel P4 processor 'env OPENSSL_ia32cap=0x16980010 apps/openssl', to +achieve the same effect without modifying the application source code. +Alternatively you can reconfigure the toolkit with no-sse2 option and +recompile. Less intuituve is clearing bit #28. The truth is that it's not copied from CPUID output verbatim, but is adjusted to reflect whether or not @@ -49,4 +71,3 @@ the data cache is actually shared between logical cores. This in turn affects the decision on whether or not expensive countermeasures against cache-timing attacks are applied, most notably in AES assembler module. -=cut