diff --git a/src/cpu/x86/vm/assembler_x86.cpp b/src/cpu/x86/vm/assembler_x86.cpp index 6a8fff0cc9d0576cfa65381d1c18b60bf3483e7a..ec208540ad012c553b437fa98d0c0c9dbc408e88 100644 --- a/src/cpu/x86/vm/assembler_x86.cpp +++ b/src/cpu/x86/vm/assembler_x86.cpp @@ -1575,6 +1575,35 @@ void Assembler::movdqa(Address dst, XMMRegister src) { emit_operand(src, dst); } +void Assembler::movdqu(XMMRegister dst, Address src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + InstructionMark im(this); + emit_byte(0xF3); + prefix(src, dst); + emit_byte(0x0F); + emit_byte(0x6F); + emit_operand(dst, src); +} + +void Assembler::movdqu(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + emit_byte(0xF3); + int encode = prefixq_and_encode(dst->encoding(), src->encoding()); + emit_byte(0x0F); + emit_byte(0x6F); + emit_byte(0xC0 | encode); +} + +void Assembler::movdqu(Address dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + InstructionMark im(this); + emit_byte(0xF3); + prefix(dst, src); + emit_byte(0x0F); + emit_byte(0x7F); + emit_operand(src, dst); +} + // Uses zero extension on 64bit void Assembler::movl(Register dst, int32_t imm32) { diff --git a/src/cpu/x86/vm/assembler_x86.hpp b/src/cpu/x86/vm/assembler_x86.hpp index aea3f108f32e65b4b0cc76c143a084a4bcc1606d..c2b64771803178b502767b5209dc292a24e25aa7 100644 --- a/src/cpu/x86/vm/assembler_x86.hpp +++ b/src/cpu/x86/vm/assembler_x86.hpp @@ -1055,6 +1055,11 @@ private: void movdqa(XMMRegister dst, Address src); void movdqa(XMMRegister dst, XMMRegister src); + // Move Unaligned Double Quadword + void movdqu(Address dst, XMMRegister src); + void movdqu(XMMRegister dst, Address src); + void movdqu(XMMRegister dst, XMMRegister src); + void movl(Register dst, int32_t imm32); void movl(Address dst, int32_t imm32); void movl(Register dst, Register src); diff --git a/src/cpu/x86/vm/stubGenerator_x86_32.cpp b/src/cpu/x86/vm/stubGenerator_x86_32.cpp index bcbc26fbe3677ce456db1c46780efa830de76b52..06435cb0005cdb9174914b185e25d1513aab9fcb 100644 --- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp +++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp @@ -791,6 +791,69 @@ class StubGenerator: public StubCodeGenerator { } } + + // Copy 64 bytes chunks + // + // Inputs: + // from - source array address + // to_from - destination array address - from + // qword_count - 8-bytes element count, negative + // + void xmm_copy_forward(Register from, Register to_from, Register qword_count) { + assert( UseSSE >= 2, "supported cpu only" ); + Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit; + // Copy 64-byte chunks + __ jmpb(L_copy_64_bytes); + __ align(16); + __ BIND(L_copy_64_bytes_loop); + + if(UseUnalignedLoadStores) { + __ movdqu(xmm0, Address(from, 0)); + __ movdqu(Address(from, to_from, Address::times_1, 0), xmm0); + __ movdqu(xmm1, Address(from, 16)); + __ movdqu(Address(from, to_from, Address::times_1, 16), xmm1); + __ movdqu(xmm2, Address(from, 32)); + __ movdqu(Address(from, to_from, Address::times_1, 32), xmm2); + __ movdqu(xmm3, Address(from, 48)); + __ movdqu(Address(from, to_from, Address::times_1, 48), xmm3); + + } else { + __ movq(xmm0, Address(from, 0)); + __ movq(Address(from, to_from, Address::times_1, 0), xmm0); + __ movq(xmm1, Address(from, 8)); + __ movq(Address(from, to_from, Address::times_1, 8), xmm1); + __ movq(xmm2, Address(from, 16)); + __ movq(Address(from, to_from, Address::times_1, 16), xmm2); + __ movq(xmm3, Address(from, 24)); + __ movq(Address(from, to_from, Address::times_1, 24), xmm3); + __ movq(xmm4, Address(from, 32)); + __ movq(Address(from, to_from, Address::times_1, 32), xmm4); + __ movq(xmm5, Address(from, 40)); + __ movq(Address(from, to_from, Address::times_1, 40), xmm5); + __ movq(xmm6, Address(from, 48)); + __ movq(Address(from, to_from, Address::times_1, 48), xmm6); + __ movq(xmm7, Address(from, 56)); + __ movq(Address(from, to_from, Address::times_1, 56), xmm7); + } + + __ addl(from, 64); + __ BIND(L_copy_64_bytes); + __ subl(qword_count, 8); + __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop); + __ addl(qword_count, 8); + __ jccb(Assembler::zero, L_exit); + // + // length is too short, just copy qwords + // + __ BIND(L_copy_8_bytes); + __ movq(xmm0, Address(from, 0)); + __ movq(Address(from, to_from, Address::times_1), xmm0); + __ addl(from, 8); + __ decrement(qword_count); + __ jcc(Assembler::greater, L_copy_8_bytes); + __ BIND(L_exit); + } + // Copy 64 bytes chunks // // Inputs: @@ -799,6 +862,7 @@ class StubGenerator: public StubCodeGenerator { // qword_count - 8-bytes element count, negative // void mmx_copy_forward(Register from, Register to_from, Register qword_count) { + assert( VM_Version::supports_mmx(), "supported cpu only" ); Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit; // Copy 64-byte chunks __ jmpb(L_copy_64_bytes); @@ -876,7 +940,7 @@ class StubGenerator: public StubCodeGenerator { __ subptr(to, from); // to --> to_from __ cmpl(count, 2< to_from if (VM_Version::supports_mmx()) { - mmx_copy_forward(from, to_from, count); + if (UseXMMForArrayCopy) { + xmm_copy_forward(from, to_from, count); + } else { + mmx_copy_forward(from, to_from, count); + } } else { __ jmpb(L_copy_8_bytes); __ align(16); @@ -1196,8 +1277,13 @@ class StubGenerator: public StubCodeGenerator { __ align(16); __ BIND(L_copy_8_bytes_loop); if (VM_Version::supports_mmx()) { - __ movq(mmx0, Address(from, count, Address::times_8)); - __ movq(Address(to, count, Address::times_8), mmx0); + if (UseXMMForArrayCopy) { + __ movq(xmm0, Address(from, count, Address::times_8)); + __ movq(Address(to, count, Address::times_8), xmm0); + } else { + __ movq(mmx0, Address(from, count, Address::times_8)); + __ movq(Address(to, count, Address::times_8), mmx0); + } } else { __ fild_d(Address(from, count, Address::times_8)); __ fistp_d(Address(to, count, Address::times_8)); @@ -1206,7 +1292,7 @@ class StubGenerator: public StubCodeGenerator { __ decrement(count); __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop); - if (VM_Version::supports_mmx()) { + if (VM_Version::supports_mmx() && !UseXMMForArrayCopy) { __ emms(); } inc_copy_counter_np(T_LONG); diff --git a/src/cpu/x86/vm/stubGenerator_x86_64.cpp b/src/cpu/x86/vm/stubGenerator_x86_64.cpp index 680167158cc6b3f8fc50387284b8a304e887a3f3..33f6e88ee6d87b0271b0d389d1ef3f0da29fe95a 100644 --- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp +++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp @@ -1251,6 +1251,7 @@ class StubGenerator: public StubCodeGenerator { } } + // Copy big chunks forward // // Inputs: @@ -1268,14 +1269,22 @@ class StubGenerator: public StubCodeGenerator { Label L_loop; __ align(16); __ BIND(L_loop); - __ movq(to, Address(end_from, qword_count, Address::times_8, -24)); - __ movq(Address(end_to, qword_count, Address::times_8, -24), to); - __ movq(to, Address(end_from, qword_count, Address::times_8, -16)); - __ movq(Address(end_to, qword_count, Address::times_8, -16), to); - __ movq(to, Address(end_from, qword_count, Address::times_8, - 8)); - __ movq(Address(end_to, qword_count, Address::times_8, - 8), to); - __ movq(to, Address(end_from, qword_count, Address::times_8, - 0)); - __ movq(Address(end_to, qword_count, Address::times_8, - 0), to); + if(UseUnalignedLoadStores) { + __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24)); + __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0); + __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8)); + __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1); + + } else { + __ movq(to, Address(end_from, qword_count, Address::times_8, -24)); + __ movq(Address(end_to, qword_count, Address::times_8, -24), to); + __ movq(to, Address(end_from, qword_count, Address::times_8, -16)); + __ movq(Address(end_to, qword_count, Address::times_8, -16), to); + __ movq(to, Address(end_from, qword_count, Address::times_8, - 8)); + __ movq(Address(end_to, qword_count, Address::times_8, - 8), to); + __ movq(to, Address(end_from, qword_count, Address::times_8, - 0)); + __ movq(Address(end_to, qword_count, Address::times_8, - 0), to); + } __ BIND(L_copy_32_bytes); __ addptr(qword_count, 4); __ jcc(Assembler::lessEqual, L_loop); @@ -1301,14 +1310,22 @@ class StubGenerator: public StubCodeGenerator { Label L_loop; __ align(16); __ BIND(L_loop); - __ movq(to, Address(from, qword_count, Address::times_8, 24)); - __ movq(Address(dest, qword_count, Address::times_8, 24), to); - __ movq(to, Address(from, qword_count, Address::times_8, 16)); - __ movq(Address(dest, qword_count, Address::times_8, 16), to); - __ movq(to, Address(from, qword_count, Address::times_8, 8)); - __ movq(Address(dest, qword_count, Address::times_8, 8), to); - __ movq(to, Address(from, qword_count, Address::times_8, 0)); - __ movq(Address(dest, qword_count, Address::times_8, 0), to); + if(UseUnalignedLoadStores) { + __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16)); + __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0); + __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 0)); + __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm1); + + } else { + __ movq(to, Address(from, qword_count, Address::times_8, 24)); + __ movq(Address(dest, qword_count, Address::times_8, 24), to); + __ movq(to, Address(from, qword_count, Address::times_8, 16)); + __ movq(Address(dest, qword_count, Address::times_8, 16), to); + __ movq(to, Address(from, qword_count, Address::times_8, 8)); + __ movq(Address(dest, qword_count, Address::times_8, 8), to); + __ movq(to, Address(from, qword_count, Address::times_8, 0)); + __ movq(Address(dest, qword_count, Address::times_8, 0), to); + } __ BIND(L_copy_32_bytes); __ subptr(qword_count, 4); __ jcc(Assembler::greaterEqual, L_loop); diff --git a/src/cpu/x86/vm/vm_version_x86_32.cpp b/src/cpu/x86/vm/vm_version_x86_32.cpp index 839a4cdaedaeb958b1b7f30ca48c824b29f74519..edd1da4e31fb0bf7de8ee854413c313e4b9e69ab 100644 --- a/src/cpu/x86/vm/vm_version_x86_32.cpp +++ b/src/cpu/x86/vm/vm_version_x86_32.cpp @@ -242,9 +242,11 @@ void VM_Version::get_processor_features() { _supports_cx8 = supports_cmpxchg8(); // if the OS doesn't support SSE, we can't use this feature even if the HW does if( !os::supports_sse()) - _cpuFeatures &= ~(CPU_SSE|CPU_SSE2|CPU_SSE3|CPU_SSSE3|CPU_SSE4|CPU_SSE4A); - if (UseSSE < 4) - _cpuFeatures &= ~CPU_SSE4; + _cpuFeatures &= ~(CPU_SSE|CPU_SSE2|CPU_SSE3|CPU_SSSE3|CPU_SSE4A|CPU_SSE4_1|CPU_SSE4_2); + if (UseSSE < 4) { + _cpuFeatures &= ~CPU_SSE4_1; + _cpuFeatures &= ~CPU_SSE4_2; + } if (UseSSE < 3) { _cpuFeatures &= ~CPU_SSE3; _cpuFeatures &= ~CPU_SSSE3; @@ -261,7 +263,7 @@ void VM_Version::get_processor_features() { } char buf[256]; - jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s", + jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", cores_per_cpu(), threads_per_core(), cpu_family(), _model, _stepping, (supports_cmov() ? ", cmov" : ""), @@ -272,7 +274,8 @@ void VM_Version::get_processor_features() { (supports_sse2() ? ", sse2" : ""), (supports_sse3() ? ", sse3" : ""), (supports_ssse3()? ", ssse3": ""), - (supports_sse4() ? ", sse4" : ""), + (supports_sse4_1() ? ", sse4.1" : ""), + (supports_sse4_2() ? ", sse4.2" : ""), (supports_mmx_ext() ? ", mmxext" : ""), (supports_3dnow() ? ", 3dnow" : ""), (supports_3dnow2() ? ", 3dnowext" : ""), @@ -285,7 +288,7 @@ void VM_Version::get_processor_features() { // older Pentiums which do not support it. if( UseSSE > 4 ) UseSSE=4; if( UseSSE < 0 ) UseSSE=0; - if( !supports_sse4() ) // Drop to 3 if no SSE4 support + if( !supports_sse4_1() ) // Drop to 3 if no SSE4 support UseSSE = MIN2((intx)3,UseSSE); if( !supports_sse3() ) // Drop to 2 if no SSE3 support UseSSE = MIN2((intx)2,UseSSE); @@ -375,6 +378,14 @@ void VM_Version::get_processor_features() { MaxLoopPad = 11; } #endif // COMPILER2 + if( FLAG_IS_DEFAULT(UseXMMForArrayCopy) ) { + UseXMMForArrayCopy = true; // use SSE2 movq on new Intel cpus + } + if( supports_sse4_2() && supports_ht() ) { // Newest Intel cpus + if( FLAG_IS_DEFAULT(UseUnalignedLoadStores) && UseXMMForArrayCopy ) { + UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus + } + } } } @@ -413,7 +424,7 @@ void VM_Version::get_processor_features() { #ifndef PRODUCT if (PrintMiscellaneous && Verbose) { - tty->print_cr("Logical CPUs per package: %u", + tty->print_cr("Logical CPUs per core: %u", logical_processors_per_package()); tty->print_cr("UseSSE=%d",UseSSE); tty->print("Allocation: "); diff --git a/src/cpu/x86/vm/vm_version_x86_32.hpp b/src/cpu/x86/vm/vm_version_x86_32.hpp index 361bfb2eb11bc18b4f68ce89ea340b6865d46e6f..b37b4f4e516c6a5dfa5b690bce767b68ef2f2f0a 100644 --- a/src/cpu/x86/vm/vm_version_x86_32.hpp +++ b/src/cpu/x86/vm/vm_version_x86_32.hpp @@ -68,9 +68,9 @@ public: cmpxchg16: 1, : 4, dca : 1, - : 4, - popcnt : 1, - : 8; + sse4_1 : 1, + sse4_2 : 1, + : 11; } bits; }; @@ -177,8 +177,9 @@ protected: CPU_SSE2 = (1 << 7), CPU_SSE3 = (1 << 8), // sse3 comes from cpuid 1 (ECX) CPU_SSSE3= (1 << 9), - CPU_SSE4 = (1 <<10), - CPU_SSE4A= (1 <<11) + CPU_SSE4A= (1 <<10), + CPU_SSE4_1 = (1 << 11), + CPU_SSE4_2 = (1 << 12) } cpuFeatureFlags; // cpuid information block. All info derived from executing cpuid with @@ -240,22 +241,14 @@ protected: static CpuidInfo _cpuid_info; // Extractors and predicates - static bool is_extended_cpu_family() { - const uint32_t Extended_Cpu_Family = 0xf; - return _cpuid_info.std_cpuid1_rax.bits.family == Extended_Cpu_Family; - } static uint32_t extended_cpu_family() { uint32_t result = _cpuid_info.std_cpuid1_rax.bits.family; - if (is_extended_cpu_family()) { - result += _cpuid_info.std_cpuid1_rax.bits.ext_family; - } + result += _cpuid_info.std_cpuid1_rax.bits.ext_family; return result; } static uint32_t extended_cpu_model() { uint32_t result = _cpuid_info.std_cpuid1_rax.bits.model; - if (is_extended_cpu_family()) { - result |= _cpuid_info.std_cpuid1_rax.bits.ext_model << 4; - } + result |= _cpuid_info.std_cpuid1_rax.bits.ext_model << 4; return result; } static uint32_t cpu_stepping() { @@ -293,6 +286,10 @@ protected: result |= CPU_SSSE3; if (is_amd() && _cpuid_info.ext_cpuid1_rcx.bits.sse4a != 0) result |= CPU_SSE4A; + if (_cpuid_info.std_cpuid1_rcx.bits.sse4_1 != 0) + result |= CPU_SSE4_1; + if (_cpuid_info.std_cpuid1_rcx.bits.sse4_2 != 0) + result |= CPU_SSE4_2; return result; } @@ -380,7 +377,8 @@ public: static bool supports_sse2() { return (_cpuFeatures & CPU_SSE2) != 0; } static bool supports_sse3() { return (_cpuFeatures & CPU_SSE3) != 0; } static bool supports_ssse3() { return (_cpuFeatures & CPU_SSSE3)!= 0; } - static bool supports_sse4() { return (_cpuFeatures & CPU_SSE4) != 0; } + static bool supports_sse4_1() { return (_cpuFeatures & CPU_SSE4_1) != 0; } + static bool supports_sse4_2() { return (_cpuFeatures & CPU_SSE4_2) != 0; } // // AMD features // diff --git a/src/cpu/x86/vm/vm_version_x86_64.cpp b/src/cpu/x86/vm/vm_version_x86_64.cpp index 709d82e6e684c4d14707195d234716efc2cb1a24..7994aab7a7889396a114ee2bdfefa00d606afe39 100644 --- a/src/cpu/x86/vm/vm_version_x86_64.cpp +++ b/src/cpu/x86/vm/vm_version_x86_64.cpp @@ -186,8 +186,10 @@ void VM_Version::get_processor_features() { if (!VM_Version::supports_sse2()) { vm_exit_during_initialization("Unknown x64 processor: SSE2 not supported"); } - if (UseSSE < 4) - _cpuFeatures &= ~CPU_SSE4; + if (UseSSE < 4) { + _cpuFeatures &= ~CPU_SSE4_1; + _cpuFeatures &= ~CPU_SSE4_2; + } if (UseSSE < 3) { _cpuFeatures &= ~CPU_SSE3; _cpuFeatures &= ~CPU_SSSE3; @@ -204,7 +206,7 @@ void VM_Version::get_processor_features() { } char buf[256]; - jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s", + jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", cores_per_cpu(), threads_per_core(), cpu_family(), _model, _stepping, (supports_cmov() ? ", cmov" : ""), @@ -215,7 +217,8 @@ void VM_Version::get_processor_features() { (supports_sse2() ? ", sse2" : ""), (supports_sse3() ? ", sse3" : ""), (supports_ssse3()? ", ssse3": ""), - (supports_sse4() ? ", sse4" : ""), + (supports_sse4_1() ? ", sse4.1" : ""), + (supports_sse4_2() ? ", sse4.2" : ""), (supports_mmx_ext() ? ", mmxext" : ""), (supports_3dnow() ? ", 3dnow" : ""), (supports_3dnow2() ? ", 3dnowext" : ""), @@ -228,7 +231,7 @@ void VM_Version::get_processor_features() { // older Pentiums which do not support it. if( UseSSE > 4 ) UseSSE=4; if( UseSSE < 0 ) UseSSE=0; - if( !supports_sse4() ) // Drop to 3 if no SSE4 support + if( !supports_sse4_1() ) // Drop to 3 if no SSE4 support UseSSE = MIN2((intx)3,UseSSE); if( !supports_sse3() ) // Drop to 2 if no SSE3 support UseSSE = MIN2((intx)2,UseSSE); @@ -314,6 +317,14 @@ void VM_Version::get_processor_features() { MaxLoopPad = 11; } #endif // COMPILER2 + if( FLAG_IS_DEFAULT(UseXMMForArrayCopy) ) { + UseXMMForArrayCopy = true; // use SSE2 movq on new Intel cpus + } + if( supports_sse4_2() && supports_ht() ) { // Newest Intel cpus + if( FLAG_IS_DEFAULT(UseUnalignedLoadStores) && UseXMMForArrayCopy ) { + UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus + } + } } } @@ -355,7 +366,7 @@ void VM_Version::get_processor_features() { #ifndef PRODUCT if (PrintMiscellaneous && Verbose) { - tty->print_cr("Logical CPUs per package: %u", + tty->print_cr("Logical CPUs per core: %u", logical_processors_per_package()); tty->print_cr("UseSSE=%d",UseSSE); tty->print("Allocation: "); diff --git a/src/cpu/x86/vm/vm_version_x86_64.hpp b/src/cpu/x86/vm/vm_version_x86_64.hpp index dc60b3700739b93fe1e580fa42b124c713f5c866..37ba2d075588252aad1b7819da0bf36545bcdca0 100644 --- a/src/cpu/x86/vm/vm_version_x86_64.hpp +++ b/src/cpu/x86/vm/vm_version_x86_64.hpp @@ -68,9 +68,9 @@ public: cmpxchg16: 1, : 4, dca : 1, - : 4, - popcnt : 1, - : 8; + sse4_1 : 1, + sse4_2 : 1, + : 11; } bits; }; @@ -177,8 +177,9 @@ protected: CPU_SSE2 = (1 << 7), CPU_SSE3 = (1 << 8), CPU_SSSE3= (1 << 9), - CPU_SSE4 = (1 <<10), - CPU_SSE4A= (1 <<11) + CPU_SSE4A= (1 <<10), + CPU_SSE4_1 = (1 << 11), + CPU_SSE4_2 = (1 << 12) } cpuFeatureFlags; // cpuid information block. All info derived from executing cpuid with @@ -240,22 +241,14 @@ protected: static CpuidInfo _cpuid_info; // Extractors and predicates - static bool is_extended_cpu_family() { - const uint32_t Extended_Cpu_Family = 0xf; - return _cpuid_info.std_cpuid1_eax.bits.family == Extended_Cpu_Family; - } static uint32_t extended_cpu_family() { uint32_t result = _cpuid_info.std_cpuid1_eax.bits.family; - if (is_extended_cpu_family()) { - result += _cpuid_info.std_cpuid1_eax.bits.ext_family; - } + result += _cpuid_info.std_cpuid1_eax.bits.ext_family; return result; } static uint32_t extended_cpu_model() { uint32_t result = _cpuid_info.std_cpuid1_eax.bits.model; - if (is_extended_cpu_family()) { - result |= _cpuid_info.std_cpuid1_eax.bits.ext_model << 4; - } + result |= _cpuid_info.std_cpuid1_eax.bits.ext_model << 4; return result; } static uint32_t cpu_stepping() { @@ -293,6 +286,10 @@ protected: result |= CPU_SSSE3; if (is_amd() && _cpuid_info.ext_cpuid1_ecx.bits.sse4a != 0) result |= CPU_SSE4A; + if (_cpuid_info.std_cpuid1_ecx.bits.sse4_1 != 0) + result |= CPU_SSE4_1; + if (_cpuid_info.std_cpuid1_ecx.bits.sse4_2 != 0) + result |= CPU_SSE4_2; return result; } @@ -380,7 +377,8 @@ public: static bool supports_sse2() { return (_cpuFeatures & CPU_SSE2) != 0; } static bool supports_sse3() { return (_cpuFeatures & CPU_SSE3) != 0; } static bool supports_ssse3() { return (_cpuFeatures & CPU_SSSE3)!= 0; } - static bool supports_sse4() { return (_cpuFeatures & CPU_SSE4) != 0; } + static bool supports_sse4_1() { return (_cpuFeatures & CPU_SSE4_1) != 0; } + static bool supports_sse4_2() { return (_cpuFeatures & CPU_SSE4_2) != 0; } // // AMD features // diff --git a/src/cpu/x86/vm/x86_32.ad b/src/cpu/x86/vm/x86_32.ad index a4a4199312d78231d0ca66d8acba697a8ce1e112..589051e06bf2c2d51fa28beb28d1c90ecf6fa133 100644 --- a/src/cpu/x86/vm/x86_32.ad +++ b/src/cpu/x86/vm/x86_32.ad @@ -4810,6 +4810,16 @@ operand immL0() %{ interface(CONST_INTER); %} +// Long Immediate zero +operand immL_M1() %{ + predicate( n->get_long() == -1L ); + match(ConL); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + // Long immediate from 0 to 127. // Used for a shorter form of long mul by 10. operand immL_127() %{ @@ -8621,6 +8631,18 @@ instruct xorI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{ ins_pipe( ialu_reg_reg ); %} +// Xor Register with Immediate -1 +instruct xorI_eReg_im1(eRegI dst, immI_M1 imm) %{ + match(Set dst (XorI dst imm)); + + size(2); + format %{ "NOT $dst" %} + ins_encode %{ + __ notl($dst$$Register); + %} + ins_pipe( ialu_reg ); +%} + // Xor Register with Immediate instruct xorI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{ match(Set dst (XorI dst src)); @@ -8938,6 +8960,18 @@ instruct xorl_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{ ins_pipe( ialu_reg_reg_long ); %} +// Xor Long Register with Immediate -1 +instruct xorl_eReg_im1(eRegL dst, immL_M1 imm) %{ + match(Set dst (XorL dst imm)); + format %{ "NOT $dst.lo\n\t" + "NOT $dst.hi" %} + ins_encode %{ + __ notl($dst$$Register); + __ notl(HIGH_FROM_LOW($dst$$Register)); + %} + ins_pipe( ialu_reg_long ); +%} + // Xor Long Register with Immediate instruct xorl_eReg_imm(eRegL dst, immL src, eFlagsReg cr) %{ match(Set dst (XorL dst src)); diff --git a/src/cpu/x86/vm/x86_64.ad b/src/cpu/x86/vm/x86_64.ad index 62b46da14b0d2f192cb07f51add80ca1216c22f1..4245284593d7737a92ffb32ee917649555ed9c7c 100644 --- a/src/cpu/x86/vm/x86_64.ad +++ b/src/cpu/x86/vm/x86_64.ad @@ -9309,6 +9309,17 @@ instruct xorI_rReg(rRegI dst, rRegI src, rFlagsReg cr) ins_pipe(ialu_reg_reg); %} +// Xor Register with Immediate -1 +instruct xorI_rReg_im1(rRegI dst, immI_M1 imm) %{ + match(Set dst (XorI dst imm)); + + format %{ "not $dst" %} + ins_encode %{ + __ notl($dst$$Register); + %} + ins_pipe(ialu_reg); +%} + // Xor Register with Immediate instruct xorI_rReg_imm(rRegI dst, immI src, rFlagsReg cr) %{ @@ -9529,6 +9540,17 @@ instruct xorL_rReg(rRegL dst, rRegL src, rFlagsReg cr) ins_pipe(ialu_reg_reg); %} +// Xor Register with Immediate -1 +instruct xorL_rReg_im1(rRegL dst, immL_M1 imm) %{ + match(Set dst (XorL dst imm)); + + format %{ "notq $dst" %} + ins_encode %{ + __ notq($dst$$Register); + %} + ins_pipe(ialu_reg); +%} + // Xor Register with Immediate instruct xorL_rReg_imm(rRegL dst, immL32 src, rFlagsReg cr) %{ diff --git a/src/share/vm/opto/addnode.cpp b/src/share/vm/opto/addnode.cpp index e207e65eaab5cc063d9c8ed972f49e807ebc1033..2ff10cd083d5c1bb2a0c5108b203c5cf213de4ba 100644 --- a/src/share/vm/opto/addnode.cpp +++ b/src/share/vm/opto/addnode.cpp @@ -156,7 +156,8 @@ Node *AddNode::Ideal(PhaseGVN *phase, bool can_reshape) { if( add1_op == this_op && !con_right ) { Node *a12 = add1->in(2); const Type *t12 = phase->type( a12 ); - if( t12->singleton() && t12 != Type::TOP && (add1 != add1->in(1)) ) { + if( t12->singleton() && t12 != Type::TOP && (add1 != add1->in(1)) && + !(add1->in(1)->is_Phi() && add1->in(1)->as_Phi()->is_tripcount()) ) { assert(add1->in(1) != this, "dead loop in AddNode::Ideal"); add2 = add1->clone(); add2->set_req(2, in(2)); @@ -173,7 +174,8 @@ Node *AddNode::Ideal(PhaseGVN *phase, bool can_reshape) { if( add2_op == this_op && !con_left ) { Node *a22 = add2->in(2); const Type *t22 = phase->type( a22 ); - if( t22->singleton() && t22 != Type::TOP && (add2 != add2->in(1)) ) { + if( t22->singleton() && t22 != Type::TOP && (add2 != add2->in(1)) && + !(add2->in(1)->is_Phi() && add2->in(1)->as_Phi()->is_tripcount()) ) { assert(add2->in(1) != this, "dead loop in AddNode::Ideal"); Node *addx = add2->clone(); addx->set_req(1, in(1)); @@ -225,34 +227,63 @@ const Type *AddNode::add_of_identity( const Type *t1, const Type *t2 ) const { //============================================================================= //------------------------------Idealize--------------------------------------- Node *AddINode::Ideal(PhaseGVN *phase, bool can_reshape) { - int op1 = in(1)->Opcode(); - int op2 = in(2)->Opcode(); + Node* in1 = in(1); + Node* in2 = in(2); + int op1 = in1->Opcode(); + int op2 = in2->Opcode(); // Fold (con1-x)+con2 into (con1+con2)-x + if ( op1 == Op_AddI && op2 == Op_SubI ) { + // Swap edges to try optimizations below + in1 = in2; + in2 = in(1); + op1 = op2; + op2 = in2->Opcode(); + } if( op1 == Op_SubI ) { - const Type *t_sub1 = phase->type( in(1)->in(1) ); - const Type *t_2 = phase->type( in(2) ); + const Type *t_sub1 = phase->type( in1->in(1) ); + const Type *t_2 = phase->type( in2 ); if( t_sub1->singleton() && t_2->singleton() && t_sub1 != Type::TOP && t_2 != Type::TOP ) return new (phase->C, 3) SubINode(phase->makecon( add_ring( t_sub1, t_2 ) ), - in(1)->in(2) ); + in1->in(2) ); // Convert "(a-b)+(c-d)" into "(a+c)-(b+d)" if( op2 == Op_SubI ) { // Check for dead cycle: d = (a-b)+(c-d) - assert( in(1)->in(2) != this && in(2)->in(2) != this, + assert( in1->in(2) != this && in2->in(2) != this, "dead loop in AddINode::Ideal" ); Node *sub = new (phase->C, 3) SubINode(NULL, NULL); - sub->init_req(1, phase->transform(new (phase->C, 3) AddINode(in(1)->in(1), in(2)->in(1) ) )); - sub->init_req(2, phase->transform(new (phase->C, 3) AddINode(in(1)->in(2), in(2)->in(2) ) )); + sub->init_req(1, phase->transform(new (phase->C, 3) AddINode(in1->in(1), in2->in(1) ) )); + sub->init_req(2, phase->transform(new (phase->C, 3) AddINode(in1->in(2), in2->in(2) ) )); return sub; } + // Convert "(a-b)+(b+c)" into "(a+c)" + if( op2 == Op_AddI && in1->in(2) == in2->in(1) ) { + assert(in1->in(1) != this && in2->in(2) != this,"dead loop in AddINode::Ideal"); + return new (phase->C, 3) AddINode(in1->in(1), in2->in(2)); + } + // Convert "(a-b)+(c+b)" into "(a+c)" + if( op2 == Op_AddI && in1->in(2) == in2->in(2) ) { + assert(in1->in(1) != this && in2->in(1) != this,"dead loop in AddINode::Ideal"); + return new (phase->C, 3) AddINode(in1->in(1), in2->in(1)); + } + // Convert "(a-b)+(b-c)" into "(a-c)" + if( op2 == Op_SubI && in1->in(2) == in2->in(1) ) { + assert(in1->in(1) != this && in2->in(2) != this,"dead loop in AddINode::Ideal"); + return new (phase->C, 3) SubINode(in1->in(1), in2->in(2)); + } + // Convert "(a-b)+(c-a)" into "(c-b)" + if( op2 == Op_SubI && in1->in(1) == in2->in(2) ) { + assert(in1->in(2) != this && in2->in(1) != this,"dead loop in AddINode::Ideal"); + return new (phase->C, 3) SubINode(in2->in(1), in1->in(2)); + } } // Convert "x+(0-y)" into "(x-y)" - if( op2 == Op_SubI && phase->type(in(2)->in(1)) == TypeInt::ZERO ) - return new (phase->C, 3) SubINode(in(1), in(2)->in(2) ); + if( op2 == Op_SubI && phase->type(in2->in(1)) == TypeInt::ZERO ) + return new (phase->C, 3) SubINode(in1, in2->in(2) ); // Convert "(0-y)+x" into "(x-y)" - if( op1 == Op_SubI && phase->type(in(1)->in(1)) == TypeInt::ZERO ) - return new (phase->C, 3) SubINode( in(2), in(1)->in(2) ); + if( op1 == Op_SubI && phase->type(in1->in(1)) == TypeInt::ZERO ) + return new (phase->C, 3) SubINode( in2, in1->in(2) ); // Convert (x>>>z)+y into (x+(y<>>z for small constant z and y. // Helps with array allocation math constant folding @@ -266,15 +297,15 @@ Node *AddINode::Ideal(PhaseGVN *phase, bool can_reshape) { // Have not observed cases where type information exists to support // positive y and (x <= -(y << z)) if( op1 == Op_URShiftI && op2 == Op_ConI && - in(1)->in(2)->Opcode() == Op_ConI ) { - jint z = phase->type( in(1)->in(2) )->is_int()->get_con() & 0x1f; // only least significant 5 bits matter - jint y = phase->type( in(2) )->is_int()->get_con(); + in1->in(2)->Opcode() == Op_ConI ) { + jint z = phase->type( in1->in(2) )->is_int()->get_con() & 0x1f; // only least significant 5 bits matter + jint y = phase->type( in2 )->is_int()->get_con(); if( z < 5 && -5 < y && y < 0 ) { - const Type *t_in11 = phase->type(in(1)->in(1)); + const Type *t_in11 = phase->type(in1->in(1)); if( t_in11 != Type::TOP && (t_in11->is_int()->_lo >= -(y << z)) ) { - Node *a = phase->transform( new (phase->C, 3) AddINode( in(1)->in(1), phase->intcon(y<C, 3) URShiftINode( a, in(1)->in(2) ); + Node *a = phase->transform( new (phase->C, 3) AddINode( in1->in(1), phase->intcon(y<C, 3) URShiftINode( a, in1->in(2) ); } } } @@ -328,39 +359,73 @@ const Type *AddINode::add_ring( const Type *t0, const Type *t1 ) const { //============================================================================= //------------------------------Idealize--------------------------------------- Node *AddLNode::Ideal(PhaseGVN *phase, bool can_reshape) { - int op1 = in(1)->Opcode(); - int op2 = in(2)->Opcode(); + Node* in1 = in(1); + Node* in2 = in(2); + int op1 = in1->Opcode(); + int op2 = in2->Opcode(); + // Fold (con1-x)+con2 into (con1+con2)-x + if ( op1 == Op_AddL && op2 == Op_SubL ) { + // Swap edges to try optimizations below + in1 = in2; + in2 = in(1); + op1 = op2; + op2 = in2->Opcode(); + } // Fold (con1-x)+con2 into (con1+con2)-x if( op1 == Op_SubL ) { - const Type *t_sub1 = phase->type( in(1)->in(1) ); - const Type *t_2 = phase->type( in(2) ); + const Type *t_sub1 = phase->type( in1->in(1) ); + const Type *t_2 = phase->type( in2 ); if( t_sub1->singleton() && t_2->singleton() && t_sub1 != Type::TOP && t_2 != Type::TOP ) return new (phase->C, 3) SubLNode(phase->makecon( add_ring( t_sub1, t_2 ) ), - in(1)->in(2) ); + in1->in(2) ); // Convert "(a-b)+(c-d)" into "(a+c)-(b+d)" if( op2 == Op_SubL ) { // Check for dead cycle: d = (a-b)+(c-d) - assert( in(1)->in(2) != this && in(2)->in(2) != this, + assert( in1->in(2) != this && in2->in(2) != this, "dead loop in AddLNode::Ideal" ); Node *sub = new (phase->C, 3) SubLNode(NULL, NULL); - sub->init_req(1, phase->transform(new (phase->C, 3) AddLNode(in(1)->in(1), in(2)->in(1) ) )); - sub->init_req(2, phase->transform(new (phase->C, 3) AddLNode(in(1)->in(2), in(2)->in(2) ) )); + sub->init_req(1, phase->transform(new (phase->C, 3) AddLNode(in1->in(1), in2->in(1) ) )); + sub->init_req(2, phase->transform(new (phase->C, 3) AddLNode(in1->in(2), in2->in(2) ) )); return sub; } + // Convert "(a-b)+(b+c)" into "(a+c)" + if( op2 == Op_AddL && in1->in(2) == in2->in(1) ) { + assert(in1->in(1) != this && in2->in(2) != this,"dead loop in AddLNode::Ideal"); + return new (phase->C, 3) AddLNode(in1->in(1), in2->in(2)); + } + // Convert "(a-b)+(c+b)" into "(a+c)" + if( op2 == Op_AddL && in1->in(2) == in2->in(2) ) { + assert(in1->in(1) != this && in2->in(1) != this,"dead loop in AddLNode::Ideal"); + return new (phase->C, 3) AddLNode(in1->in(1), in2->in(1)); + } + // Convert "(a-b)+(b-c)" into "(a-c)" + if( op2 == Op_SubL && in1->in(2) == in2->in(1) ) { + assert(in1->in(1) != this && in2->in(2) != this,"dead loop in AddLNode::Ideal"); + return new (phase->C, 3) SubLNode(in1->in(1), in2->in(2)); + } + // Convert "(a-b)+(c-a)" into "(c-b)" + if( op2 == Op_SubL && in1->in(1) == in1->in(2) ) { + assert(in1->in(2) != this && in2->in(1) != this,"dead loop in AddLNode::Ideal"); + return new (phase->C, 3) SubLNode(in2->in(1), in1->in(2)); + } } // Convert "x+(0-y)" into "(x-y)" - if( op2 == Op_SubL && phase->type(in(2)->in(1)) == TypeLong::ZERO ) - return new (phase->C, 3) SubLNode(in(1), in(2)->in(2) ); + if( op2 == Op_SubL && phase->type(in2->in(1)) == TypeLong::ZERO ) + return new (phase->C, 3) SubLNode( in1, in2->in(2) ); + + // Convert "(0-y)+x" into "(x-y)" + if( op1 == Op_SubL && phase->type(in1->in(1)) == TypeInt::ZERO ) + return new (phase->C, 3) SubLNode( in2, in1->in(2) ); // Convert "X+X+X+X+X...+X+Y" into "k*X+Y" or really convert "X+(X+Y)" // into "(X<<1)+Y" and let shift-folding happen. if( op2 == Op_AddL && - in(2)->in(1) == in(1) && + in2->in(1) == in1 && op1 != Op_ConL && 0 ) { - Node *shift = phase->transform(new (phase->C, 3) LShiftLNode(in(1),phase->intcon(1))); - return new (phase->C, 3) AddLNode(shift,in(2)->in(2)); + Node *shift = phase->transform(new (phase->C, 3) LShiftLNode(in1,phase->intcon(1))); + return new (phase->C, 3) AddLNode(shift,in2->in(2)); } return AddNode::Ideal(phase, can_reshape); diff --git a/src/share/vm/opto/cfgnode.cpp b/src/share/vm/opto/cfgnode.cpp index 60508de55b3cea3af59f70dc7acfdd4ac8b9e844..fdee151f1e995cd1ef134b7041b77028a74e8e5c 100644 --- a/src/share/vm/opto/cfgnode.cpp +++ b/src/share/vm/opto/cfgnode.cpp @@ -1817,6 +1817,12 @@ Node *PhiNode::Ideal(PhaseGVN *phase, bool can_reshape) { return progress; // Return any progress } +//------------------------------is_tripcount----------------------------------- +bool PhiNode::is_tripcount() const { + return (in(0) != NULL && in(0)->is_CountedLoop() && + in(0)->as_CountedLoop()->phi() == this); +} + //------------------------------out_RegMask------------------------------------ const RegMask &PhiNode::in_RegMask(uint i) const { return i ? out_RegMask() : RegMask::Empty; @@ -1832,9 +1838,7 @@ const RegMask &PhiNode::out_RegMask() const { #ifndef PRODUCT void PhiNode::dump_spec(outputStream *st) const { TypeNode::dump_spec(st); - if (in(0) != NULL && - in(0)->is_CountedLoop() && - in(0)->as_CountedLoop()->phi() == this) { + if (is_tripcount()) { st->print(" #tripcount"); } } diff --git a/src/share/vm/opto/cfgnode.hpp b/src/share/vm/opto/cfgnode.hpp index 8fd493a308e047de7e8286624e5b5358c162d7e5..c902d12510acb3d3f0a51837b58a92efec7f1297 100644 --- a/src/share/vm/opto/cfgnode.hpp +++ b/src/share/vm/opto/cfgnode.hpp @@ -162,6 +162,8 @@ public: return NULL; // not a copy! } + bool is_tripcount() const; + // Determine a unique non-trivial input, if any. // Ignore casts if it helps. Return NULL on failure. Node* unique_input(PhaseTransform *phase); diff --git a/src/share/vm/opto/divnode.cpp b/src/share/vm/opto/divnode.cpp index c1c2b5df442e4aec1135e3c250584c7522b1018d..94240873ef51a7b90d246fdfe0b5874f66e9a21a 100644 --- a/src/share/vm/opto/divnode.cpp +++ b/src/share/vm/opto/divnode.cpp @@ -110,10 +110,13 @@ static Node *transform_int_divide( PhaseGVN *phase, Node *dividend, jint divisor } else if( dividend->Opcode() == Op_AndI ) { // An AND mask of sufficient size clears the low bits and // I can avoid rounding. - const TypeInt *andconi = phase->type( dividend->in(2) )->isa_int(); - if( andconi && andconi->is_con(-d) ) { - dividend = dividend->in(1); - needs_rounding = false; + const TypeInt *andconi_t = phase->type( dividend->in(2) )->isa_int(); + if( andconi_t && andconi_t->is_con() ) { + jint andconi = andconi_t->get_con(); + if( andconi < 0 && is_power_of_2(-andconi) && (-andconi) >= d ) { + dividend = dividend->in(1); + needs_rounding = false; + } } } @@ -316,10 +319,13 @@ static Node *transform_long_divide( PhaseGVN *phase, Node *dividend, jlong divis } else if( dividend->Opcode() == Op_AndL ) { // An AND mask of sufficient size clears the low bits and // I can avoid rounding. - const TypeLong *andconl = phase->type( dividend->in(2) )->isa_long(); - if( andconl && andconl->is_con(-d)) { - dividend = dividend->in(1); - needs_rounding = false; + const TypeLong *andconl_t = phase->type( dividend->in(2) )->isa_long(); + if( andconl_t && andconl_t->is_con() ) { + jlong andconl = andconl_t->get_con(); + if( andconl < 0 && is_power_of_2_long(-andconl) && (-andconl) >= d ) { + dividend = dividend->in(1); + needs_rounding = false; + } } } @@ -704,11 +710,18 @@ const Type *DivDNode::Value( PhaseTransform *phase ) const { if( t2 == TypeD::ONE ) return t1; - // If divisor is a constant and not zero, divide them numbers - if( t1->base() == Type::DoubleCon && - t2->base() == Type::DoubleCon && - t2->getd() != 0.0 ) // could be negative zero - return TypeD::make( t1->getd()/t2->getd() ); +#if defined(IA32) + if (!phase->C->method()->is_strict()) + // Can't trust native compilers to properly fold strict double + // division with round-to-zero on this platform. +#endif + { + // If divisor is a constant and not zero, divide them numbers + if( t1->base() == Type::DoubleCon && + t2->base() == Type::DoubleCon && + t2->getd() != 0.0 ) // could be negative zero + return TypeD::make( t1->getd()/t2->getd() ); + } // If the dividend is a constant zero // Note: if t1 and t2 are zero then result is NaN (JVMS page 213) diff --git a/src/share/vm/opto/loopTransform.cpp b/src/share/vm/opto/loopTransform.cpp index 3955f79d14e0c5ed9426573cfb748efec528375a..f1c15b08344bb89896ae2add99a2a21e74edf332 100644 --- a/src/share/vm/opto/loopTransform.cpp +++ b/src/share/vm/opto/loopTransform.cpp @@ -679,6 +679,10 @@ void PhaseIdealLoop::insert_pre_post_loops( IdealLoopTree *loop, Node_List &old_ CountedLoopNode *post_head = old_new[main_head->_idx]->as_CountedLoop(); post_head->set_post_loop(main_head); + // Reduce the post-loop trip count. + CountedLoopEndNode* post_end = old_new[main_end ->_idx]->as_CountedLoopEnd(); + post_end->_prob = PROB_FAIR; + // Build the main-loop normal exit. IfFalseNode *new_main_exit = new (C, 1) IfFalseNode(main_end); _igvn.register_new_node_with_optimizer( new_main_exit ); @@ -748,6 +752,9 @@ void PhaseIdealLoop::insert_pre_post_loops( IdealLoopTree *loop, Node_List &old_ pre_head->set_pre_loop(main_head); Node *pre_incr = old_new[incr->_idx]; + // Reduce the pre-loop trip count. + pre_end->_prob = PROB_FAIR; + // Find the pre-loop normal exit. Node* pre_exit = pre_end->proj_out(false); assert( pre_exit->Opcode() == Op_IfFalse, "" ); @@ -767,8 +774,8 @@ void PhaseIdealLoop::insert_pre_post_loops( IdealLoopTree *loop, Node_List &old_ register_new_node( min_cmp , new_pre_exit ); register_new_node( min_bol , new_pre_exit ); - // Build the IfNode - IfNode *min_iff = new (C, 2) IfNode( new_pre_exit, min_bol, PROB_FAIR, COUNT_UNKNOWN ); + // Build the IfNode (assume the main-loop is executed always). + IfNode *min_iff = new (C, 2) IfNode( new_pre_exit, min_bol, PROB_ALWAYS, COUNT_UNKNOWN ); _igvn.register_new_node_with_optimizer( min_iff ); set_idom(min_iff, new_pre_exit, dd_main_head); set_loop(min_iff, loop->_parent); @@ -1583,10 +1590,10 @@ bool IdealLoopTree::policy_do_remove_empty_loop( PhaseIdealLoop *phase ) { //============================================================================= //------------------------------iteration_split_impl--------------------------- -void IdealLoopTree::iteration_split_impl( PhaseIdealLoop *phase, Node_List &old_new ) { +bool IdealLoopTree::iteration_split_impl( PhaseIdealLoop *phase, Node_List &old_new ) { // Check and remove empty loops (spam micro-benchmarks) if( policy_do_remove_empty_loop(phase) ) - return; // Here we removed an empty loop + return true; // Here we removed an empty loop bool should_peel = policy_peeling(phase); // Should we peel? @@ -1596,7 +1603,8 @@ void IdealLoopTree::iteration_split_impl( PhaseIdealLoop *phase, Node_List &old_ // This removes loop-invariant tests (usually null checks). if( !_head->is_CountedLoop() ) { // Non-counted loop if (PartialPeelLoop && phase->partial_peel(this, old_new)) { - return; + // Partial peel succeeded so terminate this round of loop opts + return false; } if( should_peel ) { // Should we peel? #ifndef PRODUCT @@ -1606,14 +1614,14 @@ void IdealLoopTree::iteration_split_impl( PhaseIdealLoop *phase, Node_List &old_ } else if( should_unswitch ) { phase->do_unswitching(this, old_new); } - return; + return true; } CountedLoopNode *cl = _head->as_CountedLoop(); - if( !cl->loopexit() ) return; // Ignore various kinds of broken loops + if( !cl->loopexit() ) return true; // Ignore various kinds of broken loops // Do nothing special to pre- and post- loops - if( cl->is_pre_loop() || cl->is_post_loop() ) return; + if( cl->is_pre_loop() || cl->is_post_loop() ) return true; // Compute loop trip count from profile data compute_profile_trip_cnt(phase); @@ -1626,11 +1634,11 @@ void IdealLoopTree::iteration_split_impl( PhaseIdealLoop *phase, Node_List &old_ // Here we did some unrolling and peeling. Eventually we will // completely unroll this loop and it will no longer be a loop. phase->do_maximally_unroll(this,old_new); - return; + return true; } if (should_unswitch) { phase->do_unswitching(this, old_new); - return; + return true; } } @@ -1691,14 +1699,16 @@ void IdealLoopTree::iteration_split_impl( PhaseIdealLoop *phase, Node_List &old_ if( should_peel ) // Might want to peel but do nothing else phase->do_peeling(this,old_new); } + return true; } //============================================================================= //------------------------------iteration_split-------------------------------- -void IdealLoopTree::iteration_split( PhaseIdealLoop *phase, Node_List &old_new ) { +bool IdealLoopTree::iteration_split( PhaseIdealLoop *phase, Node_List &old_new ) { // Recursively iteration split nested loops - if( _child ) _child->iteration_split( phase, old_new ); + if( _child && !_child->iteration_split( phase, old_new )) + return false; // Clean out prior deadwood DCE_loop_body(); @@ -1720,7 +1730,9 @@ void IdealLoopTree::iteration_split( PhaseIdealLoop *phase, Node_List &old_new ) _allow_optimizations && !tail()->is_top() ) { // Also ignore the occasional dead backedge if (!_has_call) { - iteration_split_impl( phase, old_new ); + if (!iteration_split_impl( phase, old_new )) { + return false; + } } else if (policy_unswitching(phase)) { phase->do_unswitching(this, old_new); } @@ -1729,5 +1741,7 @@ void IdealLoopTree::iteration_split( PhaseIdealLoop *phase, Node_List &old_new ) // Minor offset re-organization to remove loop-fallout uses of // trip counter. if( _head->is_CountedLoop() ) phase->reorg_offsets( this ); - if( _next ) _next->iteration_split( phase, old_new ); + if( _next && !_next->iteration_split( phase, old_new )) + return false; + return true; } diff --git a/src/share/vm/opto/loopnode.hpp b/src/share/vm/opto/loopnode.hpp index 968cf8653650baa7e5728667522962200e153fd7..5377564669622e3b5b330198f7ecbdf5e6aabb58 100644 --- a/src/share/vm/opto/loopnode.hpp +++ b/src/share/vm/opto/loopnode.hpp @@ -325,12 +325,14 @@ public: // Returns TRUE if loop tree is structurally changed. bool beautify_loops( PhaseIdealLoop *phase ); - // Perform iteration-splitting on inner loops. Split iterations to avoid - // range checks or one-shot null checks. - void iteration_split( PhaseIdealLoop *phase, Node_List &old_new ); - - // Driver for various flavors of iteration splitting - void iteration_split_impl( PhaseIdealLoop *phase, Node_List &old_new ); + // Perform iteration-splitting on inner loops. Split iterations to + // avoid range checks or one-shot null checks. Returns false if the + // current round of loop opts should stop. + bool iteration_split( PhaseIdealLoop *phase, Node_List &old_new ); + + // Driver for various flavors of iteration splitting. Returns false + // if the current round of loop opts should stop. + bool iteration_split_impl( PhaseIdealLoop *phase, Node_List &old_new ); // Given dominators, try to find loops with calls that must always be // executed (call dominates loop tail). These loops do not need non-call diff --git a/src/share/vm/opto/loopopts.cpp b/src/share/vm/opto/loopopts.cpp index e0f55603b7a7863d4ab922fe7950d03485a884c7..41048cbcbe9e19f4c54a964deb5e8513f19bf9d9 100644 --- a/src/share/vm/opto/loopopts.cpp +++ b/src/share/vm/opto/loopopts.cpp @@ -1903,9 +1903,6 @@ void PhaseIdealLoop::clone_for_use_outside_loop( IdealLoopTree *loop, Node* n, N // Use in a phi is considered a use in the associated predecessor block use_c = use->in(0)->in(j); } - if (use_c->is_CountedLoop()) { - use_c = use_c->in(LoopNode::EntryControl); - } set_ctrl(n_clone, use_c); assert(!loop->is_member(get_loop(use_c)), "should be outside loop"); get_loop(use_c)->_body.push(n_clone); diff --git a/src/share/vm/opto/mulnode.cpp b/src/share/vm/opto/mulnode.cpp index 22ea890c95a451ae34dfc46bd843119a95df7122..5cdcbafbc4b5d8f5d8936decb505c4123e08de93 100644 --- a/src/share/vm/opto/mulnode.cpp +++ b/src/share/vm/opto/mulnode.cpp @@ -152,6 +152,14 @@ const Type *MulNode::Value( PhaseTransform *phase ) const { if( t1 == Type::BOTTOM || t2 == Type::BOTTOM ) return bottom_type(); +#if defined(IA32) + // Can't trust native compilers to properly fold strict double + // multiplication with round-to-zero on this platform. + if (op == Op_MulD && phase->C->method()->is_strict()) { + return TypeD::DOUBLE; + } +#endif + return mul_ring(t1,t2); // Local flavor of type multiplication } @@ -360,7 +368,7 @@ const Type *MulFNode::mul_ring(const Type *t0, const Type *t1) const { // Compute the product type of two double ranges into this node. const Type *MulDNode::mul_ring(const Type *t0, const Type *t1) const { if( t0 == Type::DOUBLE || t1 == Type::DOUBLE ) return Type::DOUBLE; - // We must be adding 2 double constants. + // We must be multiplying 2 double constants. return TypeD::make( t0->getd() * t1->getd() ); } diff --git a/src/share/vm/opto/node.hpp b/src/share/vm/opto/node.hpp index e027265bf67b13808e038556dc74f9af8176733a..f55a403099aa25e94ba75f503542bb0ce6fcf392 100644 --- a/src/share/vm/opto/node.hpp +++ b/src/share/vm/opto/node.hpp @@ -1320,7 +1320,8 @@ public: Node *pop() { if( _clock_index >= size() ) _clock_index = 0; Node *b = at(_clock_index); - map( _clock_index++, Node_List::pop()); + map( _clock_index, Node_List::pop()); + if (size() != 0) _clock_index++; // Always start from 0 _in_worklist >>= b->_idx; return b; } diff --git a/src/share/vm/opto/postaloc.cpp b/src/share/vm/opto/postaloc.cpp index 59e6b14125d5a5ccae8615be10eac1d8ec90a3cd..cd881065f327a899b6faefe466b9e9bbc4e63cf0 100644 --- a/src/share/vm/opto/postaloc.cpp +++ b/src/share/vm/opto/postaloc.cpp @@ -34,7 +34,7 @@ static bool is_single_register(uint x) { #endif } -//------------------------------may_be_copy_of_callee----------------------------- +//---------------------------may_be_copy_of_callee----------------------------- // Check to see if we can possibly be a copy of a callee-save value. bool PhaseChaitin::may_be_copy_of_callee( Node *def ) const { // Short circuit if there are no callee save registers @@ -225,6 +225,20 @@ int PhaseChaitin::elide_copy( Node *n, int k, Block *current_block, Node_List &v // Scan all registers to see if this value is around already for( uint reg = 0; reg < (uint)_max_reg; reg++ ) { + if (reg == (uint)nk_reg) { + // Found ourselves so check if there is only one user of this + // copy and keep on searching for a better copy if so. + bool ignore_self = true; + x = n->in(k); + DUIterator_Fast imax, i = x->fast_outs(imax); + Node* first = x->fast_out(i); i++; + while (i < imax && ignore_self) { + Node* use = x->fast_out(i); i++; + if (use != first) ignore_self = false; + } + if (ignore_self) continue; + } + Node *vv = value[reg]; if( !single ) { // Doubles check for aligned-adjacent pair if( (reg&1)==0 ) continue; // Wrong half of a pair diff --git a/src/share/vm/opto/subnode.cpp b/src/share/vm/opto/subnode.cpp index 774aff9f430fc075ad9ad4ad03f3db9542695d21..260b5dc8af8f097517c20db9a285bf34af576b2d 100644 --- a/src/share/vm/opto/subnode.cpp +++ b/src/share/vm/opto/subnode.cpp @@ -206,6 +206,14 @@ Node *SubINode::Ideal(PhaseGVN *phase, bool can_reshape){ if( op1 == Op_AddI && op2 == Op_AddI && in1->in(2) == in2->in(2) ) return new (phase->C, 3) SubINode( in1->in(1), in2->in(1) ); + // Convert "(A+X) - (X+B)" into "A - B" + if( op1 == Op_AddI && op2 == Op_AddI && in1->in(2) == in2->in(1) ) + return new (phase->C, 3) SubINode( in1->in(1), in2->in(2) ); + + // Convert "(X+A) - (B+X)" into "A - B" + if( op1 == Op_AddI && op2 == Op_AddI && in1->in(1) == in2->in(2) ) + return new (phase->C, 3) SubINode( in1->in(2), in2->in(1) ); + // Convert "A-(B-C)" into (A+C)-B", since add is commutative and generally // nicer to optimize than subtract. if( op2 == Op_SubI && in2->outcnt() == 1) { diff --git a/src/share/vm/runtime/globals.hpp b/src/share/vm/runtime/globals.hpp index 06d43a60d61a7f36c184d3fdcbaf3f502579cbd0..cba2b991403e44aa8927e4d9e1016dffb4955fd1 100644 --- a/src/share/vm/runtime/globals.hpp +++ b/src/share/vm/runtime/globals.hpp @@ -997,6 +997,12 @@ class CommandLineFlags { product(bool, UseXmmI2F, false, \ "Use SSE2 CVTDQ2PS instruction to convert Integer to Float") \ \ + product(bool, UseXMMForArrayCopy, false, \ + "Use SSE2 MOVQ instruction for Arraycopy") \ + \ + product(bool, UseUnalignedLoadStores, false, \ + "Use SSE2 MOVDQU instruction for Arraycopy") \ + \ product(intx, FieldsAllocationStyle, 1, \ "0 - type based with oops first, 1 - with oops last") \ \ @@ -2555,7 +2561,7 @@ class CommandLineFlags { develop(intx, MaxRecursiveInlineLevel, 1, \ "maximum number of nested recursive calls that are inlined") \ \ - develop(intx, InlineSmallCode, 1000, \ + product(intx, InlineSmallCode, 1000, \ "Only inline already compiled methods if their code size is " \ "less than this") \ \ diff --git a/test/compiler/6700047/Test6700047.java b/test/compiler/6700047/Test6700047.java index 55921d59465972c4ac9fcb7cfce56996a31ae298..64e6ddb19330cd416e97ea4c055da9fd9eb5e4c7 100644 --- a/test/compiler/6700047/Test6700047.java +++ b/test/compiler/6700047/Test6700047.java @@ -29,6 +29,8 @@ */ public class Test6700047 { + static byte[] dummy = new byte[256]; + public static void main(String[] args) { for (int i = 0; i < 100000; i++) { intToLeftPaddedAsciiBytes(); @@ -53,6 +55,7 @@ public class Test6700047 { if (offset > 0) { for(int j = 0; j < offset; j++) { result++; + dummy[i] = 0; } } return result;