提交 f9dfe7e1 编写于 作者: K kvn

6532536: Optimize arraycopy stubs for Intel cpus

Summary: Use SSE2 movdqu in arraycopy stubs on the newest Intel CPUs
Reviewed-by: rasbold
上级 79e27e98
...@@ -1575,6 +1575,35 @@ void Assembler::movdqa(Address dst, XMMRegister src) { ...@@ -1575,6 +1575,35 @@ void Assembler::movdqa(Address dst, XMMRegister src) {
emit_operand(src, dst); emit_operand(src, dst);
} }
// MOVDQU xmm, m128 (F3 0F 6F /r): unaligned 128-bit load from memory.
void Assembler::movdqu(XMMRegister dst, Address src) {
NOT_LP64(assert(VM_Version::supports_sse2(), ""));  // movdqu is an SSE2 instruction
InstructionMark im(this);
emit_byte(0xF3);   // mandatory F3 prefix selects MOVDQU (vs MOVDQA's 66)
prefix(src, dst);  // REX prefix (if any) must follow F3 and precede the opcode
emit_byte(0x0F);
emit_byte(0x6F);   // 0F 6F = load form (register destination)
emit_operand(dst, src);
}
// MOVDQU xmm, xmm (F3 0F 6F /r): 128-bit register-to-register move.
void Assembler::movdqu(XMMRegister dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse2(), ""));  // movdqu is an SSE2 instruction
emit_byte(0xF3);  // mandatory F3 prefix selects MOVDQU
// NOTE(review): prefixq_and_encode emits a REX.W prefix, which MOVDQU
// ignores; prefix_and_encode would give the same semantics with a shorter
// encoding on 64-bit — confirm whether REX.W is intentional here.
int encode = prefixq_and_encode(dst->encoding(), src->encoding());
emit_byte(0x0F);
emit_byte(0x6F);
emit_byte(0xC0 | encode);  // ModRM: mod=11 (register-direct), reg=dst, rm=src
}
// MOVDQU m128, xmm (F3 0F 7F /r): unaligned 128-bit store to memory.
void Assembler::movdqu(Address dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse2(), ""));  // movdqu is an SSE2 instruction
InstructionMark im(this);
emit_byte(0xF3);   // mandatory F3 prefix selects MOVDQU (vs MOVDQA's 66)
prefix(dst, src);  // REX prefix (if any) must follow F3 and precede the opcode
emit_byte(0x0F);
emit_byte(0x7F);   // 0F 7F = store form (memory destination)
emit_operand(src, dst);
}
// Uses zero extension on 64bit // Uses zero extension on 64bit
void Assembler::movl(Register dst, int32_t imm32) { void Assembler::movl(Register dst, int32_t imm32) {
......
...@@ -1055,6 +1055,11 @@ private: ...@@ -1055,6 +1055,11 @@ private:
void movdqa(XMMRegister dst, Address src); void movdqa(XMMRegister dst, Address src);
void movdqa(XMMRegister dst, XMMRegister src); void movdqa(XMMRegister dst, XMMRegister src);
// Move Unaligned Double Quadword (MOVDQU, SSE2): 128-bit load/store/move
// with no 16-byte alignment requirement on the memory operand.
void movdqu(Address dst, XMMRegister src);
void movdqu(XMMRegister dst, Address src);
void movdqu(XMMRegister dst, XMMRegister src);
void movl(Register dst, int32_t imm32); void movl(Register dst, int32_t imm32);
void movl(Address dst, int32_t imm32); void movl(Address dst, int32_t imm32);
void movl(Register dst, Register src); void movl(Register dst, Register src);
......
...@@ -791,6 +791,69 @@ class StubGenerator: public StubCodeGenerator { ...@@ -791,6 +791,69 @@ class StubGenerator: public StubCodeGenerator {
} }
} }
// Copy 64 bytes chunks
//
// Inputs:
// from - source array address
// to_from - destination array address - from
// qword_count - 8-bytes element count (positive; the loop subtracts
//               toward zero, so a negative count would copy nothing)
//
// Emits a forward copy of qword_count*8 bytes: a main loop moving 64 bytes
// per iteration through XMM registers, then an 8-bytes-at-a-time tail loop.
// Destination addresses are formed as from + to_from, so only 'from' is
// advanced during the copy.
void xmm_copy_forward(Register from, Register to_from, Register qword_count) {
assert( UseSSE >= 2, "supported cpu only" );
Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
// Copy 64-byte chunks
__ jmpb(L_copy_64_bytes);  // enter at the count check, not the loop body
__ align(16);              // align the hot loop head
__ BIND(L_copy_64_bytes_loop);
if(UseUnalignedLoadStores) {
// Four unaligned 16-byte SSE2 loads/stores per iteration.
__ movdqu(xmm0, Address(from, 0));
__ movdqu(Address(from, to_from, Address::times_1, 0), xmm0);
__ movdqu(xmm1, Address(from, 16));
__ movdqu(Address(from, to_from, Address::times_1, 16), xmm1);
__ movdqu(xmm2, Address(from, 32));
__ movdqu(Address(from, to_from, Address::times_1, 32), xmm2);
__ movdqu(xmm3, Address(from, 48));
__ movdqu(Address(from, to_from, Address::times_1, 48), xmm3);
} else {
// Eight 8-byte moves through XMM registers (no 16-byte unaligned access).
__ movq(xmm0, Address(from, 0));
__ movq(Address(from, to_from, Address::times_1, 0), xmm0);
__ movq(xmm1, Address(from, 8));
__ movq(Address(from, to_from, Address::times_1, 8), xmm1);
__ movq(xmm2, Address(from, 16));
__ movq(Address(from, to_from, Address::times_1, 16), xmm2);
__ movq(xmm3, Address(from, 24));
__ movq(Address(from, to_from, Address::times_1, 24), xmm3);
__ movq(xmm4, Address(from, 32));
__ movq(Address(from, to_from, Address::times_1, 32), xmm4);
__ movq(xmm5, Address(from, 40));
__ movq(Address(from, to_from, Address::times_1, 40), xmm5);
__ movq(xmm6, Address(from, 48));
__ movq(Address(from, to_from, Address::times_1, 48), xmm6);
__ movq(xmm7, Address(from, 56));
__ movq(Address(from, to_from, Address::times_1, 56), xmm7);
}
__ addl(from, 64);
__ BIND(L_copy_64_bytes);
__ subl(qword_count, 8);  // another full 8-qword chunk available?
__ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
__ addl(qword_count, 8);  // undo; qword_count = leftover qwords (0..7)
__ jccb(Assembler::zero, L_exit);
//
// length is too short, just copy qwords
//
__ BIND(L_copy_8_bytes);
__ movq(xmm0, Address(from, 0));
__ movq(Address(from, to_from, Address::times_1), xmm0);
__ addl(from, 8);
__ decrement(qword_count);
__ jcc(Assembler::greater, L_copy_8_bytes);
__ BIND(L_exit);
}
// Copy 64 bytes chunks // Copy 64 bytes chunks
// //
// Inputs: // Inputs:
...@@ -799,6 +862,7 @@ class StubGenerator: public StubCodeGenerator { ...@@ -799,6 +862,7 @@ class StubGenerator: public StubCodeGenerator {
// qword_count - 8-bytes element count, negative // qword_count - 8-bytes element count, negative
// //
void mmx_copy_forward(Register from, Register to_from, Register qword_count) { void mmx_copy_forward(Register from, Register to_from, Register qword_count) {
assert( VM_Version::supports_mmx(), "supported cpu only" );
Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit; Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
// Copy 64-byte chunks // Copy 64-byte chunks
__ jmpb(L_copy_64_bytes); __ jmpb(L_copy_64_bytes);
...@@ -876,7 +940,7 @@ class StubGenerator: public StubCodeGenerator { ...@@ -876,7 +940,7 @@ class StubGenerator: public StubCodeGenerator {
__ subptr(to, from); // to --> to_from __ subptr(to, from); // to --> to_from
__ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
__ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
if (!aligned && (t == T_BYTE || t == T_SHORT)) { if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
// align source address at 4 bytes address boundary // align source address at 4 bytes address boundary
if (t == T_BYTE) { if (t == T_BYTE) {
// One byte misalignment happens only for byte arrays // One byte misalignment happens only for byte arrays
...@@ -906,20 +970,26 @@ class StubGenerator: public StubCodeGenerator { ...@@ -906,20 +970,26 @@ class StubGenerator: public StubCodeGenerator {
__ mov(count, rax); // restore 'count' __ mov(count, rax); // restore 'count'
__ jmpb(L_copy_2_bytes); // all dwords were copied __ jmpb(L_copy_2_bytes); // all dwords were copied
} else { } else {
// align to 8 bytes, we know we are 4 byte aligned to start if (!UseUnalignedLoadStores) {
__ testptr(from, 4); // align to 8 bytes, we know we are 4 byte aligned to start
__ jccb(Assembler::zero, L_copy_64_bytes); __ testptr(from, 4);
__ movl(rax, Address(from, 0)); __ jccb(Assembler::zero, L_copy_64_bytes);
__ movl(Address(from, to_from, Address::times_1, 0), rax); __ movl(rax, Address(from, 0));
__ addptr(from, 4); __ movl(Address(from, to_from, Address::times_1, 0), rax);
__ subl(count, 1<<shift); __ addptr(from, 4);
__ subl(count, 1<<shift);
}
__ BIND(L_copy_64_bytes); __ BIND(L_copy_64_bytes);
__ mov(rax, count); __ mov(rax, count);
__ shrl(rax, shift+1); // 8 bytes chunk count __ shrl(rax, shift+1); // 8 bytes chunk count
// //
// Copy 8-byte chunks through MMX registers, 8 per iteration of the loop // Copy 8-byte chunks through MMX registers, 8 per iteration of the loop
// //
mmx_copy_forward(from, to_from, rax); if (UseXMMForArrayCopy) {
xmm_copy_forward(from, to_from, rax);
} else {
mmx_copy_forward(from, to_from, rax);
}
} }
// copy tailing dword // copy tailing dword
__ BIND(L_copy_4_bytes); __ BIND(L_copy_4_bytes);
...@@ -1069,13 +1139,20 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1069,13 +1139,20 @@ class StubGenerator: public StubCodeGenerator {
__ align(16); __ align(16);
// Move 8 bytes // Move 8 bytes
__ BIND(L_copy_8_bytes_loop); __ BIND(L_copy_8_bytes_loop);
__ movq(mmx0, Address(from, count, sf, 0)); if (UseXMMForArrayCopy) {
__ movq(Address(to, count, sf, 0), mmx0); __ movq(xmm0, Address(from, count, sf, 0));
__ movq(Address(to, count, sf, 0), xmm0);
} else {
__ movq(mmx0, Address(from, count, sf, 0));
__ movq(Address(to, count, sf, 0), mmx0);
}
__ BIND(L_copy_8_bytes); __ BIND(L_copy_8_bytes);
__ subl(count, 2<<shift); __ subl(count, 2<<shift);
__ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop); __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
__ addl(count, 2<<shift); __ addl(count, 2<<shift);
__ emms(); if (!UseXMMForArrayCopy) {
__ emms();
}
} }
__ BIND(L_copy_4_bytes); __ BIND(L_copy_4_bytes);
// copy prefix qword // copy prefix qword
...@@ -1143,7 +1220,11 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1143,7 +1220,11 @@ class StubGenerator: public StubCodeGenerator {
__ subptr(to, from); // to --> to_from __ subptr(to, from); // to --> to_from
if (VM_Version::supports_mmx()) { if (VM_Version::supports_mmx()) {
mmx_copy_forward(from, to_from, count); if (UseXMMForArrayCopy) {
xmm_copy_forward(from, to_from, count);
} else {
mmx_copy_forward(from, to_from, count);
}
} else { } else {
__ jmpb(L_copy_8_bytes); __ jmpb(L_copy_8_bytes);
__ align(16); __ align(16);
...@@ -1196,8 +1277,13 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1196,8 +1277,13 @@ class StubGenerator: public StubCodeGenerator {
__ align(16); __ align(16);
__ BIND(L_copy_8_bytes_loop); __ BIND(L_copy_8_bytes_loop);
if (VM_Version::supports_mmx()) { if (VM_Version::supports_mmx()) {
__ movq(mmx0, Address(from, count, Address::times_8)); if (UseXMMForArrayCopy) {
__ movq(Address(to, count, Address::times_8), mmx0); __ movq(xmm0, Address(from, count, Address::times_8));
__ movq(Address(to, count, Address::times_8), xmm0);
} else {
__ movq(mmx0, Address(from, count, Address::times_8));
__ movq(Address(to, count, Address::times_8), mmx0);
}
} else { } else {
__ fild_d(Address(from, count, Address::times_8)); __ fild_d(Address(from, count, Address::times_8));
__ fistp_d(Address(to, count, Address::times_8)); __ fistp_d(Address(to, count, Address::times_8));
...@@ -1206,7 +1292,7 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1206,7 +1292,7 @@ class StubGenerator: public StubCodeGenerator {
__ decrement(count); __ decrement(count);
__ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop); __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
if (VM_Version::supports_mmx()) { if (VM_Version::supports_mmx() && !UseXMMForArrayCopy) {
__ emms(); __ emms();
} }
inc_copy_counter_np(T_LONG); inc_copy_counter_np(T_LONG);
......
...@@ -1251,6 +1251,7 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1251,6 +1251,7 @@ class StubGenerator: public StubCodeGenerator {
} }
} }
// Copy big chunks forward // Copy big chunks forward
// //
// Inputs: // Inputs:
...@@ -1268,14 +1269,22 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1268,14 +1269,22 @@ class StubGenerator: public StubCodeGenerator {
Label L_loop; Label L_loop;
__ align(16); __ align(16);
__ BIND(L_loop); __ BIND(L_loop);
__ movq(to, Address(end_from, qword_count, Address::times_8, -24)); if(UseUnalignedLoadStores) {
__ movq(Address(end_to, qword_count, Address::times_8, -24), to); __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
__ movq(to, Address(end_from, qword_count, Address::times_8, -16)); __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
__ movq(Address(end_to, qword_count, Address::times_8, -16), to); __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
__ movq(to, Address(end_from, qword_count, Address::times_8, - 8)); __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
__ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
__ movq(to, Address(end_from, qword_count, Address::times_8, - 0)); } else {
__ movq(Address(end_to, qword_count, Address::times_8, - 0), to); __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
__ movq(Address(end_to, qword_count, Address::times_8, -24), to);
__ movq(to, Address(end_from, qword_count, Address::times_8, -16));
__ movq(Address(end_to, qword_count, Address::times_8, -16), to);
__ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
__ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
__ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
__ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
}
__ BIND(L_copy_32_bytes); __ BIND(L_copy_32_bytes);
__ addptr(qword_count, 4); __ addptr(qword_count, 4);
__ jcc(Assembler::lessEqual, L_loop); __ jcc(Assembler::lessEqual, L_loop);
...@@ -1301,14 +1310,22 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1301,14 +1310,22 @@ class StubGenerator: public StubCodeGenerator {
Label L_loop; Label L_loop;
__ align(16); __ align(16);
__ BIND(L_loop); __ BIND(L_loop);
__ movq(to, Address(from, qword_count, Address::times_8, 24)); if(UseUnalignedLoadStores) {
__ movq(Address(dest, qword_count, Address::times_8, 24), to); __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
__ movq(to, Address(from, qword_count, Address::times_8, 16)); __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
__ movq(Address(dest, qword_count, Address::times_8, 16), to); __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
__ movq(to, Address(from, qword_count, Address::times_8, 8)); __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
__ movq(Address(dest, qword_count, Address::times_8, 8), to);
__ movq(to, Address(from, qword_count, Address::times_8, 0)); } else {
__ movq(Address(dest, qword_count, Address::times_8, 0), to); __ movq(to, Address(from, qword_count, Address::times_8, 24));
__ movq(Address(dest, qword_count, Address::times_8, 24), to);
__ movq(to, Address(from, qword_count, Address::times_8, 16));
__ movq(Address(dest, qword_count, Address::times_8, 16), to);
__ movq(to, Address(from, qword_count, Address::times_8, 8));
__ movq(Address(dest, qword_count, Address::times_8, 8), to);
__ movq(to, Address(from, qword_count, Address::times_8, 0));
__ movq(Address(dest, qword_count, Address::times_8, 0), to);
}
__ BIND(L_copy_32_bytes); __ BIND(L_copy_32_bytes);
__ subptr(qword_count, 4); __ subptr(qword_count, 4);
__ jcc(Assembler::greaterEqual, L_loop); __ jcc(Assembler::greaterEqual, L_loop);
......
...@@ -242,9 +242,11 @@ void VM_Version::get_processor_features() { ...@@ -242,9 +242,11 @@ void VM_Version::get_processor_features() {
_supports_cx8 = supports_cmpxchg8(); _supports_cx8 = supports_cmpxchg8();
// if the OS doesn't support SSE, we can't use this feature even if the HW does // if the OS doesn't support SSE, we can't use this feature even if the HW does
if( !os::supports_sse()) if( !os::supports_sse())
_cpuFeatures &= ~(CPU_SSE|CPU_SSE2|CPU_SSE3|CPU_SSSE3|CPU_SSE4|CPU_SSE4A); _cpuFeatures &= ~(CPU_SSE|CPU_SSE2|CPU_SSE3|CPU_SSSE3|CPU_SSE4A|CPU_SSE4_1|CPU_SSE4_2);
if (UseSSE < 4) if (UseSSE < 4) {
_cpuFeatures &= ~CPU_SSE4; _cpuFeatures &= ~CPU_SSE4_1;
_cpuFeatures &= ~CPU_SSE4_2;
}
if (UseSSE < 3) { if (UseSSE < 3) {
_cpuFeatures &= ~CPU_SSE3; _cpuFeatures &= ~CPU_SSE3;
_cpuFeatures &= ~CPU_SSSE3; _cpuFeatures &= ~CPU_SSSE3;
...@@ -261,7 +263,7 @@ void VM_Version::get_processor_features() { ...@@ -261,7 +263,7 @@ void VM_Version::get_processor_features() {
} }
char buf[256]; char buf[256];
jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s", jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
cores_per_cpu(), threads_per_core(), cores_per_cpu(), threads_per_core(),
cpu_family(), _model, _stepping, cpu_family(), _model, _stepping,
(supports_cmov() ? ", cmov" : ""), (supports_cmov() ? ", cmov" : ""),
...@@ -272,7 +274,8 @@ void VM_Version::get_processor_features() { ...@@ -272,7 +274,8 @@ void VM_Version::get_processor_features() {
(supports_sse2() ? ", sse2" : ""), (supports_sse2() ? ", sse2" : ""),
(supports_sse3() ? ", sse3" : ""), (supports_sse3() ? ", sse3" : ""),
(supports_ssse3()? ", ssse3": ""), (supports_ssse3()? ", ssse3": ""),
(supports_sse4() ? ", sse4" : ""), (supports_sse4_1() ? ", sse4.1" : ""),
(supports_sse4_2() ? ", sse4.2" : ""),
(supports_mmx_ext() ? ", mmxext" : ""), (supports_mmx_ext() ? ", mmxext" : ""),
(supports_3dnow() ? ", 3dnow" : ""), (supports_3dnow() ? ", 3dnow" : ""),
(supports_3dnow2() ? ", 3dnowext" : ""), (supports_3dnow2() ? ", 3dnowext" : ""),
...@@ -285,7 +288,7 @@ void VM_Version::get_processor_features() { ...@@ -285,7 +288,7 @@ void VM_Version::get_processor_features() {
// older Pentiums which do not support it. // older Pentiums which do not support it.
if( UseSSE > 4 ) UseSSE=4; if( UseSSE > 4 ) UseSSE=4;
if( UseSSE < 0 ) UseSSE=0; if( UseSSE < 0 ) UseSSE=0;
if( !supports_sse4() ) // Drop to 3 if no SSE4 support if( !supports_sse4_1() ) // Drop to 3 if no SSE4 support
UseSSE = MIN2((intx)3,UseSSE); UseSSE = MIN2((intx)3,UseSSE);
if( !supports_sse3() ) // Drop to 2 if no SSE3 support if( !supports_sse3() ) // Drop to 2 if no SSE3 support
UseSSE = MIN2((intx)2,UseSSE); UseSSE = MIN2((intx)2,UseSSE);
...@@ -375,6 +378,14 @@ void VM_Version::get_processor_features() { ...@@ -375,6 +378,14 @@ void VM_Version::get_processor_features() {
MaxLoopPad = 11; MaxLoopPad = 11;
} }
#endif // COMPILER2 #endif // COMPILER2
if( FLAG_IS_DEFAULT(UseXMMForArrayCopy) ) {
UseXMMForArrayCopy = true; // use SSE2 movq on new Intel cpus
}
if( supports_sse4_2() && supports_ht() ) { // Newest Intel cpus
if( FLAG_IS_DEFAULT(UseUnalignedLoadStores) && UseXMMForArrayCopy ) {
UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus
}
}
} }
} }
...@@ -413,7 +424,7 @@ void VM_Version::get_processor_features() { ...@@ -413,7 +424,7 @@ void VM_Version::get_processor_features() {
#ifndef PRODUCT #ifndef PRODUCT
if (PrintMiscellaneous && Verbose) { if (PrintMiscellaneous && Verbose) {
tty->print_cr("Logical CPUs per package: %u", tty->print_cr("Logical CPUs per core: %u",
logical_processors_per_package()); logical_processors_per_package());
tty->print_cr("UseSSE=%d",UseSSE); tty->print_cr("UseSSE=%d",UseSSE);
tty->print("Allocation: "); tty->print("Allocation: ");
......
...@@ -68,9 +68,9 @@ public: ...@@ -68,9 +68,9 @@ public:
cmpxchg16: 1, cmpxchg16: 1,
: 4, : 4,
dca : 1, dca : 1,
: 4, sse4_1 : 1,
popcnt : 1, sse4_2 : 1,
: 8; : 11;
} bits; } bits;
}; };
...@@ -177,8 +177,9 @@ protected: ...@@ -177,8 +177,9 @@ protected:
CPU_SSE2 = (1 << 7), CPU_SSE2 = (1 << 7),
CPU_SSE3 = (1 << 8), // sse3 comes from cpuid 1 (ECX) CPU_SSE3 = (1 << 8), // sse3 comes from cpuid 1 (ECX)
CPU_SSSE3= (1 << 9), CPU_SSSE3= (1 << 9),
CPU_SSE4 = (1 <<10), CPU_SSE4A= (1 <<10),
CPU_SSE4A= (1 <<11) CPU_SSE4_1 = (1 << 11),
CPU_SSE4_2 = (1 << 12)
} cpuFeatureFlags; } cpuFeatureFlags;
// cpuid information block. All info derived from executing cpuid with // cpuid information block. All info derived from executing cpuid with
...@@ -240,22 +241,14 @@ protected: ...@@ -240,22 +241,14 @@ protected:
static CpuidInfo _cpuid_info; static CpuidInfo _cpuid_info;
// Extractors and predicates // Extractors and predicates
static bool is_extended_cpu_family() {
const uint32_t Extended_Cpu_Family = 0xf;
return _cpuid_info.std_cpuid1_rax.bits.family == Extended_Cpu_Family;
}
static uint32_t extended_cpu_family() { static uint32_t extended_cpu_family() {
uint32_t result = _cpuid_info.std_cpuid1_rax.bits.family; uint32_t result = _cpuid_info.std_cpuid1_rax.bits.family;
if (is_extended_cpu_family()) { result += _cpuid_info.std_cpuid1_rax.bits.ext_family;
result += _cpuid_info.std_cpuid1_rax.bits.ext_family;
}
return result; return result;
} }
static uint32_t extended_cpu_model() { static uint32_t extended_cpu_model() {
uint32_t result = _cpuid_info.std_cpuid1_rax.bits.model; uint32_t result = _cpuid_info.std_cpuid1_rax.bits.model;
if (is_extended_cpu_family()) { result |= _cpuid_info.std_cpuid1_rax.bits.ext_model << 4;
result |= _cpuid_info.std_cpuid1_rax.bits.ext_model << 4;
}
return result; return result;
} }
static uint32_t cpu_stepping() { static uint32_t cpu_stepping() {
...@@ -293,6 +286,10 @@ protected: ...@@ -293,6 +286,10 @@ protected:
result |= CPU_SSSE3; result |= CPU_SSSE3;
if (is_amd() && _cpuid_info.ext_cpuid1_rcx.bits.sse4a != 0) if (is_amd() && _cpuid_info.ext_cpuid1_rcx.bits.sse4a != 0)
result |= CPU_SSE4A; result |= CPU_SSE4A;
if (_cpuid_info.std_cpuid1_rcx.bits.sse4_1 != 0)
result |= CPU_SSE4_1;
if (_cpuid_info.std_cpuid1_rcx.bits.sse4_2 != 0)
result |= CPU_SSE4_2;
return result; return result;
} }
...@@ -380,7 +377,8 @@ public: ...@@ -380,7 +377,8 @@ public:
static bool supports_sse2() { return (_cpuFeatures & CPU_SSE2) != 0; } static bool supports_sse2() { return (_cpuFeatures & CPU_SSE2) != 0; }
static bool supports_sse3() { return (_cpuFeatures & CPU_SSE3) != 0; } static bool supports_sse3() { return (_cpuFeatures & CPU_SSE3) != 0; }
static bool supports_ssse3() { return (_cpuFeatures & CPU_SSSE3)!= 0; } static bool supports_ssse3() { return (_cpuFeatures & CPU_SSSE3)!= 0; }
static bool supports_sse4() { return (_cpuFeatures & CPU_SSE4) != 0; } static bool supports_sse4_1() { return (_cpuFeatures & CPU_SSE4_1) != 0; }
static bool supports_sse4_2() { return (_cpuFeatures & CPU_SSE4_2) != 0; }
// //
// AMD features // AMD features
// //
......
...@@ -186,8 +186,10 @@ void VM_Version::get_processor_features() { ...@@ -186,8 +186,10 @@ void VM_Version::get_processor_features() {
if (!VM_Version::supports_sse2()) { if (!VM_Version::supports_sse2()) {
vm_exit_during_initialization("Unknown x64 processor: SSE2 not supported"); vm_exit_during_initialization("Unknown x64 processor: SSE2 not supported");
} }
if (UseSSE < 4) if (UseSSE < 4) {
_cpuFeatures &= ~CPU_SSE4; _cpuFeatures &= ~CPU_SSE4_1;
_cpuFeatures &= ~CPU_SSE4_2;
}
if (UseSSE < 3) { if (UseSSE < 3) {
_cpuFeatures &= ~CPU_SSE3; _cpuFeatures &= ~CPU_SSE3;
_cpuFeatures &= ~CPU_SSSE3; _cpuFeatures &= ~CPU_SSSE3;
...@@ -204,7 +206,7 @@ void VM_Version::get_processor_features() { ...@@ -204,7 +206,7 @@ void VM_Version::get_processor_features() {
} }
char buf[256]; char buf[256];
jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s", jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
cores_per_cpu(), threads_per_core(), cores_per_cpu(), threads_per_core(),
cpu_family(), _model, _stepping, cpu_family(), _model, _stepping,
(supports_cmov() ? ", cmov" : ""), (supports_cmov() ? ", cmov" : ""),
...@@ -215,7 +217,8 @@ void VM_Version::get_processor_features() { ...@@ -215,7 +217,8 @@ void VM_Version::get_processor_features() {
(supports_sse2() ? ", sse2" : ""), (supports_sse2() ? ", sse2" : ""),
(supports_sse3() ? ", sse3" : ""), (supports_sse3() ? ", sse3" : ""),
(supports_ssse3()? ", ssse3": ""), (supports_ssse3()? ", ssse3": ""),
(supports_sse4() ? ", sse4" : ""), (supports_sse4_1() ? ", sse4.1" : ""),
(supports_sse4_2() ? ", sse4.2" : ""),
(supports_mmx_ext() ? ", mmxext" : ""), (supports_mmx_ext() ? ", mmxext" : ""),
(supports_3dnow() ? ", 3dnow" : ""), (supports_3dnow() ? ", 3dnow" : ""),
(supports_3dnow2() ? ", 3dnowext" : ""), (supports_3dnow2() ? ", 3dnowext" : ""),
...@@ -228,7 +231,7 @@ void VM_Version::get_processor_features() { ...@@ -228,7 +231,7 @@ void VM_Version::get_processor_features() {
// older Pentiums which do not support it. // older Pentiums which do not support it.
if( UseSSE > 4 ) UseSSE=4; if( UseSSE > 4 ) UseSSE=4;
if( UseSSE < 0 ) UseSSE=0; if( UseSSE < 0 ) UseSSE=0;
if( !supports_sse4() ) // Drop to 3 if no SSE4 support if( !supports_sse4_1() ) // Drop to 3 if no SSE4 support
UseSSE = MIN2((intx)3,UseSSE); UseSSE = MIN2((intx)3,UseSSE);
if( !supports_sse3() ) // Drop to 2 if no SSE3 support if( !supports_sse3() ) // Drop to 2 if no SSE3 support
UseSSE = MIN2((intx)2,UseSSE); UseSSE = MIN2((intx)2,UseSSE);
...@@ -314,6 +317,14 @@ void VM_Version::get_processor_features() { ...@@ -314,6 +317,14 @@ void VM_Version::get_processor_features() {
MaxLoopPad = 11; MaxLoopPad = 11;
} }
#endif // COMPILER2 #endif // COMPILER2
if( FLAG_IS_DEFAULT(UseXMMForArrayCopy) ) {
UseXMMForArrayCopy = true; // use SSE2 movq on new Intel cpus
}
if( supports_sse4_2() && supports_ht() ) { // Newest Intel cpus
if( FLAG_IS_DEFAULT(UseUnalignedLoadStores) && UseXMMForArrayCopy ) {
UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus
}
}
} }
} }
...@@ -355,7 +366,7 @@ void VM_Version::get_processor_features() { ...@@ -355,7 +366,7 @@ void VM_Version::get_processor_features() {
#ifndef PRODUCT #ifndef PRODUCT
if (PrintMiscellaneous && Verbose) { if (PrintMiscellaneous && Verbose) {
tty->print_cr("Logical CPUs per package: %u", tty->print_cr("Logical CPUs per core: %u",
logical_processors_per_package()); logical_processors_per_package());
tty->print_cr("UseSSE=%d",UseSSE); tty->print_cr("UseSSE=%d",UseSSE);
tty->print("Allocation: "); tty->print("Allocation: ");
......
...@@ -68,9 +68,9 @@ public: ...@@ -68,9 +68,9 @@ public:
cmpxchg16: 1, cmpxchg16: 1,
: 4, : 4,
dca : 1, dca : 1,
: 4, sse4_1 : 1,
popcnt : 1, sse4_2 : 1,
: 8; : 11;
} bits; } bits;
}; };
...@@ -177,8 +177,9 @@ protected: ...@@ -177,8 +177,9 @@ protected:
CPU_SSE2 = (1 << 7), CPU_SSE2 = (1 << 7),
CPU_SSE3 = (1 << 8), CPU_SSE3 = (1 << 8),
CPU_SSSE3= (1 << 9), CPU_SSSE3= (1 << 9),
CPU_SSE4 = (1 <<10), CPU_SSE4A= (1 <<10),
CPU_SSE4A= (1 <<11) CPU_SSE4_1 = (1 << 11),
CPU_SSE4_2 = (1 << 12)
} cpuFeatureFlags; } cpuFeatureFlags;
// cpuid information block. All info derived from executing cpuid with // cpuid information block. All info derived from executing cpuid with
...@@ -240,22 +241,14 @@ protected: ...@@ -240,22 +241,14 @@ protected:
static CpuidInfo _cpuid_info; static CpuidInfo _cpuid_info;
// Extractors and predicates // Extractors and predicates
static bool is_extended_cpu_family() {
const uint32_t Extended_Cpu_Family = 0xf;
return _cpuid_info.std_cpuid1_eax.bits.family == Extended_Cpu_Family;
}
static uint32_t extended_cpu_family() { static uint32_t extended_cpu_family() {
uint32_t result = _cpuid_info.std_cpuid1_eax.bits.family; uint32_t result = _cpuid_info.std_cpuid1_eax.bits.family;
if (is_extended_cpu_family()) { result += _cpuid_info.std_cpuid1_eax.bits.ext_family;
result += _cpuid_info.std_cpuid1_eax.bits.ext_family;
}
return result; return result;
} }
static uint32_t extended_cpu_model() { static uint32_t extended_cpu_model() {
uint32_t result = _cpuid_info.std_cpuid1_eax.bits.model; uint32_t result = _cpuid_info.std_cpuid1_eax.bits.model;
if (is_extended_cpu_family()) { result |= _cpuid_info.std_cpuid1_eax.bits.ext_model << 4;
result |= _cpuid_info.std_cpuid1_eax.bits.ext_model << 4;
}
return result; return result;
} }
static uint32_t cpu_stepping() { static uint32_t cpu_stepping() {
...@@ -293,6 +286,10 @@ protected: ...@@ -293,6 +286,10 @@ protected:
result |= CPU_SSSE3; result |= CPU_SSSE3;
if (is_amd() && _cpuid_info.ext_cpuid1_ecx.bits.sse4a != 0) if (is_amd() && _cpuid_info.ext_cpuid1_ecx.bits.sse4a != 0)
result |= CPU_SSE4A; result |= CPU_SSE4A;
if (_cpuid_info.std_cpuid1_ecx.bits.sse4_1 != 0)
result |= CPU_SSE4_1;
if (_cpuid_info.std_cpuid1_ecx.bits.sse4_2 != 0)
result |= CPU_SSE4_2;
return result; return result;
} }
...@@ -380,7 +377,8 @@ public: ...@@ -380,7 +377,8 @@ public:
static bool supports_sse2() { return (_cpuFeatures & CPU_SSE2) != 0; } static bool supports_sse2() { return (_cpuFeatures & CPU_SSE2) != 0; }
static bool supports_sse3() { return (_cpuFeatures & CPU_SSE3) != 0; } static bool supports_sse3() { return (_cpuFeatures & CPU_SSE3) != 0; }
static bool supports_ssse3() { return (_cpuFeatures & CPU_SSSE3)!= 0; } static bool supports_ssse3() { return (_cpuFeatures & CPU_SSSE3)!= 0; }
static bool supports_sse4() { return (_cpuFeatures & CPU_SSE4) != 0; } static bool supports_sse4_1() { return (_cpuFeatures & CPU_SSE4_1) != 0; }
static bool supports_sse4_2() { return (_cpuFeatures & CPU_SSE4_2) != 0; }
// //
// AMD features // AMD features
// //
......
...@@ -3758,7 +3758,7 @@ int set_lwp_priority (int ThreadID, int lwpid, int newPrio ) ...@@ -3758,7 +3758,7 @@ int set_lwp_priority (int ThreadID, int lwpid, int newPrio )
int maxClamped = MIN2(iaLimits.maxPrio, (int)iaInfo->ia_uprilim); int maxClamped = MIN2(iaLimits.maxPrio, (int)iaInfo->ia_uprilim);
iaInfo->ia_upri = scale_to_lwp_priority(iaLimits.minPrio, maxClamped, newPrio); iaInfo->ia_upri = scale_to_lwp_priority(iaLimits.minPrio, maxClamped, newPrio);
iaInfo->ia_uprilim = IA_NOCHANGE; iaInfo->ia_uprilim = IA_NOCHANGE;
iaInfo->ia_nice = IA_NOCHANGE; // iaInfo->ia_nice = IA_NOCHANGE;
iaInfo->ia_mode = IA_NOCHANGE; iaInfo->ia_mode = IA_NOCHANGE;
if (ThreadPriorityVerbose) { if (ThreadPriorityVerbose) {
tty->print_cr ("IA: [%d...%d] %d->%d\n", tty->print_cr ("IA: [%d...%d] %d->%d\n",
......
...@@ -991,6 +991,12 @@ class CommandLineFlags { ...@@ -991,6 +991,12 @@ class CommandLineFlags {
product(bool, UseXmmI2F, false, \ product(bool, UseXmmI2F, false, \
"Use SSE2 CVTDQ2PS instruction to convert Integer to Float") \ "Use SSE2 CVTDQ2PS instruction to convert Integer to Float") \
\ \
product(bool, UseXMMForArrayCopy, false, \
"Use SSE2 MOVQ instruction for Arraycopy") \
\
product(bool, UseUnalignedLoadStores, false, \
"Use SSE2 MOVDQU instruction for Arraycopy") \
\
product(intx, FieldsAllocationStyle, 1, \ product(intx, FieldsAllocationStyle, 1, \
"0 - type based with oops first, 1 - with oops last") \ "0 - type based with oops first, 1 - with oops last") \
\ \
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册