提交 2c9a6e9a 作者: kvn

6940701: Don't align loops in stubs for Niagara sparc

Summary: Don't align loops in stubs on Niagara SPARC, since the NOPs emitted as alignment padding are expensive there.
Reviewed-by: twisti, never
上级 723c65ee
...@@ -2849,7 +2849,7 @@ void LIR_Assembler::emit_profile_call(LIR_OpProfileCall* op) { ...@@ -2849,7 +2849,7 @@ void LIR_Assembler::emit_profile_call(LIR_OpProfileCall* op) {
void LIR_Assembler::align_backward_branch_target() { void LIR_Assembler::align_backward_branch_target() {
__ align(16); __ align(OptoLoopAlignment);
} }
......
...@@ -60,9 +60,6 @@ define_pd_global(intx, FreqInlineSize, 175); ...@@ -60,9 +60,6 @@ define_pd_global(intx, FreqInlineSize, 175);
define_pd_global(intx, INTPRESSURE, 48); // large register set define_pd_global(intx, INTPRESSURE, 48); // large register set
define_pd_global(intx, InteriorEntryAlignment, 16); // = CodeEntryAlignment define_pd_global(intx, InteriorEntryAlignment, 16); // = CodeEntryAlignment
define_pd_global(intx, NewSizeThreadIncrease, ScaleForWordSize(4*K)); define_pd_global(intx, NewSizeThreadIncrease, ScaleForWordSize(4*K));
// The default setting 16/16 seems to work best.
// (For _228_jack 16/16 is 2% better than 4/4, 16/4, 32/32, 32/16, or 16/32.)
define_pd_global(intx, OptoLoopAlignment, 16); // = 4*wordSize
define_pd_global(intx, RegisterCostAreaRatio, 12000); define_pd_global(intx, RegisterCostAreaRatio, 12000);
define_pd_global(bool, UseTLAB, true); define_pd_global(bool, UseTLAB, true);
define_pd_global(bool, ResizeTLAB, true); define_pd_global(bool, ResizeTLAB, true);
......
...@@ -40,6 +40,9 @@ define_pd_global(bool, ImplicitNullChecks, true); // Generate code for ...@@ -40,6 +40,9 @@ define_pd_global(bool, ImplicitNullChecks, true); // Generate code for
define_pd_global(bool, UncommonNullCast, true); // Uncommon-trap NULLs past to check cast define_pd_global(bool, UncommonNullCast, true); // Uncommon-trap NULLs past to check cast
define_pd_global(intx, CodeEntryAlignment, 32); define_pd_global(intx, CodeEntryAlignment, 32);
// The default setting 16/16 seems to work best.
// (For _228_jack 16/16 is 2% better than 4/4, 16/4, 32/32, 32/16, or 16/32.)
define_pd_global(intx, OptoLoopAlignment, 16); // = 4*wordSize
define_pd_global(intx, InlineFrequencyCount, 50); // we can use more inlining on the SPARC define_pd_global(intx, InlineFrequencyCount, 50); // we can use more inlining on the SPARC
define_pd_global(intx, InlineSmallCode, 1500); define_pd_global(intx, InlineSmallCode, 1500);
#ifdef _LP64 #ifdef _LP64
......
...@@ -1148,7 +1148,7 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1148,7 +1148,7 @@ class StubGenerator: public StubCodeGenerator {
__ andn(from, 7, from); // Align address __ andn(from, 7, from); // Align address
__ ldx(from, 0, O3); __ ldx(from, 0, O3);
__ inc(from, 8); __ inc(from, 8);
__ align(16); __ align(OptoLoopAlignment);
__ BIND(L_loop); __ BIND(L_loop);
__ ldx(from, 0, O4); __ ldx(from, 0, O4);
__ deccc(count, count_dec); // Can we do next iteration after this one? __ deccc(count, count_dec); // Can we do next iteration after this one?
...@@ -1220,7 +1220,7 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1220,7 +1220,7 @@ class StubGenerator: public StubCodeGenerator {
// //
__ andn(end_from, 7, end_from); // Align address __ andn(end_from, 7, end_from); // Align address
__ ldx(end_from, 0, O3); __ ldx(end_from, 0, O3);
__ align(16); __ align(OptoLoopAlignment);
__ BIND(L_loop); __ BIND(L_loop);
__ ldx(end_from, -8, O4); __ ldx(end_from, -8, O4);
__ deccc(count, count_dec); // Can we do next iteration after this one? __ deccc(count, count_dec); // Can we do next iteration after this one?
...@@ -1349,7 +1349,7 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1349,7 +1349,7 @@ class StubGenerator: public StubCodeGenerator {
__ BIND(L_copy_byte); __ BIND(L_copy_byte);
__ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit); __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
__ delayed()->nop(); __ delayed()->nop();
__ align(16); __ align(OptoLoopAlignment);
__ BIND(L_copy_byte_loop); __ BIND(L_copy_byte_loop);
__ ldub(from, offset, O3); __ ldub(from, offset, O3);
__ deccc(count); __ deccc(count);
...@@ -1445,7 +1445,7 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1445,7 +1445,7 @@ class StubGenerator: public StubCodeGenerator {
L_aligned_copy, L_copy_byte); L_aligned_copy, L_copy_byte);
} }
// copy 4 elements (16 bytes) at a time // copy 4 elements (16 bytes) at a time
__ align(16); __ align(OptoLoopAlignment);
__ BIND(L_aligned_copy); __ BIND(L_aligned_copy);
__ dec(end_from, 16); __ dec(end_from, 16);
__ ldx(end_from, 8, O3); __ ldx(end_from, 8, O3);
...@@ -1461,7 +1461,7 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1461,7 +1461,7 @@ class StubGenerator: public StubCodeGenerator {
__ BIND(L_copy_byte); __ BIND(L_copy_byte);
__ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit); __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
__ delayed()->nop(); __ delayed()->nop();
__ align(16); __ align(OptoLoopAlignment);
__ BIND(L_copy_byte_loop); __ BIND(L_copy_byte_loop);
__ dec(end_from); __ dec(end_from);
__ dec(end_to); __ dec(end_to);
...@@ -1577,7 +1577,7 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1577,7 +1577,7 @@ class StubGenerator: public StubCodeGenerator {
__ BIND(L_copy_2_bytes); __ BIND(L_copy_2_bytes);
__ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit); __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
__ delayed()->nop(); __ delayed()->nop();
__ align(16); __ align(OptoLoopAlignment);
__ BIND(L_copy_2_bytes_loop); __ BIND(L_copy_2_bytes_loop);
__ lduh(from, offset, O3); __ lduh(from, offset, O3);
__ deccc(count); __ deccc(count);
...@@ -1684,7 +1684,7 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1684,7 +1684,7 @@ class StubGenerator: public StubCodeGenerator {
L_aligned_copy, L_copy_2_bytes); L_aligned_copy, L_copy_2_bytes);
} }
// copy 4 elements (16 bytes) at a time // copy 4 elements (16 bytes) at a time
__ align(16); __ align(OptoLoopAlignment);
__ BIND(L_aligned_copy); __ BIND(L_aligned_copy);
__ dec(end_from, 16); __ dec(end_from, 16);
__ ldx(end_from, 8, O3); __ ldx(end_from, 8, O3);
...@@ -1781,7 +1781,7 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1781,7 +1781,7 @@ class StubGenerator: public StubCodeGenerator {
// copy with shift 4 elements (16 bytes) at a time // copy with shift 4 elements (16 bytes) at a time
__ dec(count, 4); // The cmp at the beginning guaranty count >= 4 __ dec(count, 4); // The cmp at the beginning guaranty count >= 4
__ align(16); __ align(OptoLoopAlignment);
__ BIND(L_copy_16_bytes); __ BIND(L_copy_16_bytes);
__ ldx(from, 4, O4); __ ldx(from, 4, O4);
__ deccc(count, 4); // Can we do next iteration after this one? __ deccc(count, 4); // Can we do next iteration after this one?
...@@ -1907,7 +1907,7 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1907,7 +1907,7 @@ class StubGenerator: public StubCodeGenerator {
// to form 2 aligned 8-bytes chunks to store. // to form 2 aligned 8-bytes chunks to store.
// //
__ ldx(end_from, -4, O3); __ ldx(end_from, -4, O3);
__ align(16); __ align(OptoLoopAlignment);
__ BIND(L_copy_16_bytes); __ BIND(L_copy_16_bytes);
__ ldx(end_from, -12, O4); __ ldx(end_from, -12, O4);
__ deccc(count, 4); __ deccc(count, 4);
...@@ -1929,7 +1929,7 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1929,7 +1929,7 @@ class StubGenerator: public StubCodeGenerator {
__ delayed()->inc(count, 4); __ delayed()->inc(count, 4);
// copy 4 elements (16 bytes) at a time // copy 4 elements (16 bytes) at a time
__ align(16); __ align(OptoLoopAlignment);
__ BIND(L_aligned_copy); __ BIND(L_aligned_copy);
__ dec(end_from, 16); __ dec(end_from, 16);
__ ldx(end_from, 8, O3); __ ldx(end_from, 8, O3);
...@@ -2045,7 +2045,7 @@ class StubGenerator: public StubCodeGenerator { ...@@ -2045,7 +2045,7 @@ class StubGenerator: public StubCodeGenerator {
__ mov(O3, count); __ mov(O3, count);
__ mov(from, from64); __ mov(from, from64);
__ align(16); __ align(OptoLoopAlignment);
__ BIND(L_copy_64_bytes); __ BIND(L_copy_64_bytes);
for( int off = 0; off < 64; off += 16 ) { for( int off = 0; off < 64; off += 16 ) {
__ ldx(from64, off+0, O4); __ ldx(from64, off+0, O4);
...@@ -2065,7 +2065,7 @@ class StubGenerator: public StubCodeGenerator { ...@@ -2065,7 +2065,7 @@ class StubGenerator: public StubCodeGenerator {
__ delayed()->add(offset0, 8, offset8); __ delayed()->add(offset0, 8, offset8);
// Copy by 16 bytes chunks // Copy by 16 bytes chunks
__ align(16); __ align(OptoLoopAlignment);
__ BIND(L_copy_16_bytes); __ BIND(L_copy_16_bytes);
__ ldx(from, offset0, O3); __ ldx(from, offset0, O3);
__ ldx(from, offset8, G3); __ ldx(from, offset8, G3);
...@@ -2139,7 +2139,7 @@ class StubGenerator: public StubCodeGenerator { ...@@ -2139,7 +2139,7 @@ class StubGenerator: public StubCodeGenerator {
__ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes ); __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes );
__ delayed()->sllx(count, LogBytesPerLong, offset8); __ delayed()->sllx(count, LogBytesPerLong, offset8);
__ sub(offset8, 8, offset0); __ sub(offset8, 8, offset0);
__ align(16); __ align(OptoLoopAlignment);
__ BIND(L_copy_16_bytes); __ BIND(L_copy_16_bytes);
__ ldx(from, offset8, O2); __ ldx(from, offset8, O2);
__ ldx(from, offset0, O3); __ ldx(from, offset0, O3);
...@@ -2405,7 +2405,7 @@ class StubGenerator: public StubCodeGenerator { ...@@ -2405,7 +2405,7 @@ class StubGenerator: public StubCodeGenerator {
// (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays // (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays
// (O2 = len; O2 != 0; O2--) --- number of oops *remaining* // (O2 = len; O2 != 0; O2--) --- number of oops *remaining*
// G3, G4, G5 --- current oop, oop.klass, oop.klass.super // G3, G4, G5 --- current oop, oop.klass, oop.klass.super
__ align(16); __ align(OptoLoopAlignment);
__ BIND(store_element); __ BIND(store_element);
__ deccc(G1_remain); // decrement the count __ deccc(G1_remain); // decrement the count
......
...@@ -86,14 +86,14 @@ void VM_Version::initialize() { ...@@ -86,14 +86,14 @@ void VM_Version::initialize() {
if (FLAG_IS_DEFAULT(InteriorEntryAlignment)) { if (FLAG_IS_DEFAULT(InteriorEntryAlignment)) {
FLAG_SET_DEFAULT(InteriorEntryAlignment, 4); FLAG_SET_DEFAULT(InteriorEntryAlignment, 4);
} }
if (FLAG_IS_DEFAULT(OptoLoopAlignment)) {
FLAG_SET_DEFAULT(OptoLoopAlignment, 4);
}
if (is_niagara1_plus() && FLAG_IS_DEFAULT(AllocatePrefetchDistance)) { if (is_niagara1_plus() && FLAG_IS_DEFAULT(AllocatePrefetchDistance)) {
// Use smaller prefetch distance on N2 // Use smaller prefetch distance on N2
FLAG_SET_DEFAULT(AllocatePrefetchDistance, 256); FLAG_SET_DEFAULT(AllocatePrefetchDistance, 256);
} }
#endif #endif
if (FLAG_IS_DEFAULT(OptoLoopAlignment)) {
FLAG_SET_DEFAULT(OptoLoopAlignment, 4);
}
} }
// Use hardware population count instruction if available. // Use hardware population count instruction if available.
......
...@@ -80,7 +80,6 @@ define_pd_global(intx, CodeCacheExpansionSize, 32*K); ...@@ -80,7 +80,6 @@ define_pd_global(intx, CodeCacheExpansionSize, 32*K);
// Ergonomics related flags // Ergonomics related flags
define_pd_global(uint64_t,MaxRAM, 4ULL*G); define_pd_global(uint64_t,MaxRAM, 4ULL*G);
#endif // AMD64 #endif // AMD64
define_pd_global(intx, OptoLoopAlignment, 16);
define_pd_global(intx, RegisterCostAreaRatio, 16000); define_pd_global(intx, RegisterCostAreaRatio, 16000);
// Peephole and CISC spilling both break the graph, and so makes the // Peephole and CISC spilling both break the graph, and so makes the
......
...@@ -45,6 +45,7 @@ define_pd_global(intx, CodeEntryAlignment, 32); ...@@ -45,6 +45,7 @@ define_pd_global(intx, CodeEntryAlignment, 32);
#else #else
define_pd_global(intx, CodeEntryAlignment, 16); define_pd_global(intx, CodeEntryAlignment, 16);
#endif // COMPILER2 #endif // COMPILER2
define_pd_global(intx, OptoLoopAlignment, 16);
define_pd_global(intx, InlineFrequencyCount, 100); define_pd_global(intx, InlineFrequencyCount, 100);
define_pd_global(intx, InlineSmallCode, 1000); define_pd_global(intx, InlineSmallCode, 1000);
......
...@@ -812,7 +812,7 @@ class StubGenerator: public StubCodeGenerator { ...@@ -812,7 +812,7 @@ class StubGenerator: public StubCodeGenerator {
Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit; Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
// Copy 64-byte chunks // Copy 64-byte chunks
__ jmpb(L_copy_64_bytes); __ jmpb(L_copy_64_bytes);
__ align(16); __ align(OptoLoopAlignment);
__ BIND(L_copy_64_bytes_loop); __ BIND(L_copy_64_bytes_loop);
if(UseUnalignedLoadStores) { if(UseUnalignedLoadStores) {
...@@ -874,7 +874,7 @@ class StubGenerator: public StubCodeGenerator { ...@@ -874,7 +874,7 @@ class StubGenerator: public StubCodeGenerator {
Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit; Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
// Copy 64-byte chunks // Copy 64-byte chunks
__ jmpb(L_copy_64_bytes); __ jmpb(L_copy_64_bytes);
__ align(16); __ align(OptoLoopAlignment);
__ BIND(L_copy_64_bytes_loop); __ BIND(L_copy_64_bytes_loop);
__ movq(mmx0, Address(from, 0)); __ movq(mmx0, Address(from, 0));
__ movq(mmx1, Address(from, 8)); __ movq(mmx1, Address(from, 8));
...@@ -1144,7 +1144,7 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1144,7 +1144,7 @@ class StubGenerator: public StubCodeGenerator {
__ movl(Address(to, count, sf, 0), rdx); __ movl(Address(to, count, sf, 0), rdx);
__ jmpb(L_copy_8_bytes); __ jmpb(L_copy_8_bytes);
__ align(16); __ align(OptoLoopAlignment);
// Move 8 bytes // Move 8 bytes
__ BIND(L_copy_8_bytes_loop); __ BIND(L_copy_8_bytes_loop);
if (UseXMMForArrayCopy) { if (UseXMMForArrayCopy) {
...@@ -1235,7 +1235,7 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1235,7 +1235,7 @@ class StubGenerator: public StubCodeGenerator {
} }
} else { } else {
__ jmpb(L_copy_8_bytes); __ jmpb(L_copy_8_bytes);
__ align(16); __ align(OptoLoopAlignment);
__ BIND(L_copy_8_bytes_loop); __ BIND(L_copy_8_bytes_loop);
__ fild_d(Address(from, 0)); __ fild_d(Address(from, 0));
__ fistp_d(Address(from, to_from, Address::times_1)); __ fistp_d(Address(from, to_from, Address::times_1));
...@@ -1282,7 +1282,7 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1282,7 +1282,7 @@ class StubGenerator: public StubCodeGenerator {
__ jmpb(L_copy_8_bytes); __ jmpb(L_copy_8_bytes);
__ align(16); __ align(OptoLoopAlignment);
__ BIND(L_copy_8_bytes_loop); __ BIND(L_copy_8_bytes_loop);
if (VM_Version::supports_mmx()) { if (VM_Version::supports_mmx()) {
if (UseXMMForArrayCopy) { if (UseXMMForArrayCopy) {
...@@ -1454,7 +1454,7 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1454,7 +1454,7 @@ class StubGenerator: public StubCodeGenerator {
// Loop control: // Loop control:
// for (count = -count; count != 0; count++) // for (count = -count; count != 0; count++)
// Base pointers src, dst are biased by 8*count,to last element. // Base pointers src, dst are biased by 8*count,to last element.
__ align(16); __ align(OptoLoopAlignment);
__ BIND(L_store_element); __ BIND(L_store_element);
__ movptr(to_element_addr, elem); // store the oop __ movptr(to_element_addr, elem); // store the oop
......
...@@ -871,9 +871,8 @@ class StubGenerator: public StubCodeGenerator { ...@@ -871,9 +871,8 @@ class StubGenerator: public StubCodeGenerator {
} }
address generate_fp_mask(const char *stub_name, int64_t mask) { address generate_fp_mask(const char *stub_name, int64_t mask) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", stub_name); StubCodeMark mark(this, "StubRoutines", stub_name);
__ align(16);
address start = __ pc(); address start = __ pc();
__ emit_data64( mask, relocInfo::none ); __ emit_data64( mask, relocInfo::none );
...@@ -1268,7 +1267,7 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1268,7 +1267,7 @@ class StubGenerator: public StubCodeGenerator {
Label& L_copy_32_bytes, Label& L_copy_8_bytes) { Label& L_copy_32_bytes, Label& L_copy_8_bytes) {
DEBUG_ONLY(__ stop("enter at entry label, not here")); DEBUG_ONLY(__ stop("enter at entry label, not here"));
Label L_loop; Label L_loop;
__ align(16); __ align(OptoLoopAlignment);
__ BIND(L_loop); __ BIND(L_loop);
if(UseUnalignedLoadStores) { if(UseUnalignedLoadStores) {
__ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24)); __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
...@@ -1309,7 +1308,7 @@ class StubGenerator: public StubCodeGenerator { ...@@ -1309,7 +1308,7 @@ class StubGenerator: public StubCodeGenerator {
Label& L_copy_32_bytes, Label& L_copy_8_bytes) { Label& L_copy_32_bytes, Label& L_copy_8_bytes) {
DEBUG_ONLY(__ stop("enter at entry label, not here")); DEBUG_ONLY(__ stop("enter at entry label, not here"));
Label L_loop; Label L_loop;
__ align(16); __ align(OptoLoopAlignment);
__ BIND(L_loop); __ BIND(L_loop);
if(UseUnalignedLoadStores) { if(UseUnalignedLoadStores) {
__ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16)); __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
...@@ -2229,7 +2228,7 @@ class StubGenerator: public StubCodeGenerator { ...@@ -2229,7 +2228,7 @@ class StubGenerator: public StubCodeGenerator {
// Loop control: // Loop control:
// for (count = -count; count != 0; count++) // for (count = -count; count != 0; count++)
// Base pointers src, dst are biased by 8*(count-1),to last element. // Base pointers src, dst are biased by 8*(count-1),to last element.
__ align(16); __ align(OptoLoopAlignment);
__ BIND(L_store_element); __ BIND(L_store_element);
__ store_heap_oop(to_element_addr, rax_oop); // store the oop __ store_heap_oop(to_element_addr, rax_oop); // store the oop
......
...@@ -52,9 +52,6 @@ ...@@ -52,9 +52,6 @@
"Code alignment for interior entry points " \ "Code alignment for interior entry points " \
"in generated code (in bytes)") \ "in generated code (in bytes)") \
\ \
product_pd(intx, OptoLoopAlignment, \
"Align inner loops to zero relative to this modulus") \
\
product(intx, MaxLoopPad, (OptoLoopAlignment-1), \ product(intx, MaxLoopPad, (OptoLoopAlignment-1), \
"Align a loop if padding size in bytes is less or equal to this value") \ "Align a loop if padding size in bytes is less or equal to this value") \
\ \
......
...@@ -3110,6 +3110,9 @@ class CommandLineFlags { ...@@ -3110,6 +3110,9 @@ class CommandLineFlags {
develop_pd(intx, CodeEntryAlignment, \ develop_pd(intx, CodeEntryAlignment, \
"Code entry alignment for generated code (in bytes)") \ "Code entry alignment for generated code (in bytes)") \
\ \
product_pd(intx, OptoLoopAlignment, \
"Align inner loops to zero relative to this modulus") \
\
product_pd(uintx, InitialCodeCacheSize, \ product_pd(uintx, InitialCodeCacheSize, \
"Initial code cache size (in bytes)") \ "Initial code cache size (in bytes)") \
\ \
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册