提交 2c9a6e9a 作者: kvn

6940701: Don't align loops in stubs for Niagara sparc

Summary: Don't align loops in stubs for Niagara sparc since NOPs are expensive.
Reviewed-by: twisti, never
上级 723c65ee
......@@ -2849,7 +2849,7 @@ void LIR_Assembler::emit_profile_call(LIR_OpProfileCall* op) {
// Emits alignment padding at the current code position; judging by the name,
// the instruction that follows is the target of a backward branch (a loop head).
// NOTE(review): this listing is a diff hunk whose +/- markers were lost. The
// hunk header (-2849,7 +2849,7) keeps the line count unchanged, so one line was
// replaced by another: per the commit summary, align(16) is the old line and
// align(OptoLoopAlignment) is its replacement. Only one of them exists in any
// real revision of the file.
void LIR_Assembler::align_backward_branch_target() {
__ align(16);                 // pre-change line: hard-coded 16-byte alignment
__ align(OptoLoopAlignment);  // post-change line: modulus comes from the OptoLoopAlignment flag
}
......
......@@ -60,9 +60,6 @@ define_pd_global(intx, FreqInlineSize, 175);
define_pd_global(intx, INTPRESSURE, 48); // large register set
define_pd_global(intx, InteriorEntryAlignment, 16); // = CodeEntryAlignment
define_pd_global(intx, NewSizeThreadIncrease, ScaleForWordSize(4*K));
// The default setting 16/16 seems to work best.
// (For _228_jack 16/16 is 2% better than 4/4, 16/4, 32/32, 32/16, or 16/32.)
define_pd_global(intx, OptoLoopAlignment, 16); // = 4*wordSize
define_pd_global(intx, RegisterCostAreaRatio, 12000);
define_pd_global(bool, UseTLAB, true);
define_pd_global(bool, ResizeTLAB, true);
......
......@@ -40,6 +40,9 @@ define_pd_global(bool, ImplicitNullChecks, true); // Generate code for
define_pd_global(bool, UncommonNullCast, true); // Uncommon-trap NULLs past to check cast
define_pd_global(intx, CodeEntryAlignment, 32);
// The default setting 16/16 seems to work best.
// (For _228_jack 16/16 is 2% better than 4/4, 16/4, 32/32, 32/16, or 16/32.)
define_pd_global(intx, OptoLoopAlignment, 16); // = 4*wordSize
define_pd_global(intx, InlineFrequencyCount, 50); // we can use more inlining on the SPARC
define_pd_global(intx, InlineSmallCode, 1500);
#ifdef _LP64
......
......@@ -1148,7 +1148,7 @@ class StubGenerator: public StubCodeGenerator {
__ andn(from, 7, from); // Align address
__ ldx(from, 0, O3);
__ inc(from, 8);
__ align(16);
__ align(OptoLoopAlignment);
__ BIND(L_loop);
__ ldx(from, 0, O4);
__ deccc(count, count_dec); // Can we do next iteration after this one?
......@@ -1220,7 +1220,7 @@ class StubGenerator: public StubCodeGenerator {
//
__ andn(end_from, 7, end_from); // Align address
__ ldx(end_from, 0, O3);
__ align(16);
__ align(OptoLoopAlignment);
__ BIND(L_loop);
__ ldx(end_from, -8, O4);
__ deccc(count, count_dec); // Can we do next iteration after this one?
......@@ -1349,7 +1349,7 @@ class StubGenerator: public StubCodeGenerator {
__ BIND(L_copy_byte);
__ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
__ delayed()->nop();
__ align(16);
__ align(OptoLoopAlignment);
__ BIND(L_copy_byte_loop);
__ ldub(from, offset, O3);
__ deccc(count);
......@@ -1445,7 +1445,7 @@ class StubGenerator: public StubCodeGenerator {
L_aligned_copy, L_copy_byte);
}
// copy 4 elements (16 bytes) at a time
__ align(16);
__ align(OptoLoopAlignment);
__ BIND(L_aligned_copy);
__ dec(end_from, 16);
__ ldx(end_from, 8, O3);
......@@ -1461,7 +1461,7 @@ class StubGenerator: public StubCodeGenerator {
__ BIND(L_copy_byte);
__ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
__ delayed()->nop();
__ align(16);
__ align(OptoLoopAlignment);
__ BIND(L_copy_byte_loop);
__ dec(end_from);
__ dec(end_to);
......@@ -1577,7 +1577,7 @@ class StubGenerator: public StubCodeGenerator {
__ BIND(L_copy_2_bytes);
__ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
__ delayed()->nop();
__ align(16);
__ align(OptoLoopAlignment);
__ BIND(L_copy_2_bytes_loop);
__ lduh(from, offset, O3);
__ deccc(count);
......@@ -1684,7 +1684,7 @@ class StubGenerator: public StubCodeGenerator {
L_aligned_copy, L_copy_2_bytes);
}
// copy 4 elements (16 bytes) at a time
__ align(16);
__ align(OptoLoopAlignment);
__ BIND(L_aligned_copy);
__ dec(end_from, 16);
__ ldx(end_from, 8, O3);
......@@ -1781,7 +1781,7 @@ class StubGenerator: public StubCodeGenerator {
// copy with shift 4 elements (16 bytes) at a time
__ dec(count, 4); // The cmp at the beginning guaranty count >= 4
__ align(16);
__ align(OptoLoopAlignment);
__ BIND(L_copy_16_bytes);
__ ldx(from, 4, O4);
__ deccc(count, 4); // Can we do next iteration after this one?
......@@ -1907,7 +1907,7 @@ class StubGenerator: public StubCodeGenerator {
// to form 2 aligned 8-bytes chunks to store.
//
__ ldx(end_from, -4, O3);
__ align(16);
__ align(OptoLoopAlignment);
__ BIND(L_copy_16_bytes);
__ ldx(end_from, -12, O4);
__ deccc(count, 4);
......@@ -1929,7 +1929,7 @@ class StubGenerator: public StubCodeGenerator {
__ delayed()->inc(count, 4);
// copy 4 elements (16 bytes) at a time
__ align(16);
__ align(OptoLoopAlignment);
__ BIND(L_aligned_copy);
__ dec(end_from, 16);
__ ldx(end_from, 8, O3);
......@@ -2045,7 +2045,7 @@ class StubGenerator: public StubCodeGenerator {
__ mov(O3, count);
__ mov(from, from64);
__ align(16);
__ align(OptoLoopAlignment);
__ BIND(L_copy_64_bytes);
for( int off = 0; off < 64; off += 16 ) {
__ ldx(from64, off+0, O4);
......@@ -2065,7 +2065,7 @@ class StubGenerator: public StubCodeGenerator {
__ delayed()->add(offset0, 8, offset8);
// Copy by 16 bytes chunks
__ align(16);
__ align(OptoLoopAlignment);
__ BIND(L_copy_16_bytes);
__ ldx(from, offset0, O3);
__ ldx(from, offset8, G3);
......@@ -2139,7 +2139,7 @@ class StubGenerator: public StubCodeGenerator {
__ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes );
__ delayed()->sllx(count, LogBytesPerLong, offset8);
__ sub(offset8, 8, offset0);
__ align(16);
__ align(OptoLoopAlignment);
__ BIND(L_copy_16_bytes);
__ ldx(from, offset8, O2);
__ ldx(from, offset0, O3);
......@@ -2405,7 +2405,7 @@ class StubGenerator: public StubCodeGenerator {
// (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays
// (O2 = len; O2 != 0; O2--) --- number of oops *remaining*
// G3, G4, G5 --- current oop, oop.klass, oop.klass.super
__ align(16);
__ align(OptoLoopAlignment);
__ BIND(store_element);
__ deccc(G1_remain); // decrement the count
......
......@@ -86,14 +86,14 @@ void VM_Version::initialize() {
if (FLAG_IS_DEFAULT(InteriorEntryAlignment)) {
FLAG_SET_DEFAULT(InteriorEntryAlignment, 4);
}
if (FLAG_IS_DEFAULT(OptoLoopAlignment)) {
FLAG_SET_DEFAULT(OptoLoopAlignment, 4);
}
if (is_niagara1_plus() && FLAG_IS_DEFAULT(AllocatePrefetchDistance)) {
// Use smaller prefetch distance on N2
FLAG_SET_DEFAULT(AllocatePrefetchDistance, 256);
}
#endif
if (FLAG_IS_DEFAULT(OptoLoopAlignment)) {
FLAG_SET_DEFAULT(OptoLoopAlignment, 4);
}
}
// Use hardware population count instruction if available.
......
......@@ -80,7 +80,6 @@ define_pd_global(intx, CodeCacheExpansionSize, 32*K);
// Ergonomics related flags
define_pd_global(uint64_t,MaxRAM, 4ULL*G);
#endif // AMD64
define_pd_global(intx, OptoLoopAlignment, 16);
define_pd_global(intx, RegisterCostAreaRatio, 16000);
// Peephole and CISC spilling both break the graph, and so makes the
......
......@@ -45,6 +45,7 @@ define_pd_global(intx, CodeEntryAlignment, 32);
#else
define_pd_global(intx, CodeEntryAlignment, 16);
#endif // COMPILER2
define_pd_global(intx, OptoLoopAlignment, 16);
define_pd_global(intx, InlineFrequencyCount, 100);
define_pd_global(intx, InlineSmallCode, 1000);
......
......@@ -812,7 +812,7 @@ class StubGenerator: public StubCodeGenerator {
Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
// Copy 64-byte chunks
__ jmpb(L_copy_64_bytes);
__ align(16);
__ align(OptoLoopAlignment);
__ BIND(L_copy_64_bytes_loop);
if(UseUnalignedLoadStores) {
......@@ -874,7 +874,7 @@ class StubGenerator: public StubCodeGenerator {
Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
// Copy 64-byte chunks
__ jmpb(L_copy_64_bytes);
__ align(16);
__ align(OptoLoopAlignment);
__ BIND(L_copy_64_bytes_loop);
__ movq(mmx0, Address(from, 0));
__ movq(mmx1, Address(from, 8));
......@@ -1144,7 +1144,7 @@ class StubGenerator: public StubCodeGenerator {
__ movl(Address(to, count, sf, 0), rdx);
__ jmpb(L_copy_8_bytes);
__ align(16);
__ align(OptoLoopAlignment);
// Move 8 bytes
__ BIND(L_copy_8_bytes_loop);
if (UseXMMForArrayCopy) {
......@@ -1235,7 +1235,7 @@ class StubGenerator: public StubCodeGenerator {
}
} else {
__ jmpb(L_copy_8_bytes);
__ align(16);
__ align(OptoLoopAlignment);
__ BIND(L_copy_8_bytes_loop);
__ fild_d(Address(from, 0));
__ fistp_d(Address(from, to_from, Address::times_1));
......@@ -1282,7 +1282,7 @@ class StubGenerator: public StubCodeGenerator {
__ jmpb(L_copy_8_bytes);
__ align(16);
__ align(OptoLoopAlignment);
__ BIND(L_copy_8_bytes_loop);
if (VM_Version::supports_mmx()) {
if (UseXMMForArrayCopy) {
......@@ -1454,7 +1454,7 @@ class StubGenerator: public StubCodeGenerator {
// Loop control:
// for (count = -count; count != 0; count++)
// Base pointers src, dst are biased by 8*count,to last element.
__ align(16);
__ align(OptoLoopAlignment);
__ BIND(L_store_element);
__ movptr(to_element_addr, elem); // store the oop
......
......@@ -871,9 +871,8 @@ class StubGenerator: public StubCodeGenerator {
}
address generate_fp_mask(const char *stub_name, int64_t mask) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", stub_name);
__ align(16);
address start = __ pc();
__ emit_data64( mask, relocInfo::none );
......@@ -1268,7 +1267,7 @@ class StubGenerator: public StubCodeGenerator {
Label& L_copy_32_bytes, Label& L_copy_8_bytes) {
DEBUG_ONLY(__ stop("enter at entry label, not here"));
Label L_loop;
__ align(16);
__ align(OptoLoopAlignment);
__ BIND(L_loop);
if(UseUnalignedLoadStores) {
__ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
......@@ -1309,7 +1308,7 @@ class StubGenerator: public StubCodeGenerator {
Label& L_copy_32_bytes, Label& L_copy_8_bytes) {
DEBUG_ONLY(__ stop("enter at entry label, not here"));
Label L_loop;
__ align(16);
__ align(OptoLoopAlignment);
__ BIND(L_loop);
if(UseUnalignedLoadStores) {
__ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
......@@ -2229,7 +2228,7 @@ class StubGenerator: public StubCodeGenerator {
// Loop control:
// for (count = -count; count != 0; count++)
// Base pointers src, dst are biased by 8*(count-1),to last element.
__ align(16);
__ align(OptoLoopAlignment);
__ BIND(L_store_element);
__ store_heap_oop(to_element_addr, rax_oop); // store the oop
......
......@@ -52,9 +52,6 @@
"Code alignment for interior entry points " \
"in generated code (in bytes)") \
\
product_pd(intx, OptoLoopAlignment, \
"Align inner loops to zero relative to this modulus") \
\
product(intx, MaxLoopPad, (OptoLoopAlignment-1), \
"Align a loop if padding size in bytes is less or equal to this value") \
\
......
......@@ -3110,6 +3110,9 @@ class CommandLineFlags {
develop_pd(intx, CodeEntryAlignment, \
"Code entry alignment for generated code (in bytes)") \
\
product_pd(intx, OptoLoopAlignment, \
"Align inner loops to zero relative to this modulus") \
\
product_pd(uintx, InitialCodeCacheSize, \
"Initial code cache size (in bytes)") \
\
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册